This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
t/re/re_tests: Add tests for multi-char fold bug
[perl5.git] / t / uni / fold.t
CommitLineData
daf3b8d4
KW
1use strict;
2use warnings;
3
4# re/fold_grind.t has more complex tests, but doesn't test every fold
5
9e55ce06
JH
6BEGIN {
7 chdir 't' if -d 't';
8 @INC = '../lib';
daf3b8d4 9 require './test.pl';
9e55ce06
JH
10}
11
daf3b8d4
KW
12binmode *STDOUT, ":utf8";
13
daf3b8d4 14our $TODO;
9e55ce06 15
daf3b8d4
KW
16plan("no_plan");
17
18# Read in the official case folding definitions.
519ecd2c 19my $CF = '../lib/unicore/CaseFolding.txt';
9e55ce06 20
daf3b8d4
KW
21die qq[$0: failed to open "$CF": $!\n] if ! open(my $fh, "<", $CF);
22
23my @CF;
24my %reverse_fold;
25while (<$fh>) {
26 # Skip S since we are going for 'F'ull case folding. I is obsolete starting
27 # with Unicode 3.2, but leaving it in does no harm, and allows backward
28 # compatibility
29 next unless my ($code, $type, $mapping, $name) = $_ =~
30 /^([0-9A-F]+); ([CFI]); ((?:[0-9A-F]+)(?: [0-9A-F]+)*); \# (.+)/;
31
32 # Convert any 0-255 range chars to native.
33 $code = sprintf("%04X", ord_latin1_to_native(hex $code)) if hex $code < 0x100;
34 $mapping = join " ", map { $_ =
35 sprintf("%04X", ord_latin1_to_native(hex $_)) }
36 split / /, $mapping;
37
38 push @CF, [$code, $mapping, $type, $name];
39
40 # Get the inverse fold for single-char mappings.
41 $reverse_fold{pack "U0U*", hex $mapping} = pack "U0U*", hex $code if $type ne 'F';
42}
80bf4fef 43
daf3b8d4 44close($fh) or die "$0 Couldn't close $CF";
9e55ce06 45
daf3b8d4
KW
46foreach my $test_ref (@CF) {
47 my ($code, $mapping, $type, $name) = @$test_ref;
48 my $c = pack("U0U*", hex $code);
49 my $f = pack("U0U*", map { hex } split " ", $mapping);
50 my $f_length = length $f;
51 foreach my $test (
52 qq[":$c:" =~ /:$c:/],
53 qq[":$c:" =~ /:$c:/i],
54 qq[":$c:" =~ /:[_$c]:/], # Place two chars in [] so doesn't get
55 # optimized to a non-charclass
56 qq[":$c:" =~ /:[_$c]:/i],
57 qq[":$c:" =~ /:$f:/i],
58 qq[":$f:" =~ /:$c:/i],
59 ) {
60 ok eval $test, "$code - $name - $mapping - $type - $test";
9e55ce06
JH
61 }
62
daf3b8d4
KW
63 # Certain tests weren't convenient to put in the list above since they are
64 # TODO's in multi-character folds.
65 if ($f_length == 1) {
66
67 # The qq loses the utf8ness of ":$f:". These tests are not about
68 # finding bugs in utf8ness, so make sure it's utf8.
69 my $test = qq[my \$s = ":$f:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i];
70 ok eval $test, "$code - $name - $mapping - $type - $test";
71 $test = qq[":$c:" =~ /:[_$f]:/i];
72 ok eval $test, "$code - $name - $mapping - $type - $test";
73 }
74 else {
75
03b99f60 76 # There are two classes of multi-char folds that need more work. For
daf3b8d4
KW
77 # example,
78 # ":ß:" =~ /:[_s]{2}:/i
79 # ":ss:" =~ /:[_ß]:/i
80 #
81 # Some of the old tests for the second case happened to pass somewhat
82 # coincidentally. But none would pass if changed to this.
83 # ":SS:" =~ /:[_ß]:/i
84 #
85 # As the capital SS doesn't get folded. When those pass, it means
86 # that the code has been changed to take into account folding in the
03b99f60
KW
87 # string, and all should pass, capitalized or not (this wouldn't be
88 # true for [^complemented character classes], for which the fold case
89 # is better, but these aren't used in this .t currently. So, what is
90 # done is to essentially upper-case the string for this class (but use
91 # the reverse fold not uc(), as that is more correct)
daf3b8d4
KW
92 my $u;
93 for my $i (0 .. $f_length - 1) {
94 my $cur_char = substr($f, $i, 1);
95 $u .= $reverse_fold{$cur_char} || $cur_char;
96 }
97 my $test;
98
5a6441ac
KW
99 # A multi-char fold should not match just one char;
100 # e.g., ":ß:" !~ /:[_s]:/i
101 $test = qq[":$c:" !~ /:[_$f]:/i];
102 ok eval $test, "$code - $name - $mapping - $type - $test";
103
daf3b8d4 104 TODO: { # e.g., ":ß:" =~ /:[_s]{2}:/i
03b99f60
KW
105 local $TODO = 'Multi-char fold in [character class]';
106
daf3b8d4
KW
107 $test = qq[":$c:" =~ /:[_$f]{$f_length}:/i];
108 ok eval $test, "$code - $name - $mapping - $type - $test";
109 }
03b99f60
KW
110
111 # e.g., ":SS:" =~ /:[_ß]:/i now pass, so TODO has been removed, but
112 # since they use '$u', they are left out of the main loop
113 $test = qq[ my \$s = ":$u:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i];
114 ok eval $test, "$code - $name - $mapping - $type - $test";
9e55ce06 115 }
9e55ce06 116}
daf3b8d4
KW
117
118my $num_tests = curr_test() - 1;
119
120die qq[$0: failed to find casefoldings from "$CF"\n] unless $num_tests > 0;
121
122plan($num_tests);