1 # Grind out a lot of combinatoric tests for folding.
5 binmode STDOUT, ":utf8";
13 my $DEBUG = 0; # Outputs extra information for debugging this .t
19 # Tests both unicode and not, so make sure not implicitly testing unicode
20 no feature 'unicode_strings';
22 # Case-insensitive matching is a large and complicated issue. Perl does not
23 # implement it fully, properly. For example, it doesn't include normalization
24 # as part of the equation. To test every conceivable combination is clearly
25 # impossible; these tests are mostly drawn from visual inspection of the code
26 # and experience, trying to exercise all areas.
28 # There are three basic ranges of characters that Perl may treat differently:
29 # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
30 # referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants
31 # are all controls that fold to themselves.
34 # 2) Other characters that fit into a byte but are different in utf8 than not;
35 # here referred to, taking some liberties, as Latin1.
38 # 3) Characters that won't fit in a byte; here referred to as Unicode
41 # Within these basic groups are equivalence classes that testing any character
42 # in is likely to lead to the same results as any other character. This is
43 # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
45 my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
50 return $ASCII if $ord < 128;
51 return $Latin1 if $ord < 256;
56 map { $todos{$_} = '1' } (
63 sub format_test($$$) {
64 my ($test, $count, $debug) = @_;
66 # Create a test entry, with TODO set if it is one of the known problem
69 $debug = "" unless $DEBUG;
71 my $todo = (exists $todos{$count}) ? "Known problem" : 0;
73 return qq[TODO: { local \$::TODO = "$todo"; ok(eval '$test', '$test; $debug'); }];
76 my %tests; # The final set of tests. keys are the code points to test
80 # First, analyze the current Unicode's folding rules
82 my $file="../lib/unicore/CaseFolding.txt";
83 open my $fh, "<", $file or die "Failed to read '$file': $!";
87 # Lines look like (though without the initial '#')
88 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
90 my ($line, $comment) = split / \s+ \# \s+ /x, $_;
91 next if $line eq "" || substr($line, 0, 1) eq '#';
92 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
94 my $from = hex $hex_from;
96 if ($fold_type eq 'F') {
97 next; # XXX TODO multi-char folds
98 my $from_range_type = range_type($from);
99 @folded = map { hex $_ } @folded;
101 # Include three code points that are handled internally by the regex
102 # engine specially, plus all non-above-255 multi folds (which actually
103 # the only one is already included in the three, but this makes sure)
104 # And if any member of the fold is not the same range type as the
105 # source, add it directly to the tests. It needs to be an array of an
106 # array, so that it is distinguished from multiple single folds
107 if ($from == 0xDF || $from == 0x390 || $from == 0x3B0
108 || $from_range_type != $Unicode
109 || grep { range_type($_) != $from_range_type } @folded)
111 $tests{$from} = [ [ @folded ] ];
115 # Must be Unicode here, so chr is automatically utf8. Get the
116 # number of bytes in each. This is because the optimizer cares
117 # about length differences.
118 my $from_length = length encode('utf-8', chr($from));
119 my $to_length = length encode('utf-8', pack 'U*', @folded);
120 push @{$multi_folds{$from_length}{$to_length}}, { $from => [ @folded ] };
124 # Perl only deals with C and F folds
125 next if $fold_type ne 'C';
127 # C folds are single-char $from to single-char $folded, in chr terms
128 # folded_from{'s'} = [ 'S', \N{LATIN SMALL LETTER LONG S} ]
129 push @{$folded_from{hex $folded[0]}}, $from;
132 # Now try to sort the single char folds into equivalence classes that are
133 # likely to have identical successes and failures. Any fold that crosses
134 # range types is suspect, and is automatically tested. Otherwise, store by
135 # the number of characters that participate in a fold. Likely all folds in a
136 # range type that fold to each other like B->b->B will have identical success
137 # and failure; similarly all folds that have three characters participating
138 # are likely to have the same successes and failures, etc.
139 foreach my $folded (sort numerically keys %folded_from) {
140 my $target_range_type = range_type($folded);
141 my $count = @{$folded_from{$folded}};
143 # Automatically test any fold that crosses range types
144 if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}})
146 $tests{$folded} = $folded_from{$folded};
149 push @{$simple_folds{$target_range_type}{$count}},
150 { $folded => $folded_from{$folded} };
154 foreach my $from_length (keys %multi_folds) {
155 foreach my $fold_length (keys %{$multi_folds{$from_length}}) {
156 #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length};
157 foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) {
158 #print __LINE__, ": $from_length, $fold_length, $test:\n";
159 my ($target, $pattern) = each %$test;
160 #print __LINE__, ": $target: $pattern\n";
161 $tests{$target} = $pattern;
162 last if $skip_apparently_redundant;
167 # Add in tests for single character folds. Add tests for each range type,
168 # and within those tests for each number of characters participating in a
169 # fold. Thus B->b has two characters participating. But K->k and Kelvin
170 # Sign->k has three characters participating. So we would make sure that
171 # there is a test for 3 chars, 4 chars, ... . (Note that the 'k' example is a
172 # bad one because it crosses range types, so is automatically tested. In the
173 # Unicode range there are various of these 3 and 4 char classes, but aren't as
174 # easily described as the 'k' one.)
175 foreach my $type (keys %simple_folds) {
176 foreach my $count (keys %{$simple_folds{$type}}) {
177 foreach my $test (@{$simple_folds{$type}{$count}}) {
178 my ($target, $pattern) = each %$test;
179 $tests{$target} = $pattern;
180 last if $skip_apparently_redundant;
185 # For each range type, test additionally a character that folds to itself
186 $tests{0x3A} = [ 0x3A ];
187 $tests{0xF7} = [ 0xF7 ];
188 $tests{0x2C7} = [ 0x2C7 ];
190 my $clump_execs = 10000; # Speed up by building an 'exec' of many tests
193 # For use by pairs() in generating combinations
199 # Returns all ordered combinations of pairs of elements from the input array.
200 # It doesn't return pairs like (a, a), (b, b). Change the slice to an array
201 # to do that. This was just to have fewer tests.
203 #print __LINE__, ": ", join(" XXX ", @_), "\n";
204 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
208 # Finally ready to do the tests
210 foreach my $test (sort { numerically } keys %tests) {
213 my $previous_pattern;
214 my @pairs = pairs(sort numerically $test, @{$tests{$test}});
216 # Each fold can be viewed as a closure of all the characters that
217 # participate in it. Look at each possible pairing from a closure, with the
218 # first member of the pair the target string to match against, and the
219 # second member forming the pattern. Thus each fold member gets tested as
220 # the string, and the pattern with every other member in the opposite role.
221 while (my $pair = shift @pairs) {
222 my ($target, $pattern) = @$pair;
224 # When testing a char that doesn't fold, we can get the same
225 # permutation twice; so skip all but the first.
226 next if $previous_target
227 && $previous_target == $target
228 && $previous_pattern == $pattern;
229 ($previous_target, $previous_pattern) = ($target, $pattern);
231 # Each side may be either a single char or a string. Extract each into an
232 # array (perhaps of length 1)
233 my @target, my @pattern;
234 @target = (ref $target) ? @$target : $target;
235 @pattern = (ref $pattern) ? @$pattern : $pattern;
237 # Have to convert non-utf8 chars to native char set
238 @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
239 @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
242 my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
243 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
245 my $target_above_latin1 = grep { $_ > 255 } @target;
246 my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
247 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
249 # We don't test multi-char folding into other multi-chars. We are testing
250 # a code point that folds to or from other characters. Find the single
251 # code point for diagnostic purposes. (If both are single, choose the
253 my $ord = @target == 1 ? $target[0] : $pattern[0];
254 my $progress = sprintf "\"%s\" and /%s/",
256 join("", @x_pattern);
257 #print $progress, "\n";
260 # Now grind out tests, using various combinations.
261 # XXX foreach my $charset ('d', 'u', 'l') {
262 foreach my $charset ('d', 'u') {
263 foreach my $utf8_target (0, 1) { # Both utf8 and not, for
265 my $upgrade_target = "";
267 # These must already be in utf8 because the string to match has
268 # something above latin1. So impossible to test if to not to be in
269 # utf8; and otherwise, no upgrade is needed.
270 next if $target_above_latin1 && ! $utf8_target;
271 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
273 foreach my $utf8_pattern (0, 1) {
274 next if $pattern_above_latin1 && ! $utf8_pattern;
275 my $uni_semantics = $utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern);
276 my $upgrade_pattern = "";
277 $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
279 my $lhs = join "", @x_target;
280 my @rhs = @x_pattern;
281 my $rhs = join "", @rhs;
282 my $should_fail = ! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self;
284 # Do simple tests of referencing capture buffers, named and
287 $op = '!~' if $should_fail;
289 my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
290 push @eval_tests, format_test($eval, ++$count, "");
292 $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
293 push @eval_tests, format_test($eval, ++$count, "");
296 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
297 push @eval_tests, format_test($eval, ++$count, "");
299 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
300 push @eval_tests, format_test($eval, ++$count, "");
303 foreach my $bracketed (0, 1) { # Put rhs in [...], or not
304 foreach my $inverted (0,1) {
305 next if $inverted && ! $bracketed; # inversion only valid in [^...]
307 # In some cases, add an extra character that doesn't fold, and
308 # looks ok in the output.
309 my $extra_char = "_";
310 foreach my $prepend ("", $extra_char) {
311 foreach my $append ("", $extra_char) {
313 # Assemble the rhs. Put each character in a separate
314 # bracketed if using charclasses. This creates a stress on
315 # the code to span a match across multiple elements
317 foreach my $rhs_char (@rhs) {
318 $rhs .= '[' if $bracketed;
319 $rhs .= '^' if $inverted;
322 # Add a character to the class, so class doesn't get
324 $rhs .= '_]' if $bracketed;
327 # Add one of: no capturing parens
330 # Use quantifiers and extra variable width matches inside
331 # them to keep some optimizations from happening
332 foreach my $parend (0, 1, 2) {
333 my $interior = (! $parend)
338 foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
340 # A ? or * quantifier normally causes the thing to be
341 # able to match a null string
342 my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';
344 # But since we only quantify the last character in a
345 # multiple fold, the other characters will have width,
346 # except if we are quantifying the whole rhs
347 my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);
349 foreach my $l_anchor ("", '^') { # '\A' didn't change result)
350 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)
352 # The folded part can match the null string if it
353 # isn't required to have width, and there's not
354 # something on one or both sides that force it to.
355 my $must_match = ! $can_match_null || ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
356 #next unless $must_match;
357 my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
359 if ($must_match && $should_fail) {
364 $op = ! $op if $must_match && $inverted;
365 $op = ($op) ? '=~' : '!~';
367 my $debug .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";
368 $debug .= "; pattern_above_latin1=$pattern_above_latin1; utf8_pattern=$utf8_pattern";
369 my $eval = "my \$c = \"$prepend$lhs$append\"; my \$p = qr/$quantified/i;$upgrade_target$upgrade_pattern \$c $op \$p";
371 # XXX Doesn't currently test multi-char folds in pattern
372 next if @pattern != 1;
373 push @eval_tests, format_test($eval, ++$count, $debug);
376 if (@eval_tests >= $clump_execs) {
377 #eval "use re qw(Debug COMPILE EXECUTE);" . join ";\n", @eval_tests;
378 eval join ";\n", @eval_tests;
399 # Finish up any tests not already done
400 eval join ";\n", @eval_tests;