This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move the tests that blow away %ENV to the end of t/op/magic.t
[perl5.git] / t / re / fold_grind.t
CommitLineData
371a505e 1# Grind out a lot of combinatoric tests for folding.
a2d9a01a
KW
2
3use charnames ":full";
4
5binmode STDOUT, ":utf8";
6
7BEGIN {
8 chdir 't' if -d 't';
9 @INC = '../lib';
10 require './test.pl';
a59efd0a 11 skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
a2d9a01a
KW
12}
13
29d01a3e 14my $DEBUG = 0; # Outputs extra information for debugging this .t
abf4d645 15
a2d9a01a
KW
16use strict;
17use warnings;
d08723ac 18use Encode;
a59efd0a 19use POSIX;
a2d9a01a
KW
20
21# Tests both unicode and not, so make sure not implicitly testing unicode
22no feature 'unicode_strings';
23
24# Case-insensitive matching is a large and complicated issue. Perl does not
25# implement it fully, properly. For example, it doesn't include normalization
26# as part of the equation. To test every conceivable combination is clearly
27# impossible; these tests are mostly drawn from visual inspection of the code
28# and experience, trying to exercise all areas.
29
30# There are three basic ranges of characters that Perl may treat differently:
31# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
32# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants
33# are all controls that fold to themselves.
34my $ASCII = 1;
35
36# 2) Other characters that fit into a byte but are different in utf8 than not;
37# here referred to, taking some liberties, as Latin1.
38my $Latin1 = 2;
39
40# 3) Characters that won't fit in a byte; here referred to as Unicode
41my $Unicode = 3;
42
43# Within these basic groups are equivalence classes that testing any character
44# in is likely to lead to the same results as any other character. This is
45# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
46# set.
47my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
48
49sub range_type {
50 my $ord = shift;
51
52 return $ASCII if $ord < 128;
53 return $Latin1 if $ord < 256;
54 return $Unicode;
55}
56
57sub numerically {
58 return $a <=> $b
59}
60
7a0a13a6
KW
61sub format_test($$$$) {
62 my ($test, $count, $todo, $debug) = @_;
abf4d645 63
29d01a3e
KW
64 # Create a test entry, with TODO set if it is one of the known problem
65 # code points
66
abf4d645 67 $debug = "" unless $DEBUG;
7a0a13a6 68 $todo = "Known problem" if $todo;
abf4d645
KW
69
70 return qq[TODO: { local \$::TODO = "$todo"; ok(eval '$test', '$test; $debug'); }];
71}
72
29d01a3e 73my %tests; # The final set of tests. keys are the code points to test
a2d9a01a
KW
74my %simple_folds;
75my %multi_folds;
76
77# First, analyze the current Unicode's folding rules
78my %folded_from;
79my $file="../lib/unicore/CaseFolding.txt";
80open my $fh, "<", $file or die "Failed to read '$file': $!";
81while (<$fh>) {
82 chomp;
83
84 # Lines look like (though without the initial '#')
85 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
86
87 my ($line, $comment) = split / \s+ \# \s+ /x, $_;
88 next if $line eq "" || substr($line, 0, 1) eq '#';
89 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
90
91 my $from = hex $hex_from;
92
93 if ($fold_type eq 'F') {
27f6057f
KW
94 my $from_range_type = range_type($from);
95
96 # If we were testing comprehensively, we would try every combination
97 # of upper and lower case in the fold, but it is quite likely that if
98 # the code can handle all combinations if it can handle the cases
99 # where everything is upper and when everything is lower. Because of
100 # complement matching, we need to do both. And we use the
101 # reverse-fold instead of uppercase.
a2d9a01a 102 @folded = map { hex $_ } @folded;
27f6057f
KW
103 # XXX better to use reverse fold of these instead of uc
104 my @uc_folded = map { ord uc chr $_ } @folded;
a2d9a01a
KW
105
106 # Include three code points that are handled internally by the regex
29d01a3e 107 # engine specially, plus all non-above-255 multi folds (which actually
a2d9a01a
KW
108 # the only one is already included in the three, but this makes sure)
109 # And if any member of the fold is not the same range type as the
110 # source, add it directly to the tests. It needs to be an array of an
111 # array, so that it is distinguished from multiple single folds
112 if ($from == 0xDF || $from == 0x390 || $from == 0x3B0
113 || $from_range_type != $Unicode
114 || grep { range_type($_) != $from_range_type } @folded)
115 {
27f6057f 116 $tests{$from} = [ [ @folded ], [ @uc_folded ] ];
a2d9a01a
KW
117 }
118 else {
119
27f6057f
KW
120 # The only multi-char non-utf8 fold is DF, which is handled above,
121 # so here chr() must be utf8. Get the number of bytes in each.
122 # This is because the optimizer cares about length differences.
123 my $from_length = length encode('UTF-8', chr($from));
124 my $to_length = length encode('UTF-8', pack 'U*', @folded);
125 push @{$multi_folds{$from_length}{$to_length}}, { $from => [ [ @folded ], [ @uc_folded ] ] };
a2d9a01a
KW
126 }
127 }
128
129 # Perl only deals with C and F folds
130 next if $fold_type ne 'C';
131
d2025f57
KW
132 # C folds are single-char $from to single-char $folded, in chr terms
133 # folded_from{'s'} = [ 'S', \N{LATIN SMALL LETTER LONG S} ]
a2d9a01a
KW
134 push @{$folded_from{hex $folded[0]}}, $from;
135}
136
d2025f57 137# Now try to sort the single char folds into equivalence classes that are
a2d9a01a
KW
138# likely to have identical successes and failures. Any fold that crosses
139# range types is suspect, and is automatically tested. Otherwise, store by
140# the number of characters that participate in a fold. Likely all folds in a
141# range type that fold to each other like B->b->B will have identical success
142# and failure; similarly all folds that have three characters participating
143# are likely to have the same successes and failures, etc.
144foreach my $folded (sort numerically keys %folded_from) {
145 my $target_range_type = range_type($folded);
146 my $count = @{$folded_from{$folded}};
147
148 # Automatically test any fold that crosses range types
149 if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}})
150 {
151 $tests{$folded} = $folded_from{$folded};
152 }
153 else {
154 push @{$simple_folds{$target_range_type}{$count}},
155 { $folded => $folded_from{$folded} };
a7caa9e8 156 }
a2d9a01a
KW
157}
158
159foreach my $from_length (keys %multi_folds) {
160 foreach my $fold_length (keys %{$multi_folds{$from_length}}) {
161 #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length};
162 foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) {
163 #print __LINE__, ": $from_length, $fold_length, $test:\n";
164 my ($target, $pattern) = each %$test;
165 #print __LINE__, ": $target: $pattern\n";
166 $tests{$target} = $pattern;
167 last if $skip_apparently_redundant;
168 }
169 }
170}
171
172# Add in tests for single character folds. Add tests for each range type,
173# and within those tests for each number of characters participating in a
174# fold. Thus B->b has two characters participating. But K->k and Kelvin
175# Sign->k has three characters participating. So we would make sure that
176# there is a test for 3 chars, 4 chars, ... . (Note that the 'k' example is a
177# bad one because it crosses range types, so is automatically tested. In the
178# Unicode range there are various of these 3 and 4 char classes, but aren't as
179# easily described as the 'k' one.)
180foreach my $type (keys %simple_folds) {
181 foreach my $count (keys %{$simple_folds{$type}}) {
182 foreach my $test (@{$simple_folds{$type}{$count}}) {
183 my ($target, $pattern) = each %$test;
184 $tests{$target} = $pattern;
185 last if $skip_apparently_redundant;
186 }
187 }
188}
189
190# For each range type, test additionally a character that folds to itself
191$tests{0x3A} = [ 0x3A ];
192$tests{0xF7} = [ 0xF7 ];
193$tests{0x2C7} = [ 0x2C7 ];
194
29069c2e 195my $clump_execs = 1000; # Speed up by building an 'exec' of many tests
a2d9a01a
KW
196my @eval_tests;
197
2f7f8cb1
KW
198# To cut down on the number of tests
199my $has_tested_aa_above_latin1;
200my $has_tested_latin1_aa;
17580e7a
KW
201my $has_tested_l_above_latin1;
202my $has_tested_latin1_l;
2f7f8cb1 203
a2d9a01a
KW
204# For use by pairs() in generating combinations
205sub prefix {
206 my $p = shift;
a7caa9e8 207 map [ $p, $_ ], @_
a2d9a01a
KW
208}
209
210# Returns all ordered combinations of pairs of elements from the input array.
211# It doesn't return pairs like (a, a), (b, b). Change the slice to an array
212# to do that. This was just to have fewer tests.
a7caa9e8 213sub pairs (@) {
a2d9a01a 214 #print __LINE__, ": ", join(" XXX ", @_), "\n";
a7caa9e8 215 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
a2d9a01a
KW
216}
217
a59efd0a
KW
218my @charsets = qw(d u aa);
219my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
220push @charsets, 'l' if $current_locale eq 'C';
a2d9a01a
KW
221
222# Finally ready to do the tests
abf4d645 223my $count=0;
a2d9a01a
KW
224foreach my $test (sort { numerically } keys %tests) {
225
226 my $previous_target;
227 my $previous_pattern;
228 my @pairs = pairs(sort numerically $test, @{$tests{$test}});
229
230 # Each fold can be viewed as a closure of all the characters that
231 # participate in it. Look at each possible pairing from a closure, with the
232 # first member of the pair the target string to match against, and the
233 # second member forming the pattern. Thus each fold member gets tested as
234 # the string, and the pattern with every other member in the opposite role.
235 while (my $pair = shift @pairs) {
236 my ($target, $pattern) = @$pair;
237
238 # When testing a char that doesn't fold, we can get the same
239 # permutation twice; so skip all but the first.
240 next if $previous_target
241 && $previous_target == $target
242 && $previous_pattern == $pattern;
243 ($previous_target, $previous_pattern) = ($target, $pattern);
244
245 # Each side may be either a single char or a string. Extract each into an
246 # array (perhaps of length 1)
247 my @target, my @pattern;
248 @target = (ref $target) ? @$target : $target;
249 @pattern = (ref $pattern) ? @$pattern : $pattern;
250
251 # Have to convert non-utf8 chars to native char set
252 @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
253 @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
254
255 # Get in hex form.
256 my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
257 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
258
259 my $target_above_latin1 = grep { $_ > 255 } @target;
260 my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
2f7f8cb1
KW
261 my $target_has_ascii = grep { $_ < 128 } @target;
262 my $pattern_has_ascii = grep { $_ < 128 } @pattern;
17580e7a
KW
263 my $target_has_latin1 = grep { $_ < 256 } @target;
264 my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
a2d9a01a
KW
265 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
266
267 # We don't test multi-char folding into other multi-chars. We are testing
268 # a code point that folds to or from other characters. Find the single
269 # code point for diagnostic purposes. (If both are single, choose the
270 # target string)
271 my $ord = @target == 1 ? $target[0] : $pattern[0];
7fea222d
KW
272 my $progress = sprintf "%04X: \"%s\" and /%s/",
273 $test,
a2d9a01a
KW
274 join("", @x_target),
275 join("", @x_pattern);
276 #print $progress, "\n";
277 #diag $progress;
278
279 # Now grind out tests, using various combinations.
a59efd0a 280 foreach my $charset (@charsets) {
2f7f8cb1
KW
281
282 # /aa should only affect things with folds in the ASCII range. But, try
283 # it on one pair in the other ranges just to make sure it doesn't break
284 # them. Set these flags. They are set to the ord of the character
285 # tested so that all pairs of that ord get tested.
286 if ($charset eq 'aa') {
287 if (! $target_has_ascii && ! $pattern_has_ascii) {
288 if ($target_above_latin1 || $pattern_above_latin1) {
289 next if defined $has_tested_aa_above_latin1
41ce0a5e
KW
290 && $has_tested_aa_above_latin1 != $test;
291 $has_tested_aa_above_latin1 = $test;
2f7f8cb1 292 }
41ce0a5e
KW
293 next if defined $has_tested_latin1_aa && $has_tested_latin1_aa != $test;
294 $has_tested_latin1_aa = $test;
2f7f8cb1
KW
295 }
296 }
17580e7a
KW
297 elsif ($charset eq 'l') {
298 if (! $target_has_latin1 && ! $pattern_has_latin1) {
299 next if defined $has_tested_latin1_l && $has_tested_latin1_l != $test;
300 $has_tested_latin1_l = $test;
301 }
302 }
2f7f8cb1 303
a2d9a01a
KW
304 foreach my $utf8_target (0, 1) { # Both utf8 and not, for
305 # code points < 256
306 my $upgrade_target = "";
307
308 # These must already be in utf8 because the string to match has
309 # something above latin1. So impossible to test if to not to be in
310 # utf8; and otherwise, no upgrade is needed.
311 next if $target_above_latin1 && ! $utf8_target;
d08723ac 312 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
a2d9a01a 313
d08723ac
KW
314 foreach my $utf8_pattern (0, 1) {
315 next if $pattern_above_latin1 && ! $utf8_pattern;
17580e7a
KW
316
317 # Our testing of 'l' uses the POSIX locale, which is ASCII-only
318 my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
a2d9a01a 319 my $upgrade_pattern = "";
d08723ac 320 $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
a2d9a01a
KW
321
322 my $lhs = join "", @x_target;
323 my @rhs = @x_pattern;
371a505e 324 my $rhs = join "", @rhs;
2f7f8cb1 325 my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
17580e7a
KW
326 || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
327 || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
371a505e
KW
328
329 # Do simple tests of referencing capture buffers, named and
330 # numbered.
331 my $op = '=~';
332 $op = '!~' if $should_fail;
d2025f57 333
7a0a13a6
KW
334 # I'm afraid this was derived from trial and error.
335 my $todo = ($test == 0xdf
336 && $lhs =~ /DF/
337 && $uni_semantics
338 && ($charset eq 'u' || $charset eq 'd')
339 && ! ($charset eq 'u' && (($upgrade_target eq "") != ($upgrade_pattern eq "")))
340 && ! ($charset eq 'd' && (! $upgrade_target || ! $upgrade_pattern))
341 );
371a505e 342 my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
7a0a13a6 343 push @eval_tests, format_test($eval, ++$count, $todo, "");
abf4d645 344
371a505e 345 $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
7a0a13a6 346 push @eval_tests, format_test($eval, ++$count, $todo, "");
abf4d645 347
371a505e
KW
348 if ($lhs ne $rhs) {
349 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
7a0a13a6 350 push @eval_tests, format_test($eval, ++$count, "", "");
abf4d645 351
371a505e 352 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
7a0a13a6 353 push @eval_tests, format_test($eval, ++$count, "", "");
371a505e 354 }
371a505e 355
1ef17b72 356 foreach my $bracketed (0, 1) { # Put rhs in [...], or not
a2d9a01a 357 foreach my $inverted (0,1) {
d2025f57 358 next if $inverted && ! $bracketed; # inversion only valid in [^...]
a2d9a01a
KW
359
360 # In some cases, add an extra character that doesn't fold, and
361 # looks ok in the output.
362 my $extra_char = "_";
363 foreach my $prepend ("", $extra_char) {
364 foreach my $append ("", $extra_char) {
a2d9a01a
KW
365
366 # Assemble the rhs. Put each character in a separate
367 # bracketed if using charclasses. This creates a stress on
368 # the code to span a match across multiple elements
369 my $rhs = "";
370 foreach my $rhs_char (@rhs) {
371 $rhs .= '[' if $bracketed;
372 $rhs .= '^' if $inverted;
373 $rhs .= $rhs_char;
374
375 # Add a character to the class, so class doesn't get
376 # optimized out
377 $rhs .= '_]' if $bracketed;
378 }
379
380 # Add one of: no capturing parens
381 # a single set
382 # a nested set
383 # Use quantifiers and extra variable width matches inside
384 # them to keep some optimizations from happening
385 foreach my $parend (0, 1, 2) {
386 my $interior = (! $parend)
387 ? $rhs
388 : ($parend == 1)
389 ? "(${rhs},?)"
390 : "((${rhs})+,?)";
391 foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
392
393 # A ? or * quantifier normally causes the thing to be
394 # able to match a null string
395 my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';
396
397 # But since we only quantify the last character in a
398 # multiple fold, the other characters will have width,
399 # except if we are quantifying the whole rhs
400 my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);
401
402 foreach my $l_anchor ("", '^') { # '\A' didn't change result)
403 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)
404
405 # The folded part can match the null string if it
406 # isn't required to have width, and there's not
407 # something on one or both sides that force it to.
2f7f8cb1
KW
408 my $both_sides = ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
409 my $must_match = ! $can_match_null || $both_sides;
410 # for performance, but doing this missed many failures
a2d9a01a 411 #next unless $must_match;
d08723ac 412 my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
a2d9a01a 413 my $op;
d08723ac 414 if ($must_match && $should_fail) {
a2d9a01a
KW
415 $op = 0;
416 } else {
417 $op = 1;
418 }
419 $op = ! $op if $must_match && $inverted;
27f6057f
KW
420
421 if ($inverted && @target > 1) {
422 # When doing an inverted match against a
423 # multi-char target, and there is not something on
424 # the left to anchor the match, if it shouldn't
425 # succeed, skip, as what will happen (when working
426 # correctly) is that it will match the first
427 # position correctly, and then be inverted to not
428 # match; then it will go to the second position
429 # where it won't match, but get inverted to match,
430 # and hence succeeding.
431 next if ! ($l_anchor || $prepend) && ! $op;
432
433 # Can't ever match for latin1 code points non-uni
434 # semantics that have a inverted multi-char fold
435 # when there is something on both sides and the
436 # quantifier isn't such as to span the required
437 # width, which is 2 or 3.
438 $op = 0 if $ord < 255
439 && ! $uni_semantics
440 && $both_sides
441 && ( ! $quantifier || $quantifier eq '?')
442 && $parend < 2;
443
444 # Similarly can't ever match when inverting a multi-char
445 # fold for /aa and the quantifier isn't sufficient
446 # to allow it to span to both sides.
447 $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
448
449 # Or for /l
450 $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
451 }
452
a2d9a01a
KW
453 $op = ($op) ? '=~' : '!~';
454
abf4d645
KW
455 my $debug .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";
456 $debug .= "; pattern_above_latin1=$pattern_above_latin1; utf8_pattern=$utf8_pattern";
457 my $eval = "my \$c = \"$prepend$lhs$append\"; my \$p = qr/$quantified/i;$upgrade_target$upgrade_pattern \$c $op \$p";
a2d9a01a 458
abf4d645 459 # XXX Doesn't currently test multi-char folds in pattern
a2d9a01a 460 next if @pattern != 1;
7a0a13a6 461 push @eval_tests, format_test($eval, ++$count, "", $debug);
a2d9a01a
KW
462
463 # Group tests
464 if (@eval_tests >= $clump_execs) {
abf4d645 465 #eval "use re qw(Debug COMPILE EXECUTE);" . join ";\n", @eval_tests;
a2d9a01a 466 eval join ";\n", @eval_tests;
abf4d645
KW
467 if ($@) {
468 fail($@);
469 exit 1;
470 }
a2d9a01a
KW
471 undef @eval_tests;
472 }
473 }
474 }
475 }
476 }
477 }
478 }
479 }
480 }
481 }
482 }
483 }
484 }
485}
486
487# Finish up any tests not already done
488eval join ";\n", @eval_tests;
abf4d645
KW
489if ($@) {
490 fail($@);
491 exit 1;
492}
a2d9a01a 493
abf4d645 494plan($count);
a2d9a01a
KW
495
4961