1 # Grind out a lot of combinatoric tests for folding.
3 binmode STDOUT, ":utf8";
9 require Config; import Config;
10 skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
13 use charnames ":full";
15 my $DEBUG = 0; # Outputs extra information for debugging this .t
22 # Special-cased characters in the .c's that we want to make sure get tested.
23 my %be_sure_to_test = (
24 "\xDF" => 1, # LATIN_SMALL_LETTER_SHARP_S
25 "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S
26 "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
27 "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
28 "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
29 "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
33 # Tests both unicode and not, so make sure not implicitly testing unicode
34 no feature 'unicode_strings';
36 # Case-insensitive matching is a large and complicated issue. Perl does not
37 # implement it fully, properly. For example, it doesn't include normalization
38 # as part of the equation. To test every conceivable combination is clearly
39 # impossible; these tests are mostly drawn from visual inspection of the code
40 # and experience, trying to exercise all areas.
42 # There are three basic ranges of characters that Perl may treat differently:
43 # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
44 # referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants
45 # are all controls that fold to themselves.
48 # 2) Other characters that fit into a byte but are different in utf8 than not;
49 # here referred to, taking some liberties, as Latin1.
52 # 3) Characters that won't fit in a byte; here referred to as Unicode
55 # Within these basic groups are equivalence classes, in which testing any one
56 # character is likely to lead to the same results as any other. This is
57 # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
59 my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
61 # Additionally, parts of this test run a lot of subtests; outputting the
62 # resulting TAP can be expensive, so the tests are summarised internally. The
63 # PERL_DEBUG_FULL_TEST environment variable can be set to produce the full
64 # output for debugging purposes.
69 return $ASCII if $ord < 128;
70 return $Latin1 if $ord < 256;
78 my $list_all_tests = $ENV{PERL_DEBUG_FULL_TEST} || $DEBUG;
79 $| = 1 if $list_all_tests;
81 # Significant time is saved by not outputting each test but grouping the
82 # output into subtests
83 my $okays; # Number of ok's in current subtest
84 my $this_iteration; # Number of possible tests in current subtest
85 my $count=0; # Number of subtests = number of total tests
88 my ($test, $todo, $debug) = @_;
90 $debug = "" unless $DEBUG;
93 if (!$res || $list_all_tests) {
94 # Failed or debug; output the result
96 ok($res, "$test; $debug");
98 # Just count the test as passed
104 my %has_test_by_participants; # Makes sure there are tests for each range and each
105 # number of characters that fold to the same
107 my %has_test_by_byte_count; # Makes sure there are tests for each combination of
108 # n bytes folds to m bytes
110 my %tests; # The set of tests.
111 # Each key is a code point that folds to something else.
112 # Each value is a list of things that the key folds to. If the 'thing' is a
113 # single code point, it is that ordinal. If it is a multi-char fold, it is an
114 # ordered list of the code points in that fold. Here's an example for 'S':
115 # '83' => [ 115, 383 ]
117 # And one for a multi-char fold: \xDF
127 # [ # LATIN SMALL LETTER LONG S
131 # 7838 # LATIN_CAPITAL_LETTER_SHARP_S
134 my %folds; # keys are code points that fold;
135 # values are each a list of code points the key folds to
136 my %inverse_folds; # keys are strings of the folded-to;
137 # values are lists of characters that fold to them
140 my ($to, @from) = @_;
142 # Called to cause the input to be tested by adding to %tests. @from is
143 # the list of characters that fold to the string $to. @from should be
144 # sorted so the lowest code point is first....
145 # The input is in string form; %tests uses code points, so have to
148 my $to_chars = length $to;
149 my @test_to; # List of tests for $to
151 if ($to_chars == 1) {
155 push @test_to, [ map { ord $_ } split "", $to ];
157 # For multi-char folds, we also test that things that can fold to each
158 # individual character in the fold also work. If we were testing
159 # comprehensively, we would try every combination of upper and lower
160 # case in the fold, but it will have to suffice to avoid running
161 # forever to make sure that each thing that folds to these is tested
162 # at least once. Because of complement matching ([^...]), we need to
163 # do both the folded, and the folded-from.
164 # We first look at each character in the multi-char fold, and save how
165 # many characters fold to it; and also the maximum number of such
167 my @folds_to_count; # 0th char in fold is index 0 ...
168 my $max_folds_to = 0;
170 for (my $i = 0; $i < $to_chars; $i++) {
171 my $to_char = substr($to, $i, 1);
172 if (exists $inverse_folds{$to_char}) {
173 $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}};
174 $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i];
177 $folds_to_count[$i] = 0;
181 # We will need to generate as many tests as the maximum number of
182 # folds, so that each fold will have at least one test.
183 # For example, consider character X which folds to the three character
184 # string 'xyz'. If 2 things fold to x (X and x), 4 to y (Y, Y'
185 # (Y-prime), Y'' (Y-prime-prime), and y), and 1 thing to z (itself), 4
186 # tests will be generated:
191 for (my $i = 0; $i < $max_folds_to; $i++) {
192 my @this_test_to; # Assemble a single test
194 # For each character in the multi-char fold ...
195 for (my $j = 0; $j < $to_chars; $j++) {
196 my $this_char = substr($to, $j, 1);
198 # Use its corresponding inverse fold, if available.
199 if ($i < $folds_to_count[$j]) {
200 push @this_test_to, ord $inverse_folds{$this_char}[$i];
202 else { # Or else itself.
203 push @this_test_to, ord $this_char;
207 # Add this test to the list
208 push @test_to, [ @this_test_to ];
211 # Here, have assembled all the tests for the multi-char fold. Sort so
212 # lowest code points are first for consistency and aesthetics in
213 # output. We know there are at least two characters in the fold, but
214 # I haven't bothered to worry about sorting on an optional third
215 # character if the first two are identical.
216 @test_to = sort { ($a->[0] == $b->[0])
217 ? $a->[1] <=> $b->[1]
218 : $a->[0] <=> $b->[0]
223 # This test is from n bytes to m bytes. Record that so won't try to add
224 # another test that does the same.
226 my $to_bytes = length $to;
227 foreach my $from_map (@from) {
228 $has_test_by_byte_count{length $from_map}{$to_bytes} = $to;
232 my $ord_smallest_from = ord shift @from;
233 if (exists $tests{$ord_smallest_from}) {
234 die "There are already tests for $ord_smallest_from"
237 # Add in the fold tests,
238 push @{$tests{$ord_smallest_from}}, @test_to;
240 # Then any remaining froms in the equivalence class.
241 push @{$tests{$ord_smallest_from}}, map { ord $_ } @from;
244 # Get the Unicode rules and construct inverse mappings from them
247 my $file="../lib/unicore/CaseFolding.txt";
249 # Use the Unicode data file if we are on an ASCII platform (which its data is
250 # for), and it is in the modern format (starting in Unicode 3.1.0) and it is
251 # available. This avoids being affected by potential bugs introduced by other
254 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
255 && open my $fh, "<", $file)
260 # Lines look like (though without the initial '#')
261 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
263 # Get rid of comments, ignore blank or comment-only lines
264 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
265 next unless length $line;
266 my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line;
268 next if $fold_type =~ / ^ [IT] $/x; # Perl doesn't do Turkish folding
269 next if $fold_type eq 'S'; # If Unicode's tables are correct, the F
270 # should be a superset of S
272 my $from = hex $hex_from;
273 my @to = map { hex $_ } @hex_folded;
274 @{$folds{$from}} = @to;
275 my $folded_str = pack ("U0U*", @to);
276 push @{$inverse_folds{$folded_str}}, chr $from;
279 else { # Here, can't use the .txt file: read the Unicode rules file and
280 # construct inverse mappings from it
282 my ($invlist_ref, $invmap_ref, undef, $default)
283 = Unicode::UCD::prop_invmap('Case_Folding');
284 for my $i (0 .. @$invlist_ref - 1 - 1) {
285 next if $invmap_ref->[$i] == $default;
287 # Make into an array if not so already, so can treat uniformly below
288 $invmap_ref->[$i] = [ $invmap_ref->[$i] ] if ! ref $invmap_ref->[$i];
290 # Each subsequent element of the range requires adjustment of +1 from
291 # the previous element
293 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
295 my @to = map { $_ + $adjust } @{$invmap_ref->[$i]};
296 push @{$folds{$j}}, @to;
297 my $folded_str = pack "U0U*", @to;
298 #note (sprintf "%d: %04X: %s", __LINE__, $j, join " ",
299 # map { sprintf "%04X", $_ + $adjust } @{$invmap_ref->[$i]});
300 push @{$inverse_folds{$folded_str}}, chr $j;
305 # Analyze the data and generate tests to get adequate test coverage. We sort
306 # things so that smallest code points are done first.
308 foreach my $to (sort { (length $a == length $b)
310 : length $a <=> length $b
311 } keys %inverse_folds)
314 # Within each fold, sort so that the smallest code points are done first
315 @{$inverse_folds{$to}} = sort { $a cmp $b } @{$inverse_folds{$to}};
316 my @from = @{$inverse_folds{$to}};
318 # Just add it to the tests if doing complete coverage
319 if (! $skip_apparently_redundant) {
320 add_test($to, @from);
324 my $to_chars = length $to;
325 my $to_range_type = range_type(substr($to, 0, 1));
327 # If this is required to be tested, do so. We check for these first, as
328 # they will take up slots of byte-to-byte combinations that we otherwise
329 # would have to have other tests to get.
330 foreach my $from_map (@from) {
331 if (exists $be_sure_to_test{$from_map}) {
332 add_test($to, @from);
337 # If the fold contains heterogeneous range types, it is suspect and should be
340 foreach my $char (split "", $to) {
341 if (range_type($char) != $to_range_type) {
342 add_test($to, @from);
348 # If the mapping crosses range types, it is suspect and should be tested
349 foreach my $from_map (@from) {
350 if (range_type($from_map) != $to_range_type) {
351 add_test($to, @from);
356 # Here, all components of the mapping are in the same range type. For
357 # single character folds, we test one case in each range type that has 2
358 # participants, 3 participants, etc.
359 if ($to_chars == 1) {
360 if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) {
361 add_test($to, @from);
362 $has_test_by_participants{scalar @from}{$to_range_type} = $to;
367 # We also test all combinations of mappings from m to n bytes. This is
368 # because the regex optimizer cares. (Don't bother worrying about that
369 # Latin1 chars will occupy a different number of bytes under utf8, as
370 # there are plenty of other cases that catch these byte numbers.)
372 my $to_bytes = length $to;
373 foreach my $from_map (@from) {
374 if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) {
375 add_test($to, @from);
381 # For each range type, test additionally a character that folds to itself
382 add_test(chr 0x3A, chr 0x3A);
383 add_test(chr 0xF7, chr 0xF7);
384 add_test(chr 0x2C7, chr 0x2C7);
386 # To cut down on the number of tests
387 my $has_tested_aa_above_latin1;
388 my $has_tested_latin1_aa;
389 my $has_tested_ascii_aa;
390 my $has_tested_l_above_latin1;
391 my $has_tested_above_latin1_l;
392 my $has_tested_ascii_l;
393 my $has_tested_above_latin1_d;
394 my $has_tested_ascii_d;
395 my $has_tested_non_latin1_d;
396 my $has_tested_above_latin1_a;
397 my $has_tested_ascii_a;
398 my $has_tested_non_latin1_a;
400 # For use by pairs() in generating combinations
406 # Returns all ordered combinations of pairs of elements from the input array.
407 # It doesn't return pairs like (a, a), (b, b). Change the slice to an array
408 # to do that. This was just to have fewer tests.
410 #print __LINE__, ": ", join(" XXX ", map { sprintf "%04X", $_ } @_), "\n";
411 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
414 my @charsets = qw(d u a aa);
415 if($Config{d_setlocale}) {
416 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
417 if ($current_locale eq 'C') {
418 require locale; import locale;
420 # Some implementations don't have the 128-255 range characters all
421 # mean nothing under the C locale (an example being VMS). This is
422 # legal, but since we don't know what the right answers should be,
423 # skip the locale tests in that situation.
424 for my $i (128 .. 255) {
426 goto untestable_locale if uc($char) ne $char || lc($char) ne $char;
433 # Finally ready to do the tests
434 foreach my $test (sort { numerically } keys %tests) {
437 my $previous_pattern;
438 my @pairs = pairs(sort numerically $test, @{$tests{$test}});
440 # Each fold can be viewed as a closure of all the characters that
441 # participate in it. Look at each possible pairing from a closure, with the
442 # first member of the pair the target string to match against, and the
443 # second member forming the pattern. Thus each fold member gets tested as
444 # the string, and the pattern with every other member in the opposite role.
445 while (my $pair = shift @pairs) {
446 my ($target, $pattern) = @$pair;
448 # When testing a char that doesn't fold, we can get the same
449 # permutation twice; so skip all but the first.
450 next if $previous_target
451 && $previous_target == $target
452 && $previous_pattern == $pattern;
453 ($previous_target, $previous_pattern) = ($target, $pattern);
455 # Each side may be either a single char or a string. Extract each into an
456 # array (perhaps of length 1)
457 my @target, my @pattern;
458 @target = (ref $target) ? @$target : $target;
459 @pattern = (ref $pattern) ? @$pattern : $pattern;
461 # We are testing just folds to/from a single character. If our pairs
462 # happens to generate multi/multi, skip.
463 next if @target > 1 && @pattern > 1;
465 # Have to convert non-utf8 chars to native char set
466 @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
467 @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
470 my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
471 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
473 my $target_above_latin1 = grep { $_ > 255 } @target;
474 my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
475 my $target_has_ascii = grep { $_ < 128 } @target;
476 my $pattern_has_ascii = grep { $_ < 128 } @pattern;
477 my $target_only_ascii = ! grep { $_ > 127 } @target;
478 my $pattern_only_ascii = ! grep { $_ > 127 } @pattern;
479 my $target_has_latin1 = grep { $_ < 256 } @target;
480 my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target;
481 my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern;
482 my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
483 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
485 # We don't test multi-char folding into other multi-chars. We are testing
486 # a code point that folds to or from other characters. Find the single
487 # code point for diagnostic purposes. (If both are single, choose the
489 my $ord = @target == 1 ? $target[0] : $pattern[0];
490 my $progress = sprintf "%04X: \"%s\" and /%s/",
493 join("", @x_pattern);
496 # Now grind out tests, using various combinations.
497 foreach my $charset (@charsets) {
501 # To cut down somewhat on the enormous quantity of tests this currently
502 # runs, skip some for some of the character sets whose results aren't
503 # likely to differ from others. But run all tests on the code points
504 # that don't fold, plus one other set in each range group.
507 # /aa should only affect things with folds in the ASCII range. But, try
508 # it on one set in the other ranges just to make sure it doesn't break
510 if ($charset eq 'aa') {
512 # It may be that this $pair of code points to test are both
513 # non-ascii, but if either of them actually fold to ascii, that is
514 # suspect and should be tested. So for /aa, use whether their folds
516 my $target_has_ascii = $target_has_ascii;
517 my $pattern_has_ascii = $pattern_has_ascii;
518 if (! $target_has_ascii) {
519 foreach my $cp (@target) {
520 if (exists $folds{$cp}
521 && grep { ord_native_to_latin1($_) < 128 } @{$folds{$cp}} )
523 $target_has_ascii = 1;
528 if (! $pattern_has_ascii) {
529 foreach my $cp (@pattern) {
530 if (exists $folds{$cp}
531 && grep { ord_native_to_latin1($_) < 128 } @{$folds{$cp}} )
533 $pattern_has_ascii = 1;
539 if (! $target_has_ascii && ! $pattern_has_ascii) {
540 if ($target_above_latin1 || $pattern_above_latin1) {
541 next if defined $has_tested_aa_above_latin1
542 && $has_tested_aa_above_latin1 != $test;
543 $has_tested_aa_above_latin1 = $test;
545 next if defined $has_tested_latin1_aa
546 && $has_tested_latin1_aa != $test;
547 $has_tested_latin1_aa = $test;
549 elsif ($target_only_ascii && $pattern_only_ascii) {
551 # And, except for one set just to make sure, skip tests
552 # where both elements in the pair are ASCII. If one works for
553 # aa, the others are likely too. This skips tests where the
554 # fold is from non-ASCII to ASCII, but this part of the test
555 # is just about the ASCII components. (Tracked separately from /l.)
556 next if defined $has_tested_ascii_aa
557 && $has_tested_ascii_aa != $test;
558 $has_tested_ascii_aa = $test;
561 elsif ($charset eq 'l') {
563 # For l, beyond one set, don't need to test those things that are
564 # all above latin1, because unlikely to have different successes
565 # than /u. But, for the same reason as described in the /aa above,
566 # it is suspect and should be tested, if either of the folds are to
568 my $target_has_latin1 = $target_has_latin1;
569 my $pattern_has_latin1 = $pattern_has_latin1;
570 if (! $target_has_latin1) {
571 foreach my $cp (@target) {
572 if (exists $folds{$cp}
573 && grep { $_ < 256 } @{$folds{$cp}} )
575 $target_has_latin1 = 1;
580 if (! $pattern_has_latin1) {
581 foreach my $cp (@pattern) {
582 if (exists $folds{$cp}
583 && grep { $_ < 256 } @{$folds{$cp}} )
585 $pattern_has_latin1 = 1;
590 if (! $target_has_latin1 && ! $pattern_has_latin1) {
591 next if defined $has_tested_above_latin1_l
592 && $has_tested_above_latin1_l != $test;
593 $has_tested_above_latin1_l = $test;
595 elsif ($target_only_ascii && $pattern_only_ascii) {
597 # And, except for one set just to make sure, skip tests
598 # where both elements in the pair are ASCII. This is
599 # essentially the same reasoning as above for /aa.
600 next if defined $has_tested_ascii_l
601 && $has_tested_ascii_l != $test;
602 $has_tested_ascii_l = $test;
605 elsif ($charset eq 'd') {
606 # Similarly for d. Beyond one test (besides self) each, we don't
607 # test pairs that are both ascii; or both above latin1, or are
608 # combinations of ascii and above latin1.
609 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
610 if ($target_has_ascii && $pattern_has_ascii) {
611 next if defined $has_tested_ascii_d
612 && $has_tested_ascii_d != $test;
613 $has_tested_ascii_d = $test
615 elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
616 next if defined $has_tested_above_latin1_d
617 && $has_tested_above_latin1_d != $test;
618 $has_tested_above_latin1_d = $test;
621 next if defined $has_tested_non_latin1_d
622 && $has_tested_non_latin1_d != $test;
623 $has_tested_non_latin1_d = $test;
627 elsif ($charset eq 'a') {
628 # Similarly for a. This should match identically to /u, so wasn't
629 # tested at all until a bug was found that was thereby missed.
630 # As a compromise, beyond one test (besides self) each, we don't
631 # test pairs that are both ascii; or both above latin1, or are
632 # combinations of ascii and above latin1.
633 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
634 if ($target_has_ascii && $pattern_has_ascii) {
635 next if defined $has_tested_ascii_a
636 && $has_tested_ascii_a != $test;
637 $has_tested_ascii_a = $test
639 elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
640 next if defined $has_tested_above_latin1_a
641 && $has_tested_above_latin1_a != $test;
642 $has_tested_above_latin1_a = $test;
645 next if defined $has_tested_non_latin1_a
646 && $has_tested_non_latin1_a != $test;
647 $has_tested_non_latin1_a = $test;
653 foreach my $utf8_target (0, 1) { # Both utf8 and not, for
655 my $upgrade_target = "";
657 # These must already be in utf8 because the string to match has
658 # something above latin1. So it is impossible to test them when not in
659 # utf8; and otherwise, no upgrade is needed.
660 next if $target_above_latin1 && ! $utf8_target;
661 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
663 foreach my $utf8_pattern (0, 1) {
664 next if $pattern_above_latin1 && ! $utf8_pattern;
666 # Our testing of 'l' uses the POSIX locale, which is ASCII-only
667 my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
668 my $upgrade_pattern = "";
669 $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
671 my $lhs = join "", @x_target;
672 my $lhs_str = eval qq{"$lhs"}; fail($@) if $@;
673 my @rhs = @x_pattern;
674 my $rhs = join "", @rhs;
675 my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
676 || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
677 || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
679 # Do simple tests of referencing capture buffers, named and
682 $op = '!~' if $should_fail;
684 my $todo = 0; # No longer any todo's
685 my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
686 run_test($eval, $todo, "");
688 $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
689 run_test($eval, $todo, "");
692 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
693 run_test($eval, "", "");
695 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
696 run_test($eval, "", "");
699 # See if works on what could be a simple trie.
702 # Keep the alternate | branch the same length as the tested one so
703 # that its length doesn't influence things
704 my $evaled = eval "\"$rhs\""; # Convert e.g. \x{foo} into its
707 $alternate = 'q' x length $evaled;
709 $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|$alternate/i$charset;$upgrade_target$upgrade_pattern \$c $op \$p";
710 run_test($eval, "", "");
712 # Check that works when the folded character follows something that
713 # is quantified. This test knows the regex code internals to the
714 # extent that it knows this is a potential problem, and that there
715 # are three different types of quantifiers generated: 1) The thing
716 # being quantified matches a single character; 2) it matches more
717 # than one character, but is fixed width; 3) it can match a variable
718 # number of characters. (It doesn't know that case 3 shouldn't
719 # matter, since it doesn't do anything special for the character
720 # following the quantifier; nor that some of the different
721 # quantifiers execute the same underlying code, as these tests are
722 # quick, and this insulates these tests from changes in the
724 for my $quantifier ('?', '??', '*', '*?', '+', '+?', '{1,2}', '{1,2}?') {
725 $eval = "my \$c = \"_$lhs\"; my \$p = qr/(?$charset:.$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
726 run_test($eval, "", "");
727 $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset:(?:..)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
728 run_test($eval, "", "");
729 $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset:(?:.|\\R)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
730 run_test($eval, "", "");
733 foreach my $bracketed (0, 1) { # Put rhs in [...], or not
734 next if $bracketed && @pattern != 1; # bracketed makes these
735 # or's instead of a sequence
736 foreach my $optimize_bracketed (0, 1) {
737 next if $optimize_bracketed && ! $bracketed;
738 foreach my $inverted (0,1) {
739 next if $inverted && ! $bracketed; # inversion only valid
741 next if $inverted && @target != 1; # [perl #89750] multi-char
742 # not valid in [^...]
744 # In some cases, add an extra character that doesn't fold, and
745 # looks ok in the output.
746 my $extra_char = "_";
747 foreach my $prepend ("", $extra_char) {
748 foreach my $append ("", $extra_char) {
750 # Assemble the rhs. Put each character in a separate
751 # bracketed if using charclasses. This creates a stress on
752 # the code to span a match across multiple elements
754 foreach my $rhs_char (@rhs) {
755 $rhs .= '[' if $bracketed;
756 $rhs .= '^' if $inverted;
759 # Add a character to the class, so class doesn't get
760 # optimized out, unless we are testing that optimization
761 $rhs .= '_' if $optimize_bracketed;
762 $rhs .= ']' if $bracketed;
765 # Add one of: no capturing parens
768 # Use quantifiers and extra variable width matches inside
769 # them to keep some optimizations from happening
770 foreach my $parend (0, 1, 2) {
771 my $interior = (! $parend)
776 foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
778 # Perhaps should be TODOs, as are unimplemented, but
779 # maybe will never be implemented
780 next if @pattern != 1 && $quantifier;
782 # A ? or * quantifier normally causes the thing to be
783 # able to match a null string
784 my $quantifier_can_match_null = $quantifier eq '?'
785 || $quantifier eq '*';
787 # But since we only quantify the last character in a
788 # multiple fold, the other characters will have width,
789 # except if we are quantifying the whole rhs
790 my $can_match_null = $quantifier_can_match_null
791 && (@rhs == 1 || $parend);
793 foreach my $l_anchor ("", '^') { # '\A' didn't change
795 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't
797 # The folded part can match the null string if it
798 # isn't required to have width, and there's not
799 # something on one or both sides that force it to.
800 my $both_sides = ($l_anchor && $r_anchor)
801 || ($l_anchor && $append)
802 || ($r_anchor && $prepend)
803 || ($prepend && $append);
804 my $must_match = ! $can_match_null || $both_sides;
805 # Could skip these for performance, but doing this missed many failures
806 #next unless $must_match;
807 my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
809 if ($must_match && $should_fail) {
814 $op = ! $op if $must_match && $inverted;
816 if ($inverted && @target > 1) {
817 # When doing an inverted match against a
818 # multi-char target, and there is not something on
819 # the left to anchor the match, if it shouldn't
820 # succeed, skip, as what will happen (when working
821 # correctly) is that it will match the first
822 # position correctly, and then be inverted to not
823 # match; then it will go to the second position
824 # where it won't match, but get inverted to match,
825 # and hence succeeding.
826 next if ! ($l_anchor || $prepend) && ! $op;
828 # Can't ever match for latin1 code points non-uni
829 # semantics that have an inverted multi-char fold
830 # when there is something on both sides and the
831 # quantifier isn't such as to span the required
832 # width, which is 2 or 3.
833 $op = 0 if $ord < 255
836 && ( ! $quantifier || $quantifier eq '?')
839 # Similarly can't ever match when inverting a
840 # multi-char fold for /aa and the quantifier
841 # isn't sufficient to allow it to span to both
843 $op = 0 if $target_has_ascii
846 && ( ! $quantifier || $quantifier eq '?')
850 $op = 0 if $target_has_latin1 && $charset eq 'l'
852 && ( ! $quantifier || $quantifier eq '?')
857 my $desc = "my \$c = \"$prepend$lhs$append\"; "
858 . "my \$p = qr/$quantified/i;"
859 . "$upgrade_target$upgrade_pattern "
860 . "\$c " . ($op ? "=~" : "!~") . " \$p; ";
863 "; uni_semantics=$uni_semantics, "
864 . "should_fail=$should_fail, "
865 . "bracketed=$bracketed, "
866 . "prepend=$prepend, "
869 . "quantifier=$quantifier, "
870 . "l_anchor=$l_anchor, "
871 . "r_anchor=$r_anchor; "
872 . "pattern_above_latin1=$pattern_above_latin1; "
873 . "utf8_pattern=$utf8_pattern"
877 my $c = "$prepend$lhs_str$append";
878 my $p = qr/$quantified/i;
879 utf8::upgrade($c) if length($upgrade_target);
880 utf8::upgrade($p) if length($upgrade_pattern);
881 my $res = $op ? ($c =~ $p): ($c !~ $p);
883 if (!$res || $list_all_tests) {
884 # Failed or debug; output the result
886 ok($res, "test $count - $desc");
888 # Just count the test as passed
903 unless($list_all_tests) {
905 is $okays, $this_iteration, "$okays subtests ok for"
907 . ' target="' . join("", @x_target) . '",'
908 . ' pat="' . join("", @x_pattern) . '"';