t/re/fold_grind.t

   1 # Grind out a lot of combinatoric tests for folding.
   2
   3 binmode STDOUT, ":utf8";
   4
   5 BEGIN {
   6     chdir 't' if -d 't';
   7     @INC = '../lib';
   8     require './test.pl';
   9     skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
  10 }
  11
  12 use charnames ":full";
  13
  14 my $DEBUG = 0;  # Outputs extra information for debugging this .t
  15
  16 use strict;
  17 use warnings;
  18 use Encode;
  19 use POSIX;
  20
  21 # Characters special-cased in the .c's; these are always given tests.
  22 my %be_sure_to_test = map { $_ => 1 } (
  23         "\xDF",      # LATIN SMALL LETTER SHARP S
  24         "\x{1E9E}",  # LATIN CAPITAL LETTER SHARP S
  25         "\x{390}",   # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
  26         "\x{3B0}",   # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
  27         "\x{1FD3}",  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
  28         "\x{1FE3}",  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
  29     );
  30
  31
  32 # Tests both unicode and not, so make sure not implicitly testing unicode
  33 no feature 'unicode_strings';
  34
  35 # Case-insensitive matching is a large and complicated issue.  Perl does not
  36 # implement it fully, properly.  For example, it doesn't include normalization
  37 # as part of the equation.  To test every conceivable combination is clearly
  38 # impossible; these tests are mostly drawn from visual inspection of the code
  39 # and experience, trying to exercise all areas.
  40
  41 # There are three basic ranges of characters that Perl may treat differently:
  42 # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
  43 #    referred to here as ASCII.  On EBCDIC machines, the non-ASCII invariants
  44 #    are all controls that fold to themselves.
  45 my $ASCII = 1;      # range_type() returns this for ordinals below 128
  46
  47 # 2) Other characters that fit into a byte but are different in utf8 than not;
  48 #    here referred to, taking some liberties, as Latin1.
  49 my $Latin1 = 2;     # range_type() returns this for ordinals 128 through 255
  50
  51 # 3) Characters that won't fit in a byte; here referred to as Unicode
  52 my $Unicode = 3;    # range_type() returns this for ordinals 256 and above
  53
  54 # Within these basic groups are equivalence classes in which testing any one
  55 # character is likely to lead to the same results as any other.  This is
  56 # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
  57 # set (to a true value), in which case every fold is tested.
  58 my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
  59
  60 # Additionally, parts of this test run a lot of subtests.  Outputting the
  61 # resulting TAP can be expensive, so the tests are summarised internally.  The
  62 # PERL_DEBUG_FULL_TEST environment variable can be set to produce the full
  63 # output for debugging purposes.
  64
  65 sub range_type {
  66     # Classify a character by its ordinal: $ASCII, $Latin1 or $Unicode.
  67     my ($char) = @_;
  68     my $code_point = ord $char;
  69     return $code_point < 128 ? $ASCII
  70          : $code_point < 256 ? $Latin1 : $Unicode;
  71 }
  72
  73 sub numerically {
  74     return ($a <=> $b);   # comparator for sort(): ascending numeric order
  75 }
  76
  77 # Significant time is saved by not outputting each test but grouping the
  78 # output into subtests.
  79 my $okays;          # Number of quietly passed checks in the current subtest
  80 my $this_iteration; # Number of checks attempted in the current subtest
  81 my $count=0;        # Number of TAP tests actually emitted; given to plan()
  82
  83 sub run_test($$$) {
  84     # Evaluates the string $test as Perl code.  A false result, or any result
  85     # when PERL_DEBUG_FULL_TEST is set, is reported through ok(); otherwise
  86     # the pass is merely tallied in $okays.  $todo is accepted but unused.
  87     my ($test, $todo, $debug) = @_;
  88     $debug = "" if ! $DEBUG;
  89     my $passed = eval $test;
  90     if ($passed && ! $ENV{PERL_DEBUG_FULL_TEST}) {
  91         $okays++;
  92     }
  93     else {
  94         $count++;
  95         ok($passed, "$test; $debug");
  96     }
  97     $this_iteration++;
  98 }
  99
 100 my %has_test_by_participants;   # Ensures there is a test for each range type
 101                                 # and each number of characters that fold to
 102                                 # the same thing: {count}{range type} => $to
 103 my %has_test_by_byte_count; # Ensures there is a test for each combination of
 104                             # n bytes folding to m bytes: {n}{m} => $to
 105
 106 my %tests; # The set of tests.
 107 # Each key is a code point that folds to something else.
 108 # Each value is a list of things that the key folds to.  If the 'thing' is a
 109 # single code point, it is that ordinal.  If it is a multi-char fold, it is an
 110 # ordered list of the code points in that fold.  Here's an example for 'S':
 111 #  '83' => [ 115, 383 ]
 112 #
 113 # And one for a multi-char fold: \xDF
 114 #  223 => [
 115 #            [  # 'SS'
 116 #                83,
 117 #                83
 118 #            ],
 119 #            [  # 'ss'
 120 #                115,
 121 #                115
 122 #            ],
 123 #            [  # LATIN SMALL LETTER LONG S
 124 #                383,
 125 #                383
 126 #            ],
 127 #          7838 # LATIN_CAPITAL_LETTER_SHARP_S
 128 #        ],
 129
 130 my %inverse_folds;  # keys are the folded-to strings (one or more characters);
 131                     # values are lists of characters that fold to them
 132
 133 sub add_test($@) {
 134     my ($to, @from) = @_;
 135
 136     # Called to cause the input to be tested by adding to %tests.  @from is
 137     # the list of characters that fold to the string $to.  @from should be
 138     # sorted so the lowest code point is first, as that one becomes the key.
 139     # The input is in string form; %tests uses code points, so have to
 140     # convert.
 141     # Dies if %tests already has an entry for that lowest code point.
 142     my $to_chars = length $to;
 143     my @test_to;        # List of tests for $to
 144
 145     if ($to_chars == 1) {
 146         @test_to = ord $to;
 147     }
 148     else {
 149         push @test_to, [ map { ord $_ } split "", $to ];
 150
 151         # For multi-char folds, we also test that things that can fold to each
 152         # individual character in the fold also work.  If we were testing
 153         # comprehensively, we would try every combination of upper and lower
 154         # case in the fold, but it will have to suffice to avoid running
 155         # forever to make sure that each thing that folds to these is tested
 156         # at least once.  Because of complement matching ([^...]), we need to
 157         # do both the folded, and the folded-from.
 158         # We first look at each character in the multi-char fold, and save how
 159         # many characters fold to it; and also the maximum number of such
 160         # folds
 161         my @folds_to_count;     # 0th char in fold is index 0 ...
 162         my $max_folds_to = 0;
 163
 164         for (my $i = 0; $i < $to_chars; $i++) {
 165             my $to_char = substr($to, $i, 1);
 166             if (exists $inverse_folds{$to_char}) {
 167                 $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}};
 168                 $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i];
 169             }
 170             else {
 171                 $folds_to_count[$i] = 0;
 172             }
 173         }
 174
 175         # We will need to generate as many tests as the maximum number of
 176         # folds, so that each fold will have at least one test.
 177         # For example, consider character X which folds to the three character
 178         # string 'xyz'.  If 2 things fold to x (X and x), 4 to y (Y, Y'
 179         # (Y-prime), Y'' (Y-prime-prime), and y), and 1 thing to z (itself), 4
 180         # tests will be generated:
 181         #   xyz
 182         #   XYz
 183         #   xY'z
 184         #   xY''z
 185         for (my $i = 0; $i < $max_folds_to; $i++) {
 186             my @this_test_to;   # Assemble a single test
 187
 188             # For each character in the multi-char fold ...
 189             for (my $j = 0; $j < $to_chars; $j++) {
 190                 my $this_char = substr($to, $j, 1);
 191
 192                 # Use its corresponding inverse fold, if available.
 193                 if ($i < $folds_to_count[$j]) {
 194                     push @this_test_to, ord $inverse_folds{$this_char}[$i];
 195                 }
 196                 else {  # Or else itself.
 197                     push @this_test_to, ord $this_char;
 198                 }
 199             }
 200
 201             # Add this test to the list
 202             push @test_to, [ @this_test_to ];
 203         }
 204
 205         # Here, have assembled all the tests for the multi-char fold.  Sort so
 206         # lowest code points are first for consistency and aesthetics in
 207         # output.  We know there are at least two characters in the fold, but
 208         # I haven't bothered to worry about sorting on an optional third
 209         # character if the first two are identical.
 210         @test_to = sort { ($a->[0] == $b->[0])
 211                            ? $a->[1] <=> $b->[1]
 212                            : $a->[0] <=> $b->[0]
 213                         } @test_to;
 214     }
 215
 216
 217     # This test is from n bytes to m bytes.  Record that so won't try to add
 218     # another test that does the same.
 219     use bytes;      # length() counts bytes until the 'no bytes' below
 220     my $to_bytes = length $to;
 221     foreach my $from_map (@from) {
 222         $has_test_by_byte_count{length $from_map}{$to_bytes} = $to;
 223     }
 224     no bytes;
 225
 226     my $ord_smallest_from = ord shift @from;    # removes it from @from
 227     if (exists $tests{$ord_smallest_from}) {
 228         die "There are already tests for $ord_smallest_from"
 229     };
 230
 231     # Add in the fold tests,
 232     push @{$tests{$ord_smallest_from}}, @test_to;
 233
 234     # Then any remaining froms in the equivalence class.
 235     push @{$tests{$ord_smallest_from}}, map { ord $_ } @from;
 236 }
 237
 238 # Read the Unicode rules file and construct inverse mappings from it
 239
 240 my $file="../lib/unicore/CaseFolding.txt";
 241 open my $fh, "<", $file or die "Failed to read '$file': $!";
 242
 243 while (<$fh>) {
 244     chomp;
 245
 246     # Lines look like (though without the initial '#')
 247     #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 248     # split() returns no fields for an empty line, leaving $line undefined.
 249     my ($line, $comment) = split / \s+ \# \s+ /x, $_;
 250     next if ! defined $line || $line !~ /\S/ || $line =~ /^#/;
 251     my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line;
 252
 253     next if $fold_type eq 'T';  # Perl doesn't do Turkish folding
 254     next if $fold_type eq 'S';  # If Unicode's tables are correct, the F
 255                                 # should be a superset of S
 256
 257     my $folded_str = pack ("U0U*", map { hex $_ } @hex_folded);
 258     push @{$inverse_folds{$folded_str}}, chr hex $hex_from;
 259 }
 260 close $fh;
 261 # Analyze the data and generate tests to get adequate test coverage.  We sort
 262 # things so that smallest code points are done first.
 263 TO:
 264 foreach my $to (sort { (length $a == length $b)
 265                         ? $a cmp $b
 266                         : length $a <=> length $b
 267                     } keys %inverse_folds) {
 268
 269     # Within each fold, sort so that the smallest code points are done first
 270     @{$inverse_folds{$to}} = sort { $a cmp $b } @{$inverse_folds{$to}};
 271     my @from = @{$inverse_folds{$to}};
 272
 273     # Just add it to the tests if doing complete coverage
 274     if (! $skip_apparently_redundant) {
 275         add_test($to, @from);
 276         next TO;
 277     }
 278
 279     my $to_chars = length $to;
 280     my $to_range_type = range_type(substr($to, 0, 1));
 281
 282     # If this is required to be tested, do so.  We check for these first, as
 283     # they will take up slots of byte-to-byte combinations that we otherwise
 284     # would have to have other tests to get.
 285     foreach my $from_map (@from) {
 286         if (exists $be_sure_to_test{$from_map}) {
 287             add_test($to, @from);
 288             next TO;
 289         }
 290     }
 291
 292     # If the fold contains heterogeneous range types, is suspect and should be
 293     # tested.
 294     if ($to_chars > 1) {
 295         foreach my $char (split "", $to) {
 296             if (range_type($char) != $to_range_type) {
 297                 add_test($to, @from);
 298                 next TO;
 299             }
 300         }
 301     }
 302
 303     # If the mapping crosses range types, is suspect and should be tested
 304     foreach my $from_map (@from) {
 305         if (range_type($from_map) != $to_range_type) {
 306             add_test($to, @from);
 307             next TO;
 308         }
 309     }
 310
 311     # Here, all components of the mapping are in the same range type.  For
 312     # single character folds, we test one case in each range type that has 2
 313     # participants, 3 participants, etc.
 314     if ($to_chars == 1) {
 315         if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) {
 316             add_test($to, @from);
 317             $has_test_by_participants{scalar @from}{$to_range_type} = $to;
 318             next TO;
 319         }
 320     }
 321
 322     # We also test all combinations of mappings from m to n bytes.  This is
 323     # because the regex optimizer cares.  (Don't bother worrying about that
 324     # Latin1 chars will occupy a different number of bytes under utf8, as
 325     # there are plenty of other cases that catch these byte numbers.)
 326     use bytes;      # length() counts bytes for the rest of this loop body
 327     my $to_bytes = length $to;
 328     foreach my $from_map (@from) {
 329         if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) {
 330             add_test($to, @from);
 331             next TO;
 332         }
 333     }
 334 }
 335
 336 # For each range type, test additionally a character that folds to itself
 337 add_test(chr 0x3A, chr 0x3A);       # below 128: the ASCII range
 338 add_test(chr 0xF7, chr 0xF7);       # 128..255: the Latin1 range
 339 add_test(chr 0x2C7, chr 0x2C7);     # above 255: the Unicode range
 340 # To cut down on the number of tests: each of these records the key of %tests
 341 # (the fold set) that gets tested in full for one charset/range combination.
 342 my $has_tested_aa_above_latin1;
 343 my $has_tested_latin1_aa;
 344 my $has_tested_ascii_aa;
 345 my $has_tested_l_above_latin1;
 346 my $has_tested_above_latin1_l;
 347 my $has_tested_ascii_l;
 348 my $has_tested_above_latin1_d;
 349 my $has_tested_ascii_d;
 350 my $has_tested_non_latin1_d;
 351 my $has_tested_above_latin1_a;
 352 my $has_tested_ascii_a;
 353 my $has_tested_non_latin1_a;
 354
 355 # Helper for pairs(): couples its first argument with each remaining one.
 356 sub prefix {
 357     my ($first, @others) = @_;
 358     return map { [ $first, $_ ] } @others;
 359 }
 360
 361 # Returns every ordered pair [x, y] of elements taken from two different
 362 # positions of the input list.  A position is never paired with itself, so
 363 # there is no (a, a) or (b, b); this was just to have fewer tests.
 364 sub pairs (@) {
 365     my @all = @_;
 366     return map { my $i = $_; prefix($all[$i], @all[0 .. $i-1, $i+1 .. $#all]) } 0 .. $#all;
 367 }
 368
 369 my @charsets = qw(d u a aa);
 370 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
 371 if ($current_locale eq 'C') {
 372     use locale;
 373
 374     # In some implementations the locale gives case changes to characters in
 375     # the range 128-255.  Skip the locale tests (/l) in that situation.
 376     my @cased = grep {
 377         my $char = chr($_);
 378         uc($char) ne $char || lc($char) ne $char
 379     } 128 .. 255;
 380     push @charsets, 'l' if ! @cased;
 381     # Otherwise 'l' stays out of @charsets and is not exercised below.
 382 }
 383
 384 # Finally ready to do the tests
 385 foreach my $test (sort { numerically } keys %tests) {
 386   # $test is the lowest code point of a fold set; $tests{$test} has the rest
 387   my $previous_target;
 388   my $previous_pattern;
 389   my @pairs = pairs(sort numerically $test, @{$tests{$test}});
 390
 391   # Each fold can be viewed as a closure of all the characters that
 392   # participate in it.  Look at each possible pairing from a closure, with the
 393   # first member of the pair the target string to match against, and the
 394   # second member forming the pattern.  Thus each fold member gets tested as
 395   # the string, and the pattern with every other member in the opposite role.
 396   while (my $pair = shift @pairs) {
 397     my ($target, $pattern) = @$pair;
 398
 399     # When testing a char that doesn't fold, we can get the same
 400     # permutation twice; so skip all but the first.
 401     next if $previous_target
 402             && $previous_target == $target
 403             && $previous_pattern == $pattern;
 404     ($previous_target, $previous_pattern) = ($target, $pattern);
 405
 406     # Each side may be either a single char or a string.  Extract each into an
 407     # array (perhaps of length 1)
 408     my @target, my @pattern;
 409     @target = (ref $target) ? @$target : $target;
 410     @pattern = (ref $pattern) ? @$pattern : $pattern;
 411
 412     # We are testing just folds to/from a single character.  If our pairs
 413     # happens to generate multi/multi, skip.
 414     next if @target > 1 && @pattern > 1;
 415
 416     # Have to convert non-utf8 chars to native char set
 417     @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
 418     @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
 419
 420     # Get in hex form.
 421     my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
 422     my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
 423
 424     my $target_above_latin1 = grep { $_ > 255 } @target;
 425     my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
 426     my $target_has_ascii = grep { $_ < 128 } @target;
 427     my $pattern_has_ascii = grep { $_ < 128 } @pattern;
 428     my $target_only_ascii = ! grep { $_ > 127 } @target;
 429     my $pattern_only_ascii = ! grep { $_ > 127 } @pattern;
 430     my $target_has_latin1 = grep { $_ < 256 } @target;
 431     my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target;
 432     my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern;
 433     my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
 434     my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
 435
 436     # We don't test multi-char folding into other multi-chars.  We are testing
 437     # a code point that folds to or from other characters.  Find the single
 438     # code point for diagnostic purposes.  (If both are single, choose the
 439     # target string)
 440     my $ord = @target == 1 ? $target[0] : $pattern[0];
 441     my $progress = sprintf "%04X: \"%s\" and /%s/",
 442                             $test,
 443                             join("", @x_target),
 444                             join("", @x_pattern);
 445     #note $progress;
 446
 447     # Now grind out tests, using various combinations.
 448     foreach my $charset (@charsets) {
 449       $okays = 0;
 450       $this_iteration = 0;
 451
 452       # To cut down somewhat on the enormous quantity of tests this currently
 453       # runs, skip some for some of the character sets whose results aren't
 454       # likely to differ from others.  But run all tests on the code points
 455       # that don't fold, plus one other set in each range group.
 456       if (! $is_self) {
 457
 458         # /aa should only affect things with folds in the ASCII range.  But, try
 459         # it on one set in the other ranges just to make sure it doesn't break
 460         # them.
 461         if ($charset eq 'aa') {
 462           if (! $target_has_ascii && ! $pattern_has_ascii) {
 463             if ($target_above_latin1 || $pattern_above_latin1) {
 464               next if defined $has_tested_aa_above_latin1
 465                       && $has_tested_aa_above_latin1 != $test;
 466               $has_tested_aa_above_latin1 = $test;
 467             }
 468             next if defined $has_tested_latin1_aa
 469                     && $has_tested_latin1_aa != $test;
 470             $has_tested_latin1_aa = $test;
 471           }
 472           elsif ($target_only_ascii && $pattern_only_ascii) {
 473
 474               # And, except for one set just to make sure, skip tests
 475               # where both elements in the pair are ASCII.  If one works for
 476               # aa, the others are likely too.  This skips tests where the
 477               # fold is from non-ASCII to ASCII, but this part of the test
 478               # is just about the ASCII components.
 479               next if defined $has_tested_ascii_aa
 480                       && $has_tested_ascii_aa != $test;
 481               $has_tested_ascii_aa = $test;
 482           }
 483         }
 484         elsif ($charset eq 'l') {
 485
 486           # For l, don't need to test beyond one set those things that are
 487           # all above latin1, because unlikely to have different successes
 488           # than /u
 489           if (! $target_has_latin1 && ! $pattern_has_latin1) {
 490             next if defined $has_tested_above_latin1_l
 491                     && $has_tested_above_latin1_l != $test;
 492             $has_tested_above_latin1_l = $test;
 493           }
 494           elsif ($target_only_ascii && $pattern_only_ascii) {
 495
 496               # And, except for one set just to make sure, skip tests
 497               # where both elements in the pair are ASCII.  This is
 498               # essentially the same reasoning as above for /aa.
 499               next if defined $has_tested_ascii_l
 500                       && $has_tested_ascii_l != $test;
 501               $has_tested_ascii_l = $test;
 502           }
 503         }
 504         elsif ($charset eq 'd') {
 505           # Similarly for d.  Beyond one test (besides self) each, we  don't
 506           # test pairs that are both ascii; or both above latin1, or are
 507           # combinations of ascii and above latin1.
 508           if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
 509             if ($target_has_ascii && $pattern_has_ascii) {
 510               next if defined $has_tested_ascii_d
 511                       && $has_tested_ascii_d != $test;
 512               $has_tested_ascii_d = $test
 513             }
 514             elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
 515               next if defined $has_tested_above_latin1_d
 516                       && $has_tested_above_latin1_d != $test;
 517               $has_tested_above_latin1_d = $test;
 518             }
 519             else {
 520               next if defined $has_tested_non_latin1_d
 521                       && $has_tested_non_latin1_d != $test;
 522               $has_tested_non_latin1_d = $test;
 523             }
 524           }
 525         }
 526         elsif ($charset eq 'a') {
 527           # Similarly for a.  This should match identically to /u, so wasn't
 528           # tested at all until a bug was found that was thereby missed.
 529           # As a compromise, beyond one test (besides self) each, we  don't
 530           # test pairs that are both ascii; or both above latin1, or are
 531           # combinations of ascii and above latin1.
 532           if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
 533             if ($target_has_ascii && $pattern_has_ascii) {
 534               next if defined $has_tested_ascii_a
 535                       && $has_tested_ascii_a != $test;
 536               $has_tested_ascii_a = $test
 537             }
 538             elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
 539               next if defined $has_tested_above_latin1_a
 540                       && $has_tested_above_latin1_a != $test;
 541               $has_tested_above_latin1_a = $test;
 542             }
 543             else {
 544               next if defined $has_tested_non_latin1_a
 545                       && $has_tested_non_latin1_a != $test;
 546               $has_tested_non_latin1_a = $test;
 547             }
 548           }
 549         }
 550       }
 551
 552       foreach my $utf8_target (0, 1) {    # Both utf8 and not, for
 553                                           # code points < 256
 554         my $upgrade_target = "";
 555
 556         # A target with something above latin1 must already be in utf8, so
 557         # it is impossible to test it as non-utf8, and no upgrade is needed.
 558         # Any other target is upgraded when the utf8 form is being tested.
 559         next if $target_above_latin1 && ! $utf8_target;
 560         $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
 561
 562         foreach my $utf8_pattern (0, 1) {
 563           next if $pattern_above_latin1 && ! $utf8_pattern;
 564
 565           # Our testing of 'l' uses the POSIX locale, which is ASCII-only
 566           my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
 567           my $upgrade_pattern = "";
 568           $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
 569
 570           my $lhs = join "", @x_target;
 571           my $lhs_str = eval qq{"$lhs"}; fail($@) if $@;
 572           my @rhs = @x_pattern;
 573           my $rhs = join "", @rhs;
 574           my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
 575                             || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
 576                             || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
 577
 578           # Do simple tests of referencing capture buffers, named and
 579           # numbered.
 580           my $op = '=~';
 581           $op = '!~' if $should_fail;
 582
 583           my $todo = 0;  # No longer any todo's
 584           my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 585           run_test($eval, $todo, "");
 586
 587           $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 588           run_test($eval, $todo, "");
 589
 590           if ($lhs ne $rhs) {
 591             $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 592             run_test($eval, "", "");
 593
 594             $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 595             run_test($eval, "", "");
 596           }
 597
 598           # See if works on what could be a simple trie.
 599           $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|xyz/i$charset;$upgrade_target$upgrade_pattern \$c $op \$p";
 600           run_test($eval, "", "");
 601
 602           foreach my $bracketed (0, 1) {   # Put rhs in [...], or not
 603             next if $bracketed && @pattern != 1;    # bracketed makes these
 604                                                     # or's instead of a sequence
 605             foreach my $inverted (0,1) {
 606                 next if $inverted && ! $bracketed;  # inversion only valid in [^...]
 607                 next if $inverted && @target != 1;  # [perl #89750] multi-char
 608                                                     # not valid in [^...]
 609
 610               # In some cases, add an extra character that doesn't fold, and
 611               # looks ok in the output.
 612               my $extra_char = "_";
 613               foreach my $prepend ("", $extra_char) {
 614                 foreach my $append ("", $extra_char) {
 615
 616                   # Assemble the rhs.  Put each character in a separate
 617                   # bracketed if using charclasses.  This creates a stress on
 618                   # the code to span a match across multiple elements
 619                   my $rhs = "";
 620                   foreach my $rhs_char (@rhs) {
 621                       $rhs .= '[' if $bracketed;
 622                       $rhs .= '^' if $inverted;
 623                       $rhs .=  $rhs_char;
 624
 625                       # Add a character to the class, so class doesn't get
 626                       # optimized out
 627                       $rhs .= '_]' if $bracketed;
 628                   }
 629
 630                   # Add one of: no capturing parens
 631                   #             a single set
 632                   #             a nested set
 633                   # Use quantifiers and extra variable width matches inside
 634                   # them to keep some optimizations from happening
 635                   foreach my $parend (0, 1, 2) {
 636                     my $interior = (! $parend)
 637                                     ? $rhs
 638                                     : ($parend == 1)
 639                                         ? "(${rhs},?)"
 640                                         : "((${rhs})+,?)";
 641                     foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
 642
 643                       # Perhaps should be TODOs, as are unimplemented, but
 644                       # maybe will never be implemented
 645                       next if @pattern != 1 && $quantifier;
 646
 647                       # A ? or * quantifier normally causes the thing to be
 648                       # able to match a null string
 649                       my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';
 650
 651                       # But since we only quantify the last character in a
 652                       # multiple fold, the other characters will have width,
 653                       # except if we are quantifying the whole rhs
 654                       my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);
 655
 656                       foreach my $l_anchor ("", '^') { # '\A' didn't change result)
 657                         foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)
 658
 659                           # The folded part can match the null string if it
 660                           # isn't required to have width, and there's not
 661                           # something on one or both sides that force it to.
 662                           my $both_sides = ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
 663                           my $must_match = ! $can_match_null || $both_sides;
 664                           # for performance, but doing this missed many failures
 665                           #next unless $must_match;
 666                           my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
 667                           my $op;
 668                           if ($must_match && $should_fail)  {
 669                               $op = 0;
 670                           } else {
 671                               $op = 1;
 672                           }
 673                           $op = ! $op if $must_match && $inverted;
 674
 675                           if ($inverted && @target > 1) {  # NOTE(review): looks unreachable; inverted multi-char targets are skipped above
 676                             # When doing an inverted match against a
 677                             # multi-char target, and there is not something on
 678                             # the left to anchor the match, if it shouldn't
 679                             # succeed, skip, as what will happen (when working
 680                             # correctly) is that it will match the first
 681                             # position correctly, and then be inverted to not
 682                             # match; then it will go to the second position
 683                             # where it won't match, but get inverted to match,
 684                             # and hence succeeding.
 685                             next if ! ($l_anchor || $prepend) && ! $op;
 686
 687                             # Can't ever match for latin1 code points non-uni
 688                             # semantics that have a inverted multi-char fold
 689                             # when there is something on both sides and the
 690                             # quantifier isn't such as to span the required
 691                             # width, which is 2 or 3.
 692                             $op = 0 if $ord < 255
 693                                        && ! $uni_semantics
 694                                        && $both_sides
 695                                        && ( ! $quantifier || $quantifier eq '?')
 696                                        && $parend < 2;
 697
 698                             # Similarly can't ever match when inverting a multi-char
 699                             # fold for /aa and the quantifier isn't sufficient
 700                             # to allow it to span to both sides.
 701                             $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
 702
 703                             # Or for /l
 704                             $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
 705                           }
 706
 707
 708                           my $desc = "my \$c = \"$prepend$lhs$append\"; "
 709                                    . "my \$p = qr/$quantified/i;"
 710                                    . "$upgrade_target$upgrade_pattern "
 711                                    . "\$c " . ($op ? "=~" : "!~") . " \$p; ";
 712                           if ($DEBUG) {
 713                             $desc .= (
 714                              "; uni_semantics=$uni_semantics, "
 715                              . "should_fail=$should_fail, "
 716                              . "bracketed=$bracketed, "
 717                              . "prepend=$prepend, "
 718                              . "append=$append, "
 719                              . "parend=$parend, "
 720                              . "quantifier=$quantifier, "
 721                              . "l_anchor=$l_anchor, "
 722                              . "r_anchor=$r_anchor; "
 723                              . "pattern_above_latin1=$pattern_above_latin1; "
 724                              . "utf8_pattern=$utf8_pattern"
 725                             );
 726                           }
 727
 728                           my $c = "$prepend$lhs_str$append";
 729                           my $p = qr/$quantified/i;
 730                           utf8::upgrade($c) if length($upgrade_target);
 731                           utf8::upgrade($p) if length($upgrade_pattern);
 732                           my $res = $op ? ($c =~ $p): ($c !~ $p);
 733
 734                           if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) {
 735                             # Failed or debug; output the result
 736                             $count++;
 737                             ok($res, $desc);
 738                           } else {
 739                             # Just count the test as passed
 740                             $okays++;
 741                           }
 742                           $this_iteration++;
 743                         }
 744                       }
 745                     }
 746                   }
 747                 }
 748               }
 749             }
 750           }
 751         }
 752       }
 753       unless($ENV{PERL_DEBUG_FULL_TEST}) {
 754         $count++;
 755         is $okays, $this_iteration, "$okays subtests ok for"
 756           . " /$charset,"
 757           . ' target="' . join("", @x_target) . '",'
 758           . ' pat="' . join("", @x_pattern) . '"';
 759       }
 760     }
 761   }
 762 }
 763
 764 plan($count);   # planned after the fact: $count is only known once all tests have run
 765
 766 1