t/re/fold_grind.t

   1 # Grind out a lot of combinatoric tests for folding.
   2
   3 binmode STDOUT, ":utf8";
   4
   5 BEGIN {
   6     chdir 't' if -d 't';
   7     @INC = '../lib';
   8     require './test.pl';
   9     require Config; import Config;
  10     skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
  11 }
  12
  13 use charnames ":full";
  14
  15 my $DEBUG = 0;  # Outputs extra information for debugging this .t
  16
  17 use strict;
  18 use warnings;
  19 use Encode;
  20 use POSIX;
  21
  22 # Special-cased characters in the .c's that we want to make sure get tested.
  23 my %be_sure_to_test = (
  24         "\xDF" => 1, # LATIN_SMALL_LETTER_SHARP_S
  25         "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S
  26         "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
  27         "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
  28         "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
  29         "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
  30     );
  31
  32
  33 # Tests both unicode and not, so make sure not implicitly testing unicode
  34 no feature 'unicode_strings';
  35
  36 # Case-insensitive matching is a large and complicated issue.  Perl does not
  37 # implement it fully, properly.  For example, it doesn't include normalization
  38 # as part of the equation.  To test every conceivable combination is clearly
  39 # impossible; these tests are mostly drawn from visual inspection of the code
  40 # and experience, trying to exercise all areas.
  41
  42 # There are three basic ranges of characters that Perl may treat differently:
  43 # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
  44 #    referred to here as ASCII.  On EBCDIC machines, the non-ASCII invariants
  45 #    are all controls that fold to themselves.
  46 my $ASCII = 1;
  47
  48 # 2) Other characters that fit into a byte but are different in utf8 than not;
  49 #    here referred to, taking some liberties, as Latin1.
  50 my $Latin1 = 2;
  51
  52 # 3) Characters that won't fit in a byte; here referred to as Unicode
  53 my $Unicode = 3;
  54
  55 # Within these basic groups are equivalence classes that testing any character
  56 # in is likely to lead to the same results as any other character.  This is
  57 # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
  58 # set.
  59 my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
  60
  61 # Additionally parts of this test run a lot of subtests, outputting the
  62 # resulting TAP can be expensive so the tests are summarised internally. The
  63 # PERL_DEBUG_FULL_TEST environment variable can be set to produce the full
  64 # output for debugging purposes.
  65
  66 sub range_type {
  67     my $ord = ord shift;
  68
  69     return $ASCII if $ord < 128;
  70     return $Latin1 if $ord < 256;
  71     return $Unicode;
  72 }
  73
  74 sub numerically {
  75     return $a <=> $b
  76 }
  77
  78 my $list_all_tests = $ENV{PERL_DEBUG_FULL_TEST} || $DEBUG;
  79 $| = 1 if $list_all_tests;
  80
  81 # Significant time is saved by not outputting each test but grouping the
  82 # output into subtests
  83 my $okays;          # Number of ok's in current subtest
  84 my $this_iteration; # Number of possible tests in current subtest
  85 my $count=0;        # Number of subtests = number of total tests
  86
  87 sub run_test($$$) {
  88     my ($test, $todo, $debug) = @_;
  89
  90     $debug = "" unless $DEBUG;
  91     my $res = eval $test;
  92
  93     if (!$res || $list_all_tests) {
  94       # Failed or debug; output the result
  95       $count++;
  96       ok($res, "$test; $debug");
  97     } else {
  98       # Just count the test as passed
  99       $okays++;
 100     }
 101     $this_iteration++;
 102 }
 103
 104 my %has_test_by_participants;   # Makes sure has tests for each range and each
 105                                 # number of characters that fold to the same
 106                                 # thing
 107 my %has_test_by_byte_count; # Makes sure has tests for each combination of
 108                             # n bytes folds to m bytes
 109
 110 my %tests; # The set of tests.
 111 # Each key is a code point that folds to something else.
 112 # Each value is a list of things that the key folds to.  If the 'thing' is a
 113 # single code point, it is that ordinal.  If it is a multi-char fold, it is an
 114 # ordered list of the code points in that fold.  Here's an example for 'S':
 115 #  '83' => [ 115, 383 ]
 116 #
 117 # And one for a multi-char fold: \xDF
 118 #  223 => [
 119 #            [  # 'ss'
 120 #                83,
 121 #                83
 122 #            ],
 123 #            [  # 'SS'
 124 #                115,
 125 #                115
 126 #            ],
 127 #            [  # LATIN SMALL LETTER LONG S
 128 #                383,
 129 #                383
 130 #            ],
 131 #          7838 # LATIN_CAPITAL_LETTER_SHARP_S
 132 #        ],
 133
 134 my %folds;          # keys are code points that fold;
 135                     # values are each a list of code points the key folds to
 136 my %inverse_folds;  # keys are strings of the folded-to;
 137                     # values are lists of characters that fold to them
 138
 139 sub add_test($@) {
 140     my ($to, @from) = @_;
 141
 142     # Called to cause the input to be tested by adding to %tests.  @from is
 143     # the list of characters that fold to the string $to.  @from should be
 144     # sorted so the lowest code point is first....
 145     # The input is in string form; %tests uses code points, so have to
 146     # convert.
 147
 148     my $to_chars = length $to;
 149     my @test_to;        # List of tests for $to
 150
 151     if ($to_chars == 1) {
 152         @test_to = ord $to;
 153     }
 154     else {
 155         push @test_to, [ map { ord $_ } split "", $to ];
 156
 157         # For multi-char folds, we also test that things that can fold to each
 158         # individual character in the fold also work.  If we were testing
 159         # comprehensively, we would try every combination of upper and lower
 160         # case in the fold, but it will have to suffice to avoid running
 161         # forever to make sure that each thing that folds to these is tested
 162         # at least once.  Because of complement matching ([^...]), we need to
 163         # do both the folded, and the folded-from.
 164         # We first look at each character in the multi-char fold, and save how
 165         # many characters fold to it; and also the maximum number of such
 166         # folds
 167         my @folds_to_count;     # 0th char in fold is index 0 ...
 168         my $max_folds_to = 0;
 169
 170         for (my $i = 0; $i < $to_chars; $i++) {
 171             my $to_char = substr($to, $i, 1);
 172             if (exists $inverse_folds{$to_char}) {
 173                 $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}};
 174                 $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i];
 175             }
 176             else {
 177                 $folds_to_count[$i] = 0;
 178             }
 179         }
 180
 181         # We will need to generate as many tests as the maximum number of
 182         # folds, so that each fold will have at least one test.
 183         # For example, consider character X which folds to the three character
 184         # string 'xyz'.  If 2 things fold to x (X and x), 4 to y (Y, Y'
 185         # (Y-prime), Y'' (Y-prime-prime), and y), and 1 thing to z (itself), 4
 186         # tests will be generated:
 187         #   xyz
 188         #   XYz
 189         #   xY'z
 190         #   xY''z
 191         for (my $i = 0; $i < $max_folds_to; $i++) {
 192             my @this_test_to;   # Assemble a single test
 193
 194             # For each character in the multi-char fold ...
 195             for (my $j = 0; $j < $to_chars; $j++) {
 196                 my $this_char = substr($to, $j, 1);
 197
 198                 # Use its corresponding inverse fold, if available.
 199                 if ($i < $folds_to_count[$j]) {
 200                     push @this_test_to, ord $inverse_folds{$this_char}[$i];
 201                 }
 202                 else {  # Or else itself.
 203                     push @this_test_to, ord $this_char;
 204                 }
 205             }
 206
 207             # Add this test to the list
 208             push @test_to, [ @this_test_to ];
 209         }
 210
 211         # Here, have assembled all the tests for the multi-char fold.  Sort so
 212         # lowest code points are first for consistency and aesthetics in
 213         # output.  We know there are at least two characters in the fold, but
 214         # I haven't bothered to worry about sorting on an optional third
 215         # character if the first two are identical.
 216         @test_to = sort { ($a->[0] == $b->[0])
 217                            ? $a->[1] <=> $b->[1]
 218                            : $a->[0] <=> $b->[0]
 219                         } @test_to;
 220     }
 221
 222
 223     # This test is from n bytes to m bytes.  Record that so won't try to add
 224     # another test that does the same.
 225     use bytes;
 226     my $to_bytes = length $to;
 227     foreach my $from_map (@from) {
 228         $has_test_by_byte_count{length $from_map}{$to_bytes} = $to;
 229     }
 230     no bytes;
 231
 232     my $ord_smallest_from = ord shift @from;
 233     if (exists $tests{$ord_smallest_from}) {
 234         die "There are already tests for $ord_smallest_from"
 235     };
 236
 237     # Add in the fold tests,
 238     push @{$tests{$ord_smallest_from}}, @test_to;
 239
 240     # Then any remaining froms in the equivalence class.
 241     push @{$tests{$ord_smallest_from}}, map { ord $_ } @from;
 242 }
 243
 244 # Get the Unicode rules and construct inverse mappings from them
 245
 246 use Unicode::UCD;
 247 my $file="../lib/unicore/CaseFolding.txt";
 248
 249 # Use the Unicode data file if we are on an ASCII platform (which its data is
 250 # for), and it is in the modern format (starting in Unicode 3.1.0) and it is
 251 # available.  This avoids being affected by potential bugs introduced by other
 252 # layers of Perl
 253 if (ord('A') == 65
 254     && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
 255     && open my $fh, "<", $file)
 256 {
 257     while (<$fh>) {
 258         chomp;
 259
 260         # Lines look like (though without the initial '#')
 261         #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 262
 263         # Get rid of comments, ignore blank or comment-only lines
 264         my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
 265         next unless length $line;
 266         my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line;
 267
 268         next if $fold_type =~ / ^ [IT] $/x; # Perl doesn't do Turkish folding
 269         next if $fold_type eq 'S';  # If Unicode's tables are correct, the F
 270                                     # should be a superset of S
 271
 272         my $from = hex $hex_from;
 273         my @to = map { hex $_ } @hex_folded;
 274         @{$folds{$from}} = @to;
 275         my $folded_str = pack ("U0U*", @to);
 276         push @{$inverse_folds{$folded_str}}, chr $from;
 277     }
 278 }
 279 else {  # Here, can't use the .txt file: read the Unicode rules file and
 280         # construct inverse mappings from it
 281
 282     my ($invlist_ref, $invmap_ref, undef, $default)
 283                                     = Unicode::UCD::prop_invmap('Case_Folding');
 284     for my $i (0 .. @$invlist_ref - 1 - 1) {
 285         next if $invmap_ref->[$i] == $default;
 286
 287         # Make into an array if not so already, so can treat uniformly below
 288         $invmap_ref->[$i] = [ $invmap_ref->[$i] ] if ! ref $invmap_ref->[$i];
 289
 290         # Each subsequent element of the range requires adjustment of +1 from
 291         # the previous element
 292         my $adjust = -1;
 293         for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
 294             $adjust++;
 295             my @to = map { $_ + $adjust } @{$invmap_ref->[$i]};
 296             push @{$folds{$j}}, @to;
 297             my $folded_str = pack "U0U*", @to;
 298             #note (sprintf "%d: %04X: %s", __LINE__, $j, join " ",
 299             #    map { sprintf "%04X", $_  + $adjust } @{$invmap_ref->[$i]});
 300             push @{$inverse_folds{$folded_str}}, chr $j;
 301         }
 302     }
 303 }
 304
 305 # Analyze the data and generate tests to get adequate test coverage.  We sort
 306 # things so that smallest code points are done first.
 307 TO:
 308 foreach my $to (sort { (length $a == length $b)
 309                         ? $a cmp $b
 310                         : length $a <=> length $b
 311                     } keys %inverse_folds)
 312 {
 313
 314     # Within each fold, sort so that the smallest code points are done first
 315     @{$inverse_folds{$to}} = sort { $a cmp $b } @{$inverse_folds{$to}};
 316     my @from = @{$inverse_folds{$to}};
 317
 318     # Just add it to the tests if doing complete coverage
 319     if (! $skip_apparently_redundant) {
 320         add_test($to, @from);
 321         next TO;
 322     }
 323
 324     my $to_chars = length $to;
 325     my $to_range_type = range_type(substr($to, 0, 1));
 326
 327     # If this is required to be tested, do so.  We check for these first, as
 328     # they will take up slots of byte-to-byte combinations that we otherwise
 329     # would have to have other tests to get.
 330     foreach my $from_map (@from) {
 331         if (exists $be_sure_to_test{$from_map}) {
 332             add_test($to, @from);
 333             next TO;
 334         }
 335     }
 336
 337     # If the fold contains heterogeneous range types, is suspect and should be
 338     # tested.
 339     if ($to_chars > 1) {
 340         foreach my $char (split "", $to) {
 341             if (range_type($char) != $to_range_type) {
 342                 add_test($to, @from);
 343                 next TO;
 344             }
 345         }
 346     }
 347
 348     # If the mapping crosses range types, is suspect and should be tested
 349     foreach my $from_map (@from) {
 350         if (range_type($from_map) != $to_range_type) {
 351             add_test($to, @from);
 352             next TO;
 353         }
 354     }
 355
 356     # Here, all components of the mapping are in the same range type.  For
 357     # single character folds, we test one case in each range type that has 2
 358     # particpants, 3 particpants, etc.
 359     if ($to_chars == 1) {
 360         if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) {
 361             add_test($to, @from);
 362             $has_test_by_participants{scalar @from}{$to_range_type} = $to;
 363             next TO;
 364         }
 365     }
 366
 367     # We also test all combinations of mappings from m to n bytes.  This is
 368     # because the regex optimizer cares.  (Don't bother worrying about that
 369     # Latin1 chars will occupy a different number of bytes under utf8, as
 370     # there are plenty of other cases that catch these byte numbers.)
 371     use bytes;
 372     my $to_bytes = length $to;
 373     foreach my $from_map (@from) {
 374         if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) {
 375             add_test($to, @from);
 376             next TO;
 377         }
 378     }
 379 }
 380
 381 # For each range type, test additionally a character that folds to itself
 382 add_test(chr 0x3A, chr 0x3A);
 383 add_test(chr 0xF7, chr 0xF7);
 384 add_test(chr 0x2C7, chr 0x2C7);
 385
 386 # To cut down on the number of tests
 387 my $has_tested_aa_above_latin1;
 388 my $has_tested_latin1_aa;
 389 my $has_tested_ascii_aa;
 390 my $has_tested_l_above_latin1;
 391 my $has_tested_above_latin1_l;
 392 my $has_tested_ascii_l;
 393 my $has_tested_above_latin1_d;
 394 my $has_tested_ascii_d;
 395 my $has_tested_non_latin1_d;
 396 my $has_tested_above_latin1_a;
 397 my $has_tested_ascii_a;
 398 my $has_tested_non_latin1_a;
 399
 400 # For use by pairs() in generating combinations
 401 sub prefix {
 402     my $p = shift;
 403     map [ $p, $_ ], @_
 404 }
 405
 406 # Returns all ordered combinations of pairs of elements from the input array.
 407 # It doesn't return pairs like (a, a), (b, b).  Change the slice to an array
 408 # to do that.  This was just to have fewer tests.
 409 sub pairs (@) {
 410     #print __LINE__, ": ", join(" XXX ", map { sprintf "%04X", $_ } @_), "\n";
 411     map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
 412 }
 413
 414 my @charsets = qw(d u a aa);
 415 if($Config{d_setlocale}) {
 416     my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
 417     if ($current_locale eq 'C') {
 418         require locale; import locale;
 419
 420         # Some implementations don't have the 128-255 range characters all
 421         # mean nothing under the C locale (an example being VMS).  This is
 422         # legal, but since we don't know what the right answers should be,
 423         # skip the locale tests in that situation.
 424         for my $i (128 .. 255) {
 425             my $char = chr($i);
 426             goto untestable_locale if uc($char) ne $char || lc($char) ne $char;
 427         }
 428         push @charsets, 'l';
 429       untestable_locale:
 430     }
 431 }
 432
 433 # Finally ready to do the tests
 434 foreach my $test (sort { numerically } keys %tests) {
 435
 436   my $previous_target;
 437   my $previous_pattern;
 438   my @pairs = pairs(sort numerically $test, @{$tests{$test}});
 439
 440   # Each fold can be viewed as a closure of all the characters that
 441   # participate in it.  Look at each possible pairing from a closure, with the
 442   # first member of the pair the target string to match against, and the
 443   # second member forming the pattern.  Thus each fold member gets tested as
 444   # the string, and the pattern with every other member in the opposite role.
 445   while (my $pair = shift @pairs) {
 446     my ($target, $pattern) = @$pair;
 447
 448     # When testing a char that doesn't fold, we can get the same
 449     # permutation twice; so skip all but the first.
 450     next if $previous_target
 451             && $previous_target == $target
 452             && $previous_pattern == $pattern;
 453     ($previous_target, $previous_pattern) = ($target, $pattern);
 454
 455     # Each side may be either a single char or a string.  Extract each into an
 456     # array (perhaps of length 1)
 457     my @target, my @pattern;
 458     @target = (ref $target) ? @$target : $target;
 459     @pattern = (ref $pattern) ? @$pattern : $pattern;
 460
 461     # We are testing just folds to/from a single character.  If our pairs
 462     # happens to generate multi/multi, skip.
 463     next if @target > 1 && @pattern > 1;
 464
 465     # Have to convert non-utf8 chars to native char set
 466     @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
 467     @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
 468
 469     # Get in hex form.
 470     my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
 471     my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
 472
 473     my $target_above_latin1 = grep { $_ > 255 } @target;
 474     my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
 475     my $target_has_ascii = grep { $_ < 128 } @target;
 476     my $pattern_has_ascii = grep { $_ < 128 } @pattern;
 477     my $target_only_ascii = ! grep { $_ > 127 } @target;
 478     my $pattern_only_ascii = ! grep { $_ > 127 } @pattern;
 479     my $target_has_latin1 = grep { $_ < 256 } @target;
 480     my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target;
 481     my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern;
 482     my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
 483     my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
 484
 485     # We don't test multi-char folding into other multi-chars.  We are testing
 486     # a code point that folds to or from other characters.  Find the single
 487     # code point for diagnostic purposes.  (If both are single, choose the
 488     # target string)
 489     my $ord = @target == 1 ? $target[0] : $pattern[0];
 490     my $progress = sprintf "%04X: \"%s\" and /%s/",
 491                             $test,
 492                             join("", @x_target),
 493                             join("", @x_pattern);
 494     #note $progress;
 495
 496     # Now grind out tests, using various combinations.
 497     foreach my $charset (@charsets) {
 498       $okays = 0;
 499       $this_iteration = 0;
 500
 501       # To cut down somewhat on the enormous quantity of tests this currently
 502       # runs, skip some for some of the character sets whose results aren't
 503       # likely to differ from others.  But run all tests on the code points
 504       # that don't fold, plus one other set in each range group.
 505       if (! $is_self) {
 506
 507         # /aa should only affect things with folds in the ASCII range.  But, try
 508         # it on one set in the other ranges just to make sure it doesn't break
 509         # them.
 510         if ($charset eq 'aa') {
 511
 512           # It may be that this $pair of code points to test are both
 513           # non-ascii, but if either of them actually fold to ascii, that is
 514           # suspect and should be tested.  So for /aa, use whether their folds
 515           # are ascii or not
 516           my $target_has_ascii = $target_has_ascii;
 517           my $pattern_has_ascii = $pattern_has_ascii;
 518           if (! $target_has_ascii) {
 519             foreach my $cp (@target) {
 520               if (exists $folds{$cp}
 521                   && grep { ord_native_to_latin1($_) < 128 } @{$folds{$cp}} )
 522               {
 523                   $target_has_ascii = 1;
 524                   last;
 525               }
 526             }
 527           }
 528           if (! $pattern_has_ascii) {
 529             foreach my $cp (@pattern) {
 530               if (exists $folds{$cp}
 531                   && grep { ord_native_to_latin1($_) < 128 } @{$folds{$cp}} )
 532               {
 533                   $pattern_has_ascii = 1;
 534                   last;
 535               }
 536             }
 537           }
 538
 539           if (! $target_has_ascii && ! $pattern_has_ascii) {
 540             if ($target_above_latin1 || $pattern_above_latin1) {
 541               next if defined $has_tested_aa_above_latin1
 542                       && $has_tested_aa_above_latin1 != $test;
 543               $has_tested_aa_above_latin1 = $test;
 544             }
 545             next if defined $has_tested_latin1_aa
 546                     && $has_tested_latin1_aa != $test;
 547             $has_tested_latin1_aa = $test;
 548           }
 549           elsif ($target_only_ascii && $pattern_only_ascii) {
 550
 551               # And, except for one set just to make sure, skip tests
 552               # where both elements in the pair are ASCII.  If one works for
 553               # aa, the others are likely too.  This skips tests where the
 554               # fold is from non-ASCII to ASCII, but this part of the test
 555               # is just about the ASCII components.
 556               next if defined $has_tested_ascii_l
 557                       && $has_tested_ascii_l != $test;
 558               $has_tested_ascii_l = $test;
 559           }
 560         }
 561         elsif ($charset eq 'l') {
 562
 563           # For l, don't need to test beyond one set those things that are
 564           # all above latin1, because unlikely to have different successes
 565           # than /u.  But, for the same reason as described in the /aa above,
 566           # it is suspect and should be tested, if either of the folds are to
 567           # latin1.
 568           my $target_has_latin1 = $target_has_latin1;
 569           my $pattern_has_latin1 = $pattern_has_latin1;
 570           if (! $target_has_latin1) {
 571             foreach my $cp (@target) {
 572               if (exists $folds{$cp}
 573                   && grep { $_ < 256 } @{$folds{$cp}} )
 574               {
 575                 $target_has_latin1 = 1;
 576                 last;
 577               }
 578             }
 579           }
 580           if (! $pattern_has_latin1) {
 581             foreach my $cp (@pattern) {
 582               if (exists $folds{$cp}
 583                   && grep { $_ < 256 } @{$folds{$cp}} )
 584               {
 585                 $pattern_has_latin1 = 1;
 586                 last;
 587               }
 588             }
 589           }
 590           if (! $target_has_latin1 && ! $pattern_has_latin1) {
 591             next if defined $has_tested_above_latin1_l
 592                     && $has_tested_above_latin1_l != $test;
 593             $has_tested_above_latin1_l = $test;
 594           }
 595           elsif ($target_only_ascii && $pattern_only_ascii) {
 596
 597               # And, except for one set just to make sure, skip tests
 598               # where both elements in the pair are ASCII.  This is
 599               # essentially the same reasoning as above for /aa.
 600               next if defined $has_tested_ascii_l
 601                       && $has_tested_ascii_l != $test;
 602               $has_tested_ascii_l = $test;
 603           }
 604         }
 605         elsif ($charset eq 'd') {
 606           # Similarly for d.  Beyond one test (besides self) each, we  don't
 607           # test pairs that are both ascii; or both above latin1, or are
 608           # combinations of ascii and above latin1.
 609           if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
 610             if ($target_has_ascii && $pattern_has_ascii) {
 611               next if defined $has_tested_ascii_d
 612                       && $has_tested_ascii_d != $test;
 613               $has_tested_ascii_d = $test
 614             }
 615             elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
 616               next if defined $has_tested_above_latin1_d
 617                       && $has_tested_above_latin1_d != $test;
 618               $has_tested_above_latin1_d = $test;
 619             }
 620             else {
 621               next if defined $has_tested_non_latin1_d
 622                       && $has_tested_non_latin1_d != $test;
 623               $has_tested_non_latin1_d = $test;
 624             }
 625           }
 626         }
 627         elsif ($charset eq 'a') {
 628           # Similarly for a.  This should match identically to /u, so wasn't
 629           # tested at all until a bug was found that was thereby missed.
 630           # As a compromise, beyond one test (besides self) each, we  don't
 631           # test pairs that are both ascii; or both above latin1, or are
 632           # combinations of ascii and above latin1.
 633           if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
 634             if ($target_has_ascii && $pattern_has_ascii) {
 635               next if defined $has_tested_ascii_a
 636                       && $has_tested_ascii_a != $test;
 637               $has_tested_ascii_a = $test
 638             }
 639             elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
 640               next if defined $has_tested_above_latin1_a
 641                       && $has_tested_above_latin1_a != $test;
 642               $has_tested_above_latin1_a = $test;
 643             }
 644             else {
 645               next if defined $has_tested_non_latin1_a
 646                       && $has_tested_non_latin1_a != $test;
 647               $has_tested_non_latin1_a = $test;
 648             }
 649           }
 650         }
 651       }
 652
 653       foreach my $utf8_target (0, 1) {    # Both utf8 and not, for
 654                                           # code points < 256
 655         my $upgrade_target = "";
 656
 657         # These must already be in utf8 because the string to match has
 658         # something above latin1.  So impossible to test if to not to be in
 659         # utf8; and otherwise, no upgrade is needed.
 660         next if $target_above_latin1 && ! $utf8_target;
 661         $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
 662
 663         foreach my $utf8_pattern (0, 1) {
 664           next if $pattern_above_latin1 && ! $utf8_pattern;
 665
 666           # Our testing of 'l' uses the POSIX locale, which is ASCII-only
 667           my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
 668           my $upgrade_pattern = "";
 669           $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
 670
 671           my $lhs = join "", @x_target;
 672           my $lhs_str = eval qq{"$lhs"}; fail($@) if $@;
 673           my @rhs = @x_pattern;
 674           my $rhs = join "", @rhs;
 675           my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
 676                             || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
 677                             || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
 678
 679           # Do simple tests of referencing capture buffers, named and
 680           # numbered.
 681           my $op = '=~';
 682           $op = '!~' if $should_fail;
 683
 684           my $todo = 0;  # No longer any todo's
 685           my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 686           run_test($eval, $todo, "");
 687
 688           $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 689           run_test($eval, $todo, "");
 690
 691           if ($lhs ne $rhs) {
 692             $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 693             run_test($eval, "", "");
 694
 695             $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 696             run_test($eval, "", "");
 697           }
 698
 699           # See if works on what could be a simple trie.
 700           my $alternate;
 701           {
 702             # Keep the alternate | branch the same length as the tested one so
 703             # that it's length doesn't influence things
 704             my $evaled = eval "\"$rhs\"";   # Convert e.g. \x{foo} into its
 705                                             # chr equivalent
 706             use bytes;
 707             $alternate = 'q' x length $evaled;
 708           }
 709           $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|$alternate/i$charset;$upgrade_target$upgrade_pattern \$c $op \$p";
 710           run_test($eval, "", "");
 711
 712           # Check that works when the folded character follows something that
 713           # is quantified.  This test knows the regex code internals to the
 714           # extent that it knows this is a potential problem, and that there
 715           # are three different types of quantifiers generated: 1) The thing
 716           # being quantified matches a single character; 2) it matches more
 717           # than one character, but is fixed width; 3) it can match a variable
 718           # number of characters.  (It doesn't know that case 3 shouldn't
 719           # matter, since it doesn't do anything special for the character
 720           # following the quantifier; nor that some of the different
 721           # quantifiers execute the same underlying code, as these tests are
 722           # quick, and this insulates these tests from changes in the
 723           # implementation.)
 724           for my $quantifier ('?', '??', '*', '*?', '+', '+?', '{1,2}', '{1,2}?') {
 725             $eval = "my \$c = \"_$lhs\"; my \$p = qr/(?$charset:.$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 726             run_test($eval, "", "");
 727             $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset:(?:..)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 728             run_test($eval, "", "");
 729             $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset:(?:.|\\R)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 730             run_test($eval, "", "");
 731           }
 732
 733           foreach my $bracketed (0, 1) {   # Put rhs in [...], or not
 734             next if $bracketed && @pattern != 1;    # bracketed makes these
 735                                                     # or's instead of a sequence
 736             foreach my $optimize_bracketed (0, 1) {
 737               next if $optimize_bracketed && ! $bracketed;
 738               foreach my $inverted (0,1) {
 739                   next if $inverted && ! $bracketed;  # inversion only valid
 740                                                       # in [^...]
 741                   next if $inverted && @target != 1;  # [perl #89750] multi-char
 742                                                       # not valid in [^...]
 743
 744                 # In some cases, add an extra character that doesn't fold, and
 745                 # looks ok in the output.
 746                 my $extra_char = "_";
 747                 foreach my $prepend ("", $extra_char) {
 748                   foreach my $append ("", $extra_char) {
 749
 750                     # Assemble the rhs.  Put each character in a separate
 751                     # bracketed if using charclasses.  This creates a stress on
 752                     # the code to span a match across multiple elements
 753                     my $rhs = "";
 754                     foreach my $rhs_char (@rhs) {
 755                         $rhs .= '[' if $bracketed;
 756                         $rhs .= '^' if $inverted;
 757                         $rhs .=  $rhs_char;
 758
 759                         # Add a character to the class, so class doesn't get
 760                         # optimized out, unless we are testing that optimization
 761                         $rhs .= '_' if $optimize_bracketed;
 762                         $rhs .= ']' if $bracketed;
 763                     }
 764
 765                     # Add one of: no capturing parens
 766                     #             a single set
 767                     #             a nested set
 768                     # Use quantifiers and extra variable width matches inside
 769                     # them to keep some optimizations from happening
 770                     foreach my $parend (0, 1, 2) {
 771                       my $interior = (! $parend)
 772                                       ? $rhs
 773                                       : ($parend == 1)
 774                                           ? "(${rhs},?)"
 775                                           : "((${rhs})+,?)";
 776                       foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
 777
 778                         # Perhaps should be TODOs, as are unimplemented, but
 779                         # maybe will never be implemented
 780                         next if @pattern != 1 && $quantifier;
 781
 782                         # A ? or * quantifier normally causes the thing to be
 783                         # able to match a null string
 784                         my $quantifier_can_match_null = $quantifier eq '?'
 785                                                      || $quantifier eq '*';
 786
 787                         # But since we only quantify the last character in a
 788                         # multiple fold, the other characters will have width,
 789                         # except if we are quantifying the whole rhs
 790                         my $can_match_null = $quantifier_can_match_null
 791                                              && (@rhs == 1 || $parend);
 792
 793                         foreach my $l_anchor ("", '^') { # '\A' didn't change
 794                                                          # result)
 795                           foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't
 796                                                            # change result)
 797                             # The folded part can match the null string if it
 798                             # isn't required to have width, and there's not
 799                             # something on one or both sides that force it to.
 800                             my $both_sides = ($l_anchor && $r_anchor)
 801                                               || ($l_anchor && $append)
 802                                               || ($r_anchor && $prepend)
 803                                               || ($prepend && $append);
 804                             my $must_match = ! $can_match_null || $both_sides;
 805                             # for performance, but doing this missed many failures
 806                             #next unless $must_match;
 807                             my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
 808                             my $op;
 809                             if ($must_match && $should_fail)  {
 810                                 $op = 0;
 811                             } else {
 812                                 $op = 1;
 813                             }
 814                             $op = ! $op if $must_match && $inverted;
 815
 816                             if ($inverted && @target > 1) {
 817                               # When doing an inverted match against a
 818                               # multi-char target, and there is not something on
 819                               # the left to anchor the match, if it shouldn't
 820                               # succeed, skip, as what will happen (when working
 821                               # correctly) is that it will match the first
 822                               # position correctly, and then be inverted to not
 823                               # match; then it will go to the second position
 824                               # where it won't match, but get inverted to match,
 825                               # and hence succeeding.
 826                               next if ! ($l_anchor || $prepend) && ! $op;
 827
 828                               # Can't ever match for latin1 code points non-uni
 829                               # semantics that have a inverted multi-char fold
 830                               # when there is something on both sides and the
 831                               # quantifier isn't such as to span the required
 832                               # width, which is 2 or 3.
 833                               $op = 0 if $ord < 255
 834                                         && ! $uni_semantics
 835                                         && $both_sides
 836                                         && ( ! $quantifier || $quantifier eq '?')
 837                                         && $parend < 2;
 838
 839                               # Similarly can't ever match when inverting a
 840                               # multi-char fold for /aa and the quantifier
 841                               # isn't sufficient to allow it to span to both
 842                               # sides.
 843                               $op = 0 if $target_has_ascii
 844                                          && $charset eq 'aa'
 845                                          && $both_sides
 846                                          && ( ! $quantifier || $quantifier eq '?')
 847                                          && $parend < 2;
 848
 849                               # Or for /l
 850                               $op = 0 if $target_has_latin1 && $charset eq 'l'
 851                                       && $both_sides
 852                                       && ( ! $quantifier || $quantifier eq '?')
 853                                       && $parend < 2;
 854                             }
 855
 856
 857                             my $desc = "my \$c = \"$prepend$lhs$append\"; "
 858                                     . "my \$p = qr/$quantified/i;"
 859                                     . "$upgrade_target$upgrade_pattern "
 860                                     . "\$c " . ($op ? "=~" : "!~") . " \$p; ";
 861                             if ($DEBUG) {
 862                               $desc .= (
 863                               "; uni_semantics=$uni_semantics, "
 864                               . "should_fail=$should_fail, "
 865                               . "bracketed=$bracketed, "
 866                               . "prepend=$prepend, "
 867                               . "append=$append, "
 868                               . "parend=$parend, "
 869                               . "quantifier=$quantifier, "
 870                               . "l_anchor=$l_anchor, "
 871                               . "r_anchor=$r_anchor; "
 872                               . "pattern_above_latin1=$pattern_above_latin1; "
 873                               . "utf8_pattern=$utf8_pattern"
 874                               );
 875                             }
 876
 877                             my $c = "$prepend$lhs_str$append";
 878                             my $p = qr/$quantified/i;
 879                             utf8::upgrade($c) if length($upgrade_target);
 880                             utf8::upgrade($p) if length($upgrade_pattern);
 881                             my $res = $op ? ($c =~ $p): ($c !~ $p);
 882
 883                             if (!$res || $list_all_tests) {
 884                               # Failed or debug; output the result
 885                               $count++;
 886                               ok($res, "test $count - $desc");
 887                             } else {
 888                               # Just count the test as passed
 889                               $okays++;
 890                             }
 891                             $this_iteration++;
 892                           }
 893                         }
 894                       }
 895                     }
 896                   }
 897                 }
 898               }
 899             }
 900           }
 901         }
 902       }
 903       unless($list_all_tests) {
 904         $count++;
 905         is $okays, $this_iteration, "$okays subtests ok for"
 906           . " /$charset,"
 907           . ' target="' . join("", @x_target) . '",'
 908           . ' pat="' . join("", @x_pattern) . '"';
 909       }
 910     }
 911   }
 912 }
 913
 914 plan($count);
 915
 916 1