t/re/fold_grind.t

   1 # Grind out a lot of combinatoric tests for folding.
   2
   3 use charnames ":full";
   4
   5 binmode STDOUT, ":utf8";
   6
   7 BEGIN {
   8     chdir 't' if -d 't';
   9     @INC = '../lib';
  10     require './test.pl';
  11 }
  12
  13 my $DEBUG = 0;  # Outputs extra information for debugging this .t
  14
  15 use strict;
  16 use warnings;
  17 use Encode;
  18
  19 # Tests both unicode and not, so make sure not implicitly testing unicode
  20 no feature 'unicode_strings';
  21
  22 # Case-insensitive matching is a large and complicated issue.  Perl does not
  23 # implement it fully, properly.  For example, it doesn't include normalization
  24 # as part of the equation.  To test every conceivable combination is clearly
  25 # impossible; these tests are mostly drawn from visual inspection of the code
  26 # and experience, trying to exercise all areas.
  27
  28 # There are three basic ranges of characters that Perl may treat differently:
  29 # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
  30 #    referred to here as ASCII.  On EBCDIC machines, the non-ASCII invariants
  31 #    are all controls that fold to themselves.
  32 my $ASCII = 1;
  33
  34 # 2) Other characters that fit into a byte but are different in utf8 than not;
  35 #    here referred to, taking some liberties, as Latin1.
  36 my $Latin1 = 2;
  37
  38 # 3) Characters that won't fit in a byte; here referred to as Unicode
  39 my $Unicode = 3;
  40
  41 # Within these basic groups are equivalence classes that testing any character
  42 # in is likely to lead to the same results as any other character.  This is
  43 # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
  44 # set.
  45 my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
  46
  47 sub range_type {
  48     my $ord = shift;
  49
  50     return $ASCII if $ord < 128;
  51     return $Latin1 if $ord < 256;
  52     return $Unicode;
  53 }
  54
  55 my %todos;
  56 map { $todos{$_} = '1' } (
  57 95557,
  58 95558,
  59 95561,
  60 95562,
  61 95573,
  62 95574,
  63 95605,
  64 95606,
  65 95609,
  66 95610,
  67 95621,
  68 95622,
  69 );
  70
  71 sub numerically {
  72     return $a <=> $b
  73 }
  74
  75 sub format_test($$$) {
  76     my ($test, $count, $debug) = @_;
  77
  78     # Create a test entry, with TODO set if it is one of the known problem
  79     # code points
  80
  81     $debug = "" unless $DEBUG;
  82
  83     my $todo = (exists $todos{$count}) ? "Known problem" : 0;
  84
  85     return qq[TODO: { local \$::TODO = "$todo"; ok(eval '$test', '$test; $debug'); }];
  86 }
  87
  88 my %tests;          # The final set of tests. keys are the code points to test
  89 my %simple_folds;
  90 my %multi_folds;
  91
  92 # First, analyze the current Unicode's folding rules
  93 my %folded_from;
  94 my $file="../lib/unicore/CaseFolding.txt";
  95 open my $fh, "<", $file or die "Failed to read '$file': $!";
  96 while (<$fh>) {
  97     chomp;
  98
  99     # Lines look like (though without the initial '#')
 100     #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
 101
 102     my ($line, $comment) = split / \s+ \# \s+ /x, $_;
 103     next if $line eq "" || substr($line, 0, 1) eq '#';
 104     my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
 105
 106     my $from = hex $hex_from;
 107
 108     if ($fold_type eq 'F') {
 109          my $from_range_type = range_type($from);
 110
 111         # If we were testing comprehensively, we would try every combination
 112         # of upper and lower case in the fold, but it is quite likely that if
 113         # the code can handle all combinations if it can handle the cases
 114         # where everything is upper and when everything is lower.  Because of
 115         # complement matching, we need to do both.  And we use the
 116         # reverse-fold instead of uppercase.
 117         @folded = map { hex $_ } @folded;
 118         # XXX better to use reverse fold of these instead of uc
 119         my @uc_folded = map { ord uc chr $_ } @folded;
 120
 121         # Include three code points that are handled internally by the regex
 122         # engine specially, plus all non-above-255 multi folds (which actually
 123         # the only one is already included in the three, but this makes sure)
 124         # And if any member of the fold is not the same range type as the
 125         # source, add it directly to the tests.  It needs to be an array of an
 126         # array, so that it is distinguished from multiple single folds
 127         if ($from == 0xDF || $from == 0x390 || $from == 0x3B0
 128             || $from_range_type != $Unicode
 129             || grep { range_type($_) != $from_range_type } @folded)
 130         {
 131             $tests{$from} = [ [ @folded ], [ @uc_folded ] ];
 132         }
 133         else {
 134
 135             # The only multi-char non-utf8 fold is DF, which is handled above,
 136             # so here chr() must be utf8.  Get the number of bytes in each.
 137             # This is because the optimizer cares about length differences.
 138             my $from_length = length encode('UTF-8', chr($from));
 139             my $to_length = length encode('UTF-8', pack 'U*', @folded);
 140             push @{$multi_folds{$from_length}{$to_length}}, { $from => [ [ @folded ], [ @uc_folded ] ] };
 141         }
 142     }
 143
 144     # Perl only deals with C and F folds
 145     next if $fold_type ne 'C';
 146
 147     # C folds are single-char $from to single-char $folded, in chr terms
 148     # folded_from{'s'} = [ 'S', \N{LATIN SMALL LETTER LONG S} ]
 149     push @{$folded_from{hex $folded[0]}}, $from;
 150 }
 151
 152 # Now try to sort the single char folds into equivalence classes that are
 153 # likely to have identical successes and failures.  Any fold that crosses
 154 # range types is suspect, and is automatically tested.  Otherwise, store by
 155 # the number of characters that participate in a fold.  Likely all folds in a
 156 # range type that fold to each other like B->b->B will have identical success
 157 # and failure; similarly all folds that have three characters participating
 158 # are likely to have the same successes and failures, etc.
 159 foreach my $folded (sort numerically keys %folded_from) {
 160     my $target_range_type  = range_type($folded);
 161     my $count = @{$folded_from{$folded}};
 162
 163     # Automatically test any fold that crosses range types
 164     if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}})
 165     {
 166         $tests{$folded} = $folded_from{$folded};
 167     }
 168     else {
 169         push @{$simple_folds{$target_range_type}{$count}},
 170                { $folded => $folded_from{$folded} };
 171     }
 172 }
 173
 174 foreach my $from_length (keys %multi_folds) {
 175     foreach my $fold_length (keys %{$multi_folds{$from_length}}) {
 176         #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length};
 177         foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) {
 178             #print __LINE__, ": $from_length, $fold_length, $test:\n";
 179             my ($target, $pattern) = each %$test;
 180             #print __LINE__, ": $target: $pattern\n";
 181             $tests{$target} = $pattern;
 182             last if $skip_apparently_redundant;
 183         }
 184     }
 185 }
 186
 187 # Add in tests for single character folds.  Add tests for each range type,
 188 # and within those tests for each number of characters participating in a
 189 # fold.  Thus B->b has two characters participating.  But K->k and Kelvin
 190 # Sign->k has three characters participating.  So we would make sure that
 191 # there is a test for 3 chars, 4 chars, ... .  (Note that the 'k' example is a
 192 # bad one because it crosses range types, so is automatically tested.  In the
 193 # Unicode range there are various of these 3 and 4 char classes, but aren't as
 194 # easily described as the 'k' one.)
 195 foreach my $type (keys %simple_folds) {
 196     foreach my $count (keys %{$simple_folds{$type}}) {
 197         foreach my $test (@{$simple_folds{$type}{$count}}) {
 198             my ($target, $pattern) = each %$test;
 199             $tests{$target} = $pattern;
 200             last if $skip_apparently_redundant;
 201         }
 202     }
 203 }
 204
 205 # For each range type, test additionally a character that folds to itself
 206 $tests{0x3A} = [ 0x3A ];
 207 $tests{0xF7} = [ 0xF7 ];
 208 $tests{0x2C7} = [ 0x2C7 ];
 209
 210 my $clump_execs = 1000;    # Speed up by building an 'exec' of many tests
 211 my @eval_tests;
 212
 213 # To cut down on the number of tests
 214 my $has_tested_aa_above_latin1;
 215 my $has_tested_latin1_aa;
 216
 217 # For use by pairs() in generating combinations
 218 sub prefix {
 219     my $p = shift;
 220     map [ $p, $_ ], @_
 221 }
 222
 223 # Returns all ordered combinations of pairs of elements from the input array.
 224 # It doesn't return pairs like (a, a), (b, b).  Change the slice to an array
 225 # to do that.  This was just to have fewer tests.
 226 sub pairs (@) {
 227     #print __LINE__, ": ", join(" XXX ", @_), "\n";
 228     map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
 229 }
 230
 231
 232 # Finally ready to do the tests
 233 my $count=0;
 234 foreach my $test (sort { numerically } keys %tests) {
 235
 236   my $previous_target;
 237   my $previous_pattern;
 238   my @pairs = pairs(sort numerically $test, @{$tests{$test}});
 239
 240   # Each fold can be viewed as a closure of all the characters that
 241   # participate in it.  Look at each possible pairing from a closure, with the
 242   # first member of the pair the target string to match against, and the
 243   # second member forming the pattern.  Thus each fold member gets tested as
 244   # the string, and the pattern with every other member in the opposite role.
 245   while (my $pair = shift @pairs) {
 246     my ($target, $pattern) = @$pair;
 247
 248     # When testing a char that doesn't fold, we can get the same
 249     # permutation twice; so skip all but the first.
 250     next if $previous_target
 251             && $previous_target == $target
 252             && $previous_pattern == $pattern;
 253     ($previous_target, $previous_pattern) = ($target, $pattern);
 254
 255     # Each side may be either a single char or a string.  Extract each into an
 256     # array (perhaps of length 1)
 257     my @target, my @pattern;
 258     @target = (ref $target) ? @$target : $target;
 259     @pattern = (ref $pattern) ? @$pattern : $pattern;
 260
 261     # Have to convert non-utf8 chars to native char set
 262     @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
 263     @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
 264
 265     # Get in hex form.
 266     my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
 267     my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
 268
 269     my $target_above_latin1 = grep { $_ > 255 } @target;
 270     my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
 271     my $target_has_ascii = grep { $_ < 128 } @target;
 272     my $pattern_has_ascii = grep { $_ < 128 } @pattern;
 273     my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
 274
 275     # We don't test multi-char folding into other multi-chars.  We are testing
 276     # a code point that folds to or from other characters.  Find the single
 277     # code point for diagnostic purposes.  (If both are single, choose the
 278     # target string)
 279     my $ord = @target == 1 ? $target[0] : $pattern[0];
 280     my $progress = sprintf "%04X: \"%s\" and /%s/",
 281                             $test,
 282                             join("", @x_target),
 283                             join("", @x_pattern);
 284     #print $progress, "\n";
 285     #diag $progress;
 286
 287     # Now grind out tests, using various combinations.
 288     foreach my $charset ('d', 'u', 'aa') {
 289
 290       # /aa should only affect things with folds in the ASCII range.  But, try
 291       # it on one pair in the other ranges just to make sure it doesn't break
 292       # them.  Set these flags.  They are set to the ord of the character
 293       # tested so that all pairs of that ord get tested.
 294       if ($charset eq 'aa') {
 295         if (! $target_has_ascii && ! $pattern_has_ascii) {
 296           if ($target_above_latin1 || $pattern_above_latin1) {
 297             next if defined $has_tested_aa_above_latin1
 298                     && $has_tested_aa_above_latin1 != $test;
 299             $has_tested_aa_above_latin1 = $test;
 300           }
 301           next if defined $has_tested_latin1_aa && $has_tested_latin1_aa != $test;
 302           $has_tested_latin1_aa = $test;
 303         }
 304       }
 305
 306       foreach my $utf8_target (0, 1) {    # Both utf8 and not, for
 307                                           # code points < 256
 308         my $upgrade_target = "";
 309
 310         # These must already be in utf8 because the string to match has
 311         # something above latin1.  So impossible to test if to not to be in
 312         # utf8; and otherwise, no upgrade is needed.
 313         next if $target_above_latin1 && ! $utf8_target;
 314         $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
 315
 316         foreach my $utf8_pattern (0, 1) {
 317           next if $pattern_above_latin1 && ! $utf8_pattern;
 318           my $uni_semantics = $utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/;
 319           my $upgrade_pattern = "";
 320           $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
 321
 322           my $lhs = join "", @x_target;
 323           my @rhs = @x_pattern;
 324           my $rhs = join "", @rhs;
 325           my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
 326                             || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii);
 327
 328           # Do simple tests of referencing capture buffers, named and
 329           # numbered.
 330           my $op = '=~';
 331           $op = '!~' if $should_fail;
 332
 333           my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 334           push @eval_tests, format_test($eval, ++$count, "");
 335
 336           $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 337           push @eval_tests, format_test($eval, ++$count, "");
 338
 339           if ($lhs ne $rhs) {
 340             $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 341             push @eval_tests, format_test($eval, ++$count, "");
 342
 343             $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 344             push @eval_tests, format_test($eval, ++$count, "");
 345           }
 346
 347           foreach my $bracketed (0, 1) {   # Put rhs in [...], or not
 348             foreach my $inverted (0,1) {
 349                 next if $inverted && ! $bracketed;  # inversion only valid in [^...]
 350
 351               # In some cases, add an extra character that doesn't fold, and
 352               # looks ok in the output.
 353               my $extra_char = "_";
 354               foreach my $prepend ("", $extra_char) {
 355                 foreach my $append ("", $extra_char) {
 356
 357                   # Assemble the rhs.  Put each character in a separate
 358                   # bracketed if using charclasses.  This creates a stress on
 359                   # the code to span a match across multiple elements
 360                   my $rhs = "";
 361                   foreach my $rhs_char (@rhs) {
 362                       $rhs .= '[' if $bracketed;
 363                       $rhs .= '^' if $inverted;
 364                       $rhs .=  $rhs_char;
 365
 366                       # Add a character to the class, so class doesn't get
 367                       # optimized out
 368                       $rhs .= '_]' if $bracketed;
 369                   }
 370
 371                   # Add one of: no capturing parens
 372                   #             a single set
 373                   #             a nested set
 374                   # Use quantifiers and extra variable width matches inside
 375                   # them to keep some optimizations from happening
 376                   foreach my $parend (0, 1, 2) {
 377                     my $interior = (! $parend)
 378                                     ? $rhs
 379                                     : ($parend == 1)
 380                                         ? "(${rhs},?)"
 381                                         : "((${rhs})+,?)";
 382                     foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
 383
 384                       # A ? or * quantifier normally causes the thing to be
 385                       # able to match a null string
 386                       my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';
 387
 388                       # But since we only quantify the last character in a
 389                       # multiple fold, the other characters will have width,
 390                       # except if we are quantifying the whole rhs
 391                       my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);
 392
 393                       foreach my $l_anchor ("", '^') { # '\A' didn't change result)
 394                         foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)
 395
 396                           # The folded part can match the null string if it
 397                           # isn't required to have width, and there's not
 398                           # something on one or both sides that force it to.
 399                           my $both_sides = ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
 400                           my $must_match = ! $can_match_null || $both_sides;
 401                           # for performance, but doing this missed many failures
 402                           #next unless $must_match;
 403                           my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
 404                           my $op;
 405                           if ($must_match && $should_fail)  {
 406                               $op = 0;
 407                           } else {
 408                               $op = 1;
 409                           }
 410                           $op = ! $op if $must_match && $inverted;
 411
 412                           if ($inverted && @target > 1) {
 413                             # When doing an inverted match against a
 414                             # multi-char target, and there is not something on
 415                             # the left to anchor the match, if it shouldn't
 416                             # succeed, skip, as what will happen (when working
 417                             # correctly) is that it will match the first
 418                             # position correctly, and then be inverted to not
 419                             # match; then it will go to the second position
 420                             # where it won't match, but get inverted to match,
 421                             # and hence succeeding.
 422                             next if ! ($l_anchor || $prepend) && ! $op;
 423
 424                             # Can't ever match for latin1 code points non-uni
 425                             # semantics that have a inverted multi-char fold
 426                             # when there is something on both sides and the
 427                             # quantifier isn't such as to span the required
 428                             # width, which is 2 or 3.
 429                             $op = 0 if $ord < 255
 430                                        && ! $uni_semantics
 431                                        && $both_sides
 432                                        && ( ! $quantifier || $quantifier eq '?')
 433                                        && $parend < 2;
 434
 435                             # Similarly can't ever match when inverting a multi-char
 436                             # fold for /aa and the quantifier isn't sufficient
 437                             # to allow it to span to both sides.
 438                             $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
 439                           }
 440
 441                           $op = ($op) ? '=~' : '!~';
 442
 443                           my $debug .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";
 444                           $debug .= "; pattern_above_latin1=$pattern_above_latin1; utf8_pattern=$utf8_pattern";
 445                           my $eval = "my \$c = \"$prepend$lhs$append\"; my \$p = qr/$quantified/i;$upgrade_target$upgrade_pattern \$c $op \$p";
 446
 447                           # XXX Doesn't currently test multi-char folds in pattern
 448                           next if @pattern != 1;
 449                           push @eval_tests, format_test($eval, ++$count, $debug);
 450
 451                           # Group tests
 452                           if (@eval_tests >= $clump_execs) {
 453                               #eval "use re qw(Debug COMPILE EXECUTE);" . join ";\n", @eval_tests;
 454                               eval join ";\n", @eval_tests;
 455                               if ($@) {
 456                                 fail($@);
 457                                 exit 1;
 458                               }
 459                               undef @eval_tests;
 460                           }
 461                         }
 462                       }
 463                     }
 464                   }
 465                 }
 466               }
 467             }
 468           }
 469         }
 470       }
 471     }
 472   }
 473 }
 474
 475 # Finish up any tests not already done
 476 eval join ";\n", @eval_tests;
 477 if ($@) {
 478   fail($@);
 479   exit 1;
 480 }
 481
 482 plan($count);
 483
 484 1