| 1 | # Grind out a lot of combinatoric tests for folding. |
| 2 | |
| 3 | binmode STDOUT, ":utf8"; |
| 4 | |
| 5 | BEGIN { |
| 6 | chdir 't' if -d 't'; |
| 7 | @INC = '../lib'; |
| 8 | require './test.pl'; |
| 9 | skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX"); |
| 10 | } |
| 11 | |
| 12 | use charnames ":full"; |
| 13 | |
| 14 | my $DEBUG = 0; # Outputs extra information for debugging this .t |
| 15 | |
| 16 | use strict; |
| 17 | use warnings; |
| 18 | use Encode; |
| 19 | use POSIX; |
| 20 | |
| 21 | # Tests both unicode and not, so make sure not implicitly testing unicode |
| 22 | no feature 'unicode_strings'; |
| 23 | |
| 24 | # Case-insensitive matching is a large and complicated issue. Perl does not |
| 25 | # implement it fully, properly. For example, it doesn't include normalization |
| 26 | # as part of the equation. To test every conceivable combination is clearly |
| 27 | # impossible; these tests are mostly drawn from visual inspection of the code |
| 28 | # and experience, trying to exercise all areas. |
| 29 | |
| 30 | # There are three basic ranges of characters that Perl may treat differently: |
| 31 | # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are |
| 32 | # referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants |
| 33 | # are all controls that fold to themselves. |
| 34 | my $ASCII = 1; |
| 35 | |
| 36 | # 2) Other characters that fit into a byte but are different in utf8 than not; |
| 37 | # here referred to, taking some liberties, as Latin1. |
| 38 | my $Latin1 = 2; |
| 39 | |
| 40 | # 3) Characters that won't fit in a byte; here referred to as Unicode |
| 41 | my $Unicode = 3; |
| 42 | |
| 43 | # Within these basic groups are equivalence classes that testing any character |
| 44 | # in is likely to lead to the same results as any other character. This is |
| 45 | # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is |
| 46 | # set. |
| 47 | my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS}; |
| 48 | |
| 49 | # Additionally parts of this test run a lot of subtests, outputting the |
| 50 | # resulting TAP can be expensive so the tests are summarised internally. The |
| 51 | # PERL_DEBUG_FULL_TEST environment variable can be set to produce the full |
| 52 | # output for debugging purposes. |
| 53 | |
| 54 | sub range_type { |
| 55 | my $ord = shift; |
| 56 | |
| 57 | return $ASCII if $ord < 128; |
| 58 | return $Latin1 if $ord < 256; |
| 59 | return $Unicode; |
| 60 | } |
| 61 | |
| 62 | sub numerically { |
| 63 | return $a <=> $b |
| 64 | } |
| 65 | |
| 66 | sub run_test($$$$) { |
| 67 | my ($test, $count, $todo, $debug) = @_; |
| 68 | |
| 69 | $debug = "" unless $DEBUG; |
| 70 | ok(eval $test, "$test; $debug"); |
| 71 | } |
| 72 | |
| 73 | my %tests; # The final set of tests. keys are the code points to test |
| 74 | my %simple_folds; |
| 75 | my %multi_folds; |
| 76 | |
| 77 | # First, analyze the current Unicode's folding rules |
| 78 | my %folded_from; |
| 79 | my $file="../lib/unicore/CaseFolding.txt"; |
| 80 | open my $fh, "<", $file or die "Failed to read '$file': $!"; |
| 81 | while (<$fh>) { |
| 82 | chomp; |
| 83 | |
| 84 | # Lines look like (though without the initial '#') |
| 85 | #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
| 86 | |
| 87 | my ($line, $comment) = split / \s+ \# \s+ /x, $_; |
| 88 | next if $line eq "" || substr($line, 0, 1) eq '#'; |
| 89 | my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line; |
| 90 | |
| 91 | my $from = hex $hex_from; |
| 92 | |
| 93 | if ($fold_type eq 'F') { |
| 94 | my $from_range_type = range_type($from); |
| 95 | |
| 96 | # If we were testing comprehensively, we would try every combination |
| 97 | # of upper and lower case in the fold, but it is quite likely that if |
| 98 | # the code can handle all combinations if it can handle the cases |
| 99 | # where everything is upper and when everything is lower. Because of |
| 100 | # complement matching, we need to do both. And we use the |
| 101 | # reverse-fold instead of uppercase. |
| 102 | @folded = map { hex $_ } @folded; |
| 103 | # XXX better to use reverse fold of these instead of uc |
| 104 | my @uc_folded = map { ord uc chr $_ } @folded; |
| 105 | |
| 106 | # Include three code points that are handled internally by the regex |
| 107 | # engine specially, plus all non-above-255 multi folds (which actually |
| 108 | # the only one is already included in the three, but this makes sure) |
| 109 | # And if any member of the fold is not the same range type as the |
| 110 | # source, add it directly to the tests. It needs to be an array of an |
| 111 | # array, so that it is distinguished from multiple single folds |
| 112 | if ($from == 0xDF || $from == 0x390 || $from == 0x3B0 |
| 113 | || $from_range_type != $Unicode |
| 114 | || grep { range_type($_) != $from_range_type } @folded) |
| 115 | { |
| 116 | $tests{$from} = [ [ @folded ], [ @uc_folded ] ]; |
| 117 | } |
| 118 | else { |
| 119 | |
| 120 | # The only multi-char non-utf8 fold is DF, which is handled above, |
| 121 | # so here chr() must be utf8. Get the number of bytes in each. |
| 122 | # This is because the optimizer cares about length differences. |
| 123 | my $from_length = length encode('UTF-8', chr($from)); |
| 124 | my $to_length = length encode('UTF-8', pack 'U*', @folded); |
| 125 | push @{$multi_folds{$from_length}{$to_length}}, { $from => [ [ @folded ], [ @uc_folded ] ] }; |
| 126 | } |
| 127 | } |
| 128 | |
| 129 | # Perl only deals with C and F folds |
| 130 | next if $fold_type ne 'C'; |
| 131 | |
| 132 | # C folds are single-char $from to single-char $folded, in chr terms |
| 133 | # folded_from{'s'} = [ 'S', \N{LATIN SMALL LETTER LONG S} ] |
| 134 | push @{$folded_from{hex $folded[0]}}, $from; |
| 135 | } |
| 136 | |
| 137 | # Now try to sort the single char folds into equivalence classes that are |
| 138 | # likely to have identical successes and failures. Any fold that crosses |
| 139 | # range types is suspect, and is automatically tested. Otherwise, store by |
| 140 | # the number of characters that participate in a fold. Likely all folds in a |
| 141 | # range type that fold to each other like B->b->B will have identical success |
| 142 | # and failure; similarly all folds that have three characters participating |
| 143 | # are likely to have the same successes and failures, etc. |
| 144 | foreach my $folded (sort numerically keys %folded_from) { |
| 145 | my $target_range_type = range_type($folded); |
| 146 | my $count = @{$folded_from{$folded}}; |
| 147 | |
| 148 | # Automatically test any fold that crosses range types |
| 149 | if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}}) |
| 150 | { |
| 151 | $tests{$folded} = $folded_from{$folded}; |
| 152 | } |
| 153 | else { |
| 154 | push @{$simple_folds{$target_range_type}{$count}}, |
| 155 | { $folded => $folded_from{$folded} }; |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | foreach my $from_length (keys %multi_folds) { |
| 160 | foreach my $fold_length (keys %{$multi_folds{$from_length}}) { |
| 161 | #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length}; |
| 162 | foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) { |
| 163 | #print __LINE__, ": $from_length, $fold_length, $test:\n"; |
| 164 | my ($target, $pattern) = each %$test; |
| 165 | #print __LINE__, ": $target: $pattern\n"; |
| 166 | $tests{$target} = $pattern; |
| 167 | last if $skip_apparently_redundant; |
| 168 | } |
| 169 | } |
| 170 | } |
| 171 | |
| 172 | # Add in tests for single character folds. Add tests for each range type, |
| 173 | # and within those tests for each number of characters participating in a |
| 174 | # fold. Thus B->b has two characters participating. But K->k and Kelvin |
| 175 | # Sign->k has three characters participating. So we would make sure that |
| 176 | # there is a test for 3 chars, 4 chars, ... . (Note that the 'k' example is a |
| 177 | # bad one because it crosses range types, so is automatically tested. In the |
| 178 | # Unicode range there are various of these 3 and 4 char classes, but aren't as |
| 179 | # easily described as the 'k' one.) |
| 180 | foreach my $type (keys %simple_folds) { |
| 181 | foreach my $count (keys %{$simple_folds{$type}}) { |
| 182 | foreach my $test (@{$simple_folds{$type}{$count}}) { |
| 183 | my ($target, $pattern) = each %$test; |
| 184 | $tests{$target} = $pattern; |
| 185 | last if $skip_apparently_redundant; |
| 186 | } |
| 187 | } |
| 188 | } |
| 189 | |
| 190 | # For each range type, test additionally a character that folds to itself |
| 191 | $tests{0x3A} = [ 0x3A ]; |
| 192 | $tests{0xF7} = [ 0xF7 ]; |
| 193 | $tests{0x2C7} = [ 0x2C7 ]; |
| 194 | |
| 195 | |
| 196 | # To cut down on the number of tests |
| 197 | my $has_tested_aa_above_latin1; |
| 198 | my $has_tested_latin1_aa; |
| 199 | my $has_tested_ascii_aa; |
| 200 | my $has_tested_l_above_latin1; |
| 201 | my $has_tested_above_latin1_l; |
| 202 | my $has_tested_ascii_l; |
| 203 | my $has_tested_above_latin1_d; |
| 204 | my $has_tested_ascii_d; |
| 205 | my $has_tested_non_latin1_d; |
| 206 | my $has_tested_above_latin1_a; |
| 207 | my $has_tested_ascii_a; |
| 208 | my $has_tested_non_latin1_a; |
| 209 | |
| 210 | # For use by pairs() in generating combinations |
| 211 | sub prefix { |
| 212 | my $p = shift; |
| 213 | map [ $p, $_ ], @_ |
| 214 | } |
| 215 | |
| 216 | # Returns all ordered combinations of pairs of elements from the input array. |
| 217 | # It doesn't return pairs like (a, a), (b, b). Change the slice to an array |
| 218 | # to do that. This was just to have fewer tests. |
| 219 | sub pairs (@) { |
| 220 | #print __LINE__, ": ", join(" XXX ", @_), "\n"; |
| 221 | map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ |
| 222 | } |
| 223 | |
| 224 | my @charsets = qw(d u a aa); |
| 225 | my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // ""; |
| 226 | push @charsets, 'l' if $current_locale eq 'C'; |
| 227 | |
| 228 | # Finally ready to do the tests |
| 229 | my $count=0; |
| 230 | foreach my $test (sort { numerically } keys %tests) { |
| 231 | |
| 232 | my $previous_target; |
| 233 | my $previous_pattern; |
| 234 | my @pairs = pairs(sort numerically $test, @{$tests{$test}}); |
| 235 | |
| 236 | # Each fold can be viewed as a closure of all the characters that |
| 237 | # participate in it. Look at each possible pairing from a closure, with the |
| 238 | # first member of the pair the target string to match against, and the |
| 239 | # second member forming the pattern. Thus each fold member gets tested as |
| 240 | # the string, and the pattern with every other member in the opposite role. |
| 241 | while (my $pair = shift @pairs) { |
| 242 | my ($target, $pattern) = @$pair; |
| 243 | |
| 244 | # When testing a char that doesn't fold, we can get the same |
| 245 | # permutation twice; so skip all but the first. |
| 246 | next if $previous_target |
| 247 | && $previous_target == $target |
| 248 | && $previous_pattern == $pattern; |
| 249 | ($previous_target, $previous_pattern) = ($target, $pattern); |
| 250 | |
| 251 | # Each side may be either a single char or a string. Extract each into an |
| 252 | # array (perhaps of length 1) |
| 253 | my @target, my @pattern; |
| 254 | @target = (ref $target) ? @$target : $target; |
| 255 | @pattern = (ref $pattern) ? @$pattern : $pattern; |
| 256 | |
| 257 | # Have to convert non-utf8 chars to native char set |
| 258 | @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target; |
| 259 | @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern; |
| 260 | |
| 261 | # Get in hex form. |
| 262 | my @x_target = map { sprintf "\\x{%04X}", $_ } @target; |
| 263 | my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern; |
| 264 | |
| 265 | my $target_above_latin1 = grep { $_ > 255 } @target; |
| 266 | my $pattern_above_latin1 = grep { $_ > 255 } @pattern; |
| 267 | my $target_has_ascii = grep { $_ < 128 } @target; |
| 268 | my $pattern_has_ascii = grep { $_ < 128 } @pattern; |
| 269 | my $target_only_ascii = ! grep { $_ > 127 } @target; |
| 270 | my $pattern_only_ascii = ! grep { $_ > 127 } @pattern; |
| 271 | my $target_has_latin1 = grep { $_ < 256 } @target; |
| 272 | my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target; |
| 273 | my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern; |
| 274 | my $pattern_has_latin1 = grep { $_ < 256 } @pattern; |
| 275 | my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0]; |
| 276 | |
| 277 | # We don't test multi-char folding into other multi-chars. We are testing |
| 278 | # a code point that folds to or from other characters. Find the single |
| 279 | # code point for diagnostic purposes. (If both are single, choose the |
| 280 | # target string) |
| 281 | my $ord = @target == 1 ? $target[0] : $pattern[0]; |
| 282 | my $progress = sprintf "%04X: \"%s\" and /%s/", |
| 283 | $test, |
| 284 | join("", @x_target), |
| 285 | join("", @x_pattern); |
| 286 | #print $progress, "\n"; |
| 287 | #diag $progress; |
| 288 | |
| 289 | # Now grind out tests, using various combinations. |
| 290 | foreach my $charset (@charsets) { |
| 291 | |
| 292 | # To cut down somewhat on the enormous quantity of tests this currently |
| 293 | # runs, skip some for some of the character sets whose results aren't |
| 294 | # likely to differ from others. But run all tests on the code points |
| 295 | # that don't fold, plus one other set in each range group. |
| 296 | if (! $is_self) { |
| 297 | |
| 298 | # /aa should only affect things with folds in the ASCII range. But, try |
| 299 | # it on one set in the other ranges just to make sure it doesn't break |
| 300 | # them. |
| 301 | if ($charset eq 'aa') { |
| 302 | if (! $target_has_ascii && ! $pattern_has_ascii) { |
| 303 | if ($target_above_latin1 || $pattern_above_latin1) { |
| 304 | next if defined $has_tested_aa_above_latin1 |
| 305 | && $has_tested_aa_above_latin1 != $test; |
| 306 | $has_tested_aa_above_latin1 = $test; |
| 307 | } |
| 308 | next if defined $has_tested_latin1_aa |
| 309 | && $has_tested_latin1_aa != $test; |
| 310 | $has_tested_latin1_aa = $test; |
| 311 | } |
| 312 | elsif ($target_only_ascii && $pattern_only_ascii) { |
| 313 | |
| 314 | # And, except for one set just to make sure, skip tests |
| 315 | # where both elements in the pair are ASCII. If one works for |
| 316 | # aa, the others are likely too. This skips tests where the |
| 317 | # fold is from non-ASCII to ASCII, but this part of the test |
| 318 | # is just about the ASCII components. |
| 319 | next if defined $has_tested_ascii_l |
| 320 | && $has_tested_ascii_l != $test; |
| 321 | $has_tested_ascii_l = $test; |
| 322 | } |
| 323 | } |
| 324 | elsif ($charset eq 'l') { |
| 325 | |
| 326 | # For l, don't need to test beyond one set those things that are |
| 327 | # all above latin1, because unlikely to have different successes |
| 328 | # than /u |
| 329 | if (! $target_has_latin1 && ! $pattern_has_latin1) { |
| 330 | next if defined $has_tested_above_latin1_l |
| 331 | && $has_tested_above_latin1_l != $test; |
| 332 | $has_tested_above_latin1_l = $test; |
| 333 | } |
| 334 | elsif ($target_only_ascii && $pattern_only_ascii) { |
| 335 | |
| 336 | # And, except for one set just to make sure, skip tests |
| 337 | # where both elements in the pair are ASCII. This is |
| 338 | # essentially the same reasoning as above for /aa. |
| 339 | next if defined $has_tested_ascii_l |
| 340 | && $has_tested_ascii_l != $test; |
| 341 | $has_tested_ascii_l = $test; |
| 342 | } |
| 343 | } |
| 344 | elsif ($charset eq 'd') { |
| 345 | # Similarly for d. Beyond one test (besides self) each, we don't |
| 346 | # test pairs that are both ascii; or both above latin1, or are |
| 347 | # combinations of ascii and above latin1. |
| 348 | if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { |
| 349 | if ($target_has_ascii && $pattern_has_ascii) { |
| 350 | next if defined $has_tested_ascii_d |
| 351 | && $has_tested_ascii_d != $test; |
| 352 | $has_tested_ascii_d = $test |
| 353 | } |
| 354 | elsif (! $target_has_latin1 && ! $pattern_has_latin1) { |
| 355 | next if defined $has_tested_above_latin1_d |
| 356 | && $has_tested_above_latin1_d != $test; |
| 357 | $has_tested_above_latin1_d = $test; |
| 358 | } |
| 359 | else { |
| 360 | next if defined $has_tested_non_latin1_d |
| 361 | && $has_tested_non_latin1_d != $test; |
| 362 | $has_tested_non_latin1_d = $test; |
| 363 | } |
| 364 | } |
| 365 | } |
| 366 | elsif ($charset eq 'a') { |
| 367 | # Similarly for a. This should match identically to /u, so wasn't |
| 368 | # tested at all until a bug was found that was thereby missed. |
| 369 | # As a compromise, beyond one test (besides self) each, we don't |
| 370 | # test pairs that are both ascii; or both above latin1, or are |
| 371 | # combinations of ascii and above latin1. |
| 372 | if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { |
| 373 | if ($target_has_ascii && $pattern_has_ascii) { |
| 374 | next if defined $has_tested_ascii_a |
| 375 | && $has_tested_ascii_a != $test; |
| 376 | $has_tested_ascii_a = $test |
| 377 | } |
| 378 | elsif (! $target_has_latin1 && ! $pattern_has_latin1) { |
| 379 | next if defined $has_tested_above_latin1_a |
| 380 | && $has_tested_above_latin1_a != $test; |
| 381 | $has_tested_above_latin1_a = $test; |
| 382 | } |
| 383 | else { |
| 384 | next if defined $has_tested_non_latin1_a |
| 385 | && $has_tested_non_latin1_a != $test; |
| 386 | $has_tested_non_latin1_a = $test; |
| 387 | } |
| 388 | } |
| 389 | } |
| 390 | } |
| 391 | |
| 392 | foreach my $utf8_target (0, 1) { # Both utf8 and not, for |
| 393 | # code points < 256 |
| 394 | my $upgrade_target = ""; |
| 395 | |
| 396 | # These must already be in utf8 because the string to match has |
| 397 | # something above latin1. So impossible to test if to not to be in |
| 398 | # utf8; and otherwise, no upgrade is needed. |
| 399 | next if $target_above_latin1 && ! $utf8_target; |
| 400 | $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target; |
| 401 | |
| 402 | foreach my $utf8_pattern (0, 1) { |
| 403 | next if $pattern_above_latin1 && ! $utf8_pattern; |
| 404 | |
| 405 | # Our testing of 'l' uses the POSIX locale, which is ASCII-only |
| 406 | my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/); |
| 407 | my $upgrade_pattern = ""; |
| 408 | $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern; |
| 409 | |
| 410 | my $lhs = join "", @x_target; |
| 411 | my $lhs_str = eval qq{"$lhs"}; fail($@) if $@; |
| 412 | my @rhs = @x_pattern; |
| 413 | my $rhs = join "", @rhs; |
| 414 | my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self) |
| 415 | || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii) |
| 416 | || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1); |
| 417 | |
| 418 | # Do simple tests of referencing capture buffers, named and |
| 419 | # numbered. |
| 420 | my $op = '=~'; |
| 421 | $op = '!~' if $should_fail; |
| 422 | |
| 423 | # I'm afraid this was derived from trial and error. |
| 424 | my $todo = ($test == 0xdf |
| 425 | && $lhs =~ /DF/ |
| 426 | && $uni_semantics |
| 427 | && ($charset eq 'u' || $charset eq 'a' || $charset eq 'd') |
| 428 | && ! (($charset eq 'u' || $charset eq 'a') |
| 429 | && (($upgrade_target eq "") != ($upgrade_pattern eq ""))) |
| 430 | && ! ($charset eq 'd' && (! $upgrade_target || ! $upgrade_pattern)) |
| 431 | ); |
| 432 | my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
| 433 | run_test($eval, ++$count, $todo, ""); |
| 434 | |
| 435 | $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
| 436 | run_test($eval, ++$count, $todo, ""); |
| 437 | |
| 438 | if ($lhs ne $rhs) { |
| 439 | $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
| 440 | run_test($eval, ++$count, "", ""); |
| 441 | |
| 442 | $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
| 443 | run_test($eval, ++$count, "", ""); |
| 444 | } |
| 445 | |
| 446 | # XXX Doesn't currently test multi-char folds in pattern |
| 447 | next if @pattern != 1; |
| 448 | |
| 449 | my $okays = 0; |
| 450 | my $this_iteration = 0; |
| 451 | |
| 452 | foreach my $bracketed (0, 1) { # Put rhs in [...], or not |
| 453 | foreach my $inverted (0,1) { |
| 454 | next if $inverted && ! $bracketed; # inversion only valid in [^...] |
| 455 | next if $inverted && @target != 1; # [perl #89750] multi-char |
| 456 | # not valid in [^...] |
| 457 | |
| 458 | # In some cases, add an extra character that doesn't fold, and |
| 459 | # looks ok in the output. |
| 460 | my $extra_char = "_"; |
| 461 | foreach my $prepend ("", $extra_char) { |
| 462 | foreach my $append ("", $extra_char) { |
| 463 | |
| 464 | # Assemble the rhs. Put each character in a separate |
| 465 | # bracketed if using charclasses. This creates a stress on |
| 466 | # the code to span a match across multiple elements |
| 467 | my $rhs = ""; |
| 468 | foreach my $rhs_char (@rhs) { |
| 469 | $rhs .= '[' if $bracketed; |
| 470 | $rhs .= '^' if $inverted; |
| 471 | $rhs .= $rhs_char; |
| 472 | |
| 473 | # Add a character to the class, so class doesn't get |
| 474 | # optimized out |
| 475 | $rhs .= '_]' if $bracketed; |
| 476 | } |
| 477 | |
| 478 | # Add one of: no capturing parens |
| 479 | # a single set |
| 480 | # a nested set |
| 481 | # Use quantifiers and extra variable width matches inside |
| 482 | # them to keep some optimizations from happening |
| 483 | foreach my $parend (0, 1, 2) { |
| 484 | my $interior = (! $parend) |
| 485 | ? $rhs |
| 486 | : ($parend == 1) |
| 487 | ? "(${rhs},?)" |
| 488 | : "((${rhs})+,?)"; |
| 489 | foreach my $quantifier ("", '?', '*', '+', '{1,3}') { |
| 490 | |
| 491 | # A ? or * quantifier normally causes the thing to be |
| 492 | # able to match a null string |
| 493 | my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*'; |
| 494 | |
| 495 | # But since we only quantify the last character in a |
| 496 | # multiple fold, the other characters will have width, |
| 497 | # except if we are quantifying the whole rhs |
| 498 | my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend); |
| 499 | |
| 500 | foreach my $l_anchor ("", '^') { # '\A' didn't change result) |
| 501 | foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result) |
| 502 | |
| 503 | # The folded part can match the null string if it |
| 504 | # isn't required to have width, and there's not |
| 505 | # something on one or both sides that force it to. |
| 506 | my $both_sides = ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append); |
| 507 | my $must_match = ! $can_match_null || $both_sides; |
| 508 | # for performance, but doing this missed many failures |
| 509 | #next unless $must_match; |
| 510 | my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; |
| 511 | my $op; |
| 512 | if ($must_match && $should_fail) { |
| 513 | $op = 0; |
| 514 | } else { |
| 515 | $op = 1; |
| 516 | } |
| 517 | $op = ! $op if $must_match && $inverted; |
| 518 | |
| 519 | if ($inverted && @target > 1) { |
| 520 | # When doing an inverted match against a |
| 521 | # multi-char target, and there is not something on |
| 522 | # the left to anchor the match, if it shouldn't |
| 523 | # succeed, skip, as what will happen (when working |
| 524 | # correctly) is that it will match the first |
| 525 | # position correctly, and then be inverted to not |
| 526 | # match; then it will go to the second position |
| 527 | # where it won't match, but get inverted to match, |
| 528 | # and hence succeeding. |
| 529 | next if ! ($l_anchor || $prepend) && ! $op; |
| 530 | |
| 531 | # Can't ever match for latin1 code points non-uni |
| 532 | # semantics that have a inverted multi-char fold |
| 533 | # when there is something on both sides and the |
| 534 | # quantifier isn't such as to span the required |
| 535 | # width, which is 2 or 3. |
| 536 | $op = 0 if $ord < 255 |
| 537 | && ! $uni_semantics |
| 538 | && $both_sides |
| 539 | && ( ! $quantifier || $quantifier eq '?') |
| 540 | && $parend < 2; |
| 541 | |
| 542 | # Similarly can't ever match when inverting a multi-char |
| 543 | # fold for /aa and the quantifier isn't sufficient |
| 544 | # to allow it to span to both sides. |
| 545 | $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2; |
| 546 | |
| 547 | # Or for /l |
| 548 | $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2; |
| 549 | } |
| 550 | |
| 551 | |
| 552 | my $desc = "my \$c = \"$prepend$lhs$append\"; " |
| 553 | . "my \$p = qr/$quantified/i;" |
| 554 | . "$upgrade_target$upgrade_pattern " |
| 555 | . "\$c " . ($op ? "=~" : "!~") . " \$p; "; |
| 556 | if ($DEBUG) { |
| 557 | $desc .= ( |
| 558 | "; uni_semantics=$uni_semantics, " |
| 559 | . "should_fail=$should_fail, " |
| 560 | . "bracketed=$bracketed, " |
| 561 | . "prepend=$prepend, " |
| 562 | . "append=$append, " |
| 563 | . "parend=$parend, " |
| 564 | . "quantifier=$quantifier, " |
| 565 | . "l_anchor=$l_anchor, " |
| 566 | . "r_anchor=$r_anchor; " |
| 567 | . "pattern_above_latin1=$pattern_above_latin1; " |
| 568 | . "utf8_pattern=$utf8_pattern" |
| 569 | ); |
| 570 | } |
| 571 | |
| 572 | my $c = "$prepend$lhs_str$append"; |
| 573 | my $p = qr/$quantified/i; |
| 574 | utf8::upgrade($c) if length($upgrade_target); |
| 575 | utf8::upgrade($p) if length($upgrade_pattern); |
| 576 | my $res = $op ? ($c =~ $p): ($c !~ $p); |
| 577 | |
| 578 | if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) { |
| 579 | # Failed or debug; output the result |
| 580 | $count++; |
| 581 | ok($res, $desc); |
| 582 | } else { |
| 583 | # Just count the test as passed |
| 584 | $okays++; |
| 585 | } |
| 586 | $this_iteration++; |
| 587 | } |
| 588 | } |
| 589 | } |
| 590 | } |
| 591 | } |
| 592 | } |
| 593 | } |
| 594 | } |
| 595 | |
| 596 | unless($ENV{PERL_DEBUG_FULL_TEST}) { |
| 597 | $count++; |
| 598 | is $okays, $this_iteration, "Subtests okay for " |
| 599 | . "charset=$charset, utf8_pattern=$utf8_pattern"; |
| 600 | } |
| 601 | } |
| 602 | } |
| 603 | } |
| 604 | } |
| 605 | } |
| 606 | |
| 607 | plan($count); |
| 608 | |
| 609 | 1 |