| 1 | #!./perl |
| 2 | |
| 3 | # The tests are in a separate file 't/re/re_tests'. |
| 4 | # Each line in that file is a separate test. |
| 5 | # There are five columns, separated by tabs. |
| 6 | # An optional sixth column is used to give a reason, only when skipping tests |
| 7 | # |
| 8 | # Column 1 contains the pattern, optionally enclosed in C<''>. |
| 9 | # Modifiers can be put after the closing C<'>. |
| 10 | # |
| 11 | # Column 2 contains the string to be matched. |
| 12 | # |
| 13 | # Column 3 contains the expected result: |
| 14 | # y expect a match |
| 15 | # n expect no match |
| 16 | # c expect an error |
| 17 | # T the test is a TODO (can be combined with y/n/c) |
| 18 | # M skip test on miniperl (combine with y/n/c/T) |
| 19 | # B test exposes a known bug in Perl, should be skipped |
| 20 | # b test exposes a known bug in Perl, should be skipped if noamp |
| 21 | # t test exposes a bug with threading, TODO if qr_embed_thr |
| 22 | # s test should only be run for regex_sets_compat.t |
| 23 | # S test should not be run for regex_sets_compat.t |
| 24 | # a test should only be run on ASCII platforms |
| 25 | # e test should only be run on EBCDIC platforms |
| 26 | # |
| 27 | # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. |
| 28 | # |
| 29 | # Column 4 contains a string, usually C<$&>. |
| 30 | # |
| 31 | # Column 5 contains the expected result of double-quote |
| 32 | # interpolating that string after the match, or start of error message. |
| 33 | # |
| 34 | # Column 6, if present, contains a reason why the test is skipped. |
| 35 | # This is printed with "skipped", for harness to pick up. |
| 36 | # |
| 37 | # Column 7 can be used for comments |
| 38 | # |
| 39 | # \n in the tests are interpolated, as are variables of the form ${\w+}. |
| 40 | # |
| 41 | # Blanks lines are treated as PASSING tests to keep the line numbers |
| 42 | # linked to the test number. |
| 43 | # |
| 44 | # If you want to add a regular expression test that can't be expressed |
| 45 | # in this format, don't add it here: put it in re/pat.t instead. |
| 46 | # |
| 47 | # Note that the inputs get passed on as "m're'", so the re bypasses the lexer. |
| 48 | # This means this file cannot be used for testing anything that the lexer |
| 49 | # handles; in 5.12 this means just \N{NAME} and \N{U+...}. |
| 50 | # |
| 51 | # Note that columns 2,3 and 5 are all enclosed in double quotes and then |
| 52 | # evalled; so something like a\"\x{100}$1 has length 3+length($1). |
| 53 | # |
| 54 | # \x... and \o{...} constants are automatically converted to the native |
| 55 | # character set if necessary. \[0-7] constants aren't |
| 56 | |
| 57 | my ($file, $iters); |
| 58 | BEGIN { |
| 59 | $iters = shift || 1; # Poor man performance suite, 10000 is OK. |
| 60 | |
| 61 | # Do this open before any chdir |
| 62 | $file = shift; |
| 63 | if (defined $file) { |
| 64 | open TESTS, $file or die "Can't open $file"; |
| 65 | } |
| 66 | |
| 67 | chdir 't' if -d 't'; |
| 68 | @INC = qw '../lib ../ext/re'; |
| 69 | if (!defined &DynaLoader::boot_DynaLoader) { # miniperl |
| 70 | print("1..0 # Skip Unicode tables not built yet\n"), exit |
| 71 | unless eval 'require "unicore/Heavy.pl"'; |
| 72 | } |
| 73 | } |
| 74 | |
| 75 | sub _comment { |
| 76 | return map { /^#/ ? "$_\n" : "# $_\n" } |
| 77 | map { split /\n/ } @_; |
| 78 | } |
| 79 | |
| 80 | sub convert_from_ascii { |
| 81 | my $string = shift; |
| 82 | |
| 83 | #my $save = $string; |
| 84 | # Convert \x{...}, \o{...} |
| 85 | $string =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) . "}" /gex; |
| 86 | $string =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) . "}" /gex; |
| 87 | |
| 88 | # Convert \xAB |
| 89 | $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex; |
| 90 | |
| 91 | # Convert \xA |
| 92 | $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex; |
| 93 | |
| 94 | #print STDERR __LINE__, ": $save\n$string\n" if $save ne $string; |
| 95 | return $string; |
| 96 | } |
| 97 | |
| 98 | use strict; |
| 99 | use warnings FATAL=>"all"; |
| 100 | use vars qw($bang $ffff $nulnul); # used by the tests |
| 101 | use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers |
| 102 | |
| 103 | |
| 104 | |
| 105 | if (!defined $file) { |
| 106 | open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; |
| 107 | } |
| 108 | |
| 109 | my @tests = <TESTS>; |
| 110 | |
| 111 | close TESTS; |
| 112 | |
| 113 | $bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. |
| 114 | $ffff = chr(0xff) x 2; |
| 115 | $nulnul = "\0" x 2; |
| 116 | my $OP = $qr ? 'qr' : 'm'; |
| 117 | |
| 118 | $| = 1; |
| 119 | printf "1..%d\n# $iters iterations\n", scalar @tests; |
| 120 | |
| 121 | my $test; |
| 122 | TEST: |
| 123 | foreach (@tests) { |
| 124 | $test++; |
| 125 | if (!/\S/ || /^\s*#/ || /^__END__$/) { |
| 126 | chomp; |
| 127 | my ($not,$comment)= split /\s*#\s*/, $_, 2; |
| 128 | $comment ||= "(blank line)"; |
| 129 | print "ok $test # $comment\n"; |
| 130 | next; |
| 131 | } |
| 132 | chomp; |
| 133 | s/\\n/\n/g unless $regex_sets; |
| 134 | my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7); |
| 135 | if (!defined $subject) { |
| 136 | die "Bad test definition on line $test: $_\n"; |
| 137 | } |
| 138 | $reason = '' unless defined $reason; |
| 139 | my $input = join(':',$pat,$subject,$result,$repl,$expect); |
| 140 | |
| 141 | # the double '' below keeps simple syntax highlighters from going crazy |
| 142 | $pat = "'$pat'" unless $pat =~ /^[:''\/]/; |
| 143 | $pat =~ s/(\$\{\w+\})/$1/eeg; |
| 144 | $pat =~ s/\\n/\n/g unless $regex_sets; |
| 145 | $pat = convert_from_ascii($pat) if ord("A") != 65; |
| 146 | |
| 147 | $subject = convert_from_ascii($subject) if ord("A") != 65; |
| 148 | $subject = eval qq("$subject"); die $@ if $@; |
| 149 | |
| 150 | $expect = convert_from_ascii($expect) if ord("A") != 65; |
| 151 | $expect = eval qq("$expect"); die $@ if $@; |
| 152 | $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; |
| 153 | |
| 154 | my $todo_qr = $qr_embed_thr && ($result =~ s/t//); |
| 155 | my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); |
| 156 | ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; |
| 157 | if ($result =~ s/ ( [Ss] ) //x) { |
| 158 | if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { |
| 159 | $skip++; |
| 160 | $reason = "Test not valid for $0"; |
| 161 | } |
| 162 | } |
| 163 | if ($result =~ s/a// && ord("A") != 65) { |
| 164 | $skip++; |
| 165 | $reason = "Test is only valid for ASCII platforms. $reason"; |
| 166 | } |
| 167 | if ($result =~ s/e// && ord("A") != 193) { |
| 168 | $skip++; |
| 169 | $reason = "Test is only valid for EBCDIC platforms. $reason"; |
| 170 | } |
| 171 | $reason = 'skipping $&' if $reason eq '' && $skip_amp; |
| 172 | $result =~ s/B//i unless $skip; |
| 173 | my $todo= $result =~ s/T// ? " # TODO" : ""; |
| 174 | my $testname= $test; |
| 175 | if ($comment) { |
| 176 | $comment=~s/^\s*(?:#\s*)?//; |
| 177 | $testname .= " - $comment" if $comment; |
| 178 | } |
| 179 | if (! $skip && $regex_sets) { |
| 180 | |
| 181 | # If testing regex sets, change the [bracketed] classes into |
| 182 | # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a |
| 183 | # class. (We don't bother looking for an odd number of backslashes, |
| 184 | # as this hasn't been needed so far.) |
| 185 | if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) { |
| 186 | $skip++; |
| 187 | $reason = "Pattern doesn't contain [brackets]"; |
| 188 | } |
| 189 | else { # Use non-regex features of Perl to accomplish this. |
| 190 | my $modified = ""; |
| 191 | my $in_brackets = 0; |
| 192 | |
| 193 | # Go through the pattern character-by-character. We also add |
| 194 | # blanks around each token to test the /x parts of (?[ ]) |
| 195 | my $pat_len = length($pat); |
| 196 | CHAR: for (my $i = 0; $i < $pat_len; $i++) { |
| 197 | my $curchar = substr($pat, $i, 1); |
| 198 | if ($curchar eq '\\') { |
| 199 | $modified .= " " if $in_brackets; |
| 200 | $modified .= $curchar; |
| 201 | $i++; |
| 202 | |
| 203 | # Get the character the backslash is escaping |
| 204 | $curchar = substr($pat, $i, 1); |
| 205 | $modified .= $curchar; |
| 206 | |
| 207 | # If the character following that is a '{}', treat the |
| 208 | # entire amount as a single token |
| 209 | if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { |
| 210 | my $j = index($pat, '}', $i+2); |
| 211 | if ($j < 0) { |
| 212 | last unless $in_brackets; |
| 213 | if ($result eq 'c') { |
| 214 | $skip++; |
| 215 | $reason = "Can't handle compilation errors with unmatched '{'"; |
| 216 | } |
| 217 | else { |
| 218 | print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; |
| 219 | next TEST; |
| 220 | } |
| 221 | } |
| 222 | $modified .= substr($pat, $i+1, $j - $i); |
| 223 | $i = $j; |
| 224 | } |
| 225 | elsif ($curchar eq 'x') { |
| 226 | |
| 227 | # \x without brackets is supposed to be followed by 2 |
| 228 | # hex digits. Take up to 2, and then add a blank |
| 229 | # after the last one. This avoids getting errors from |
| 230 | # (?[ ]) for run-ons, like \xabc |
| 231 | my $j = $i + 1; |
| 232 | for (; $j < $i + 3 && $j < $pat_len; $j++) { |
| 233 | my $curord = ord(substr($pat, $j, 1)); |
| 234 | if (!(($curord >= ord("A") && $curord <= ord("F")) |
| 235 | || ($curord >= ord("a") && $curord <= ord("f")) |
| 236 | || ($curord >= ord("0") && $curord <= ord("9")))) |
| 237 | { |
| 238 | $j++; |
| 239 | last; |
| 240 | } |
| 241 | } |
| 242 | $j--; |
| 243 | $modified .= substr($pat, $i + 1, $j - $i) . " "; |
| 244 | $i = $j; |
| 245 | } |
| 246 | elsif (ord($curchar) >= ord('0') |
| 247 | && (ord($curchar) <= ord('7'))) |
| 248 | { |
| 249 | # Similarly, octal constants have up to 3 digits. |
| 250 | my $j = $i + 1; |
| 251 | for (; $j < $i + 3 && $j < $pat_len; $j++) { |
| 252 | my $curord = ord(substr($pat, $j, 1)); |
| 253 | if (! ($curord >= ord("0") && $curord <= ord("7"))) { |
| 254 | $j++; |
| 255 | last; |
| 256 | } |
| 257 | } |
| 258 | $j--; |
| 259 | $modified .= substr($pat, $i + 1, $j - $i); |
| 260 | $i = $j; |
| 261 | } |
| 262 | |
| 263 | next; |
| 264 | } # End of processing a backslash sequence |
| 265 | |
| 266 | if (! $in_brackets # Skip (?{ }) |
| 267 | && $curchar eq '(' |
| 268 | && $i < $pat_len - 2 |
| 269 | && substr($pat, $i+1, 1) eq '?' |
| 270 | && substr($pat, $i+2, 1) eq '{') |
| 271 | { |
| 272 | $skip++; |
| 273 | $reason = "Pattern contains '(?{'"; |
| 274 | last; |
| 275 | } |
| 276 | |
| 277 | # Closing ']' |
| 278 | if ($curchar eq ']' && $in_brackets) { |
| 279 | $modified .= " ] ])"; |
| 280 | $in_brackets = 0; |
| 281 | next; |
| 282 | } |
| 283 | |
| 284 | # A regular character. |
| 285 | if ($curchar ne '[') { |
| 286 | if (! $in_brackets) { |
| 287 | $modified .= $curchar; |
| 288 | } |
| 289 | else { |
| 290 | $modified .= " $curchar "; |
| 291 | } |
| 292 | next; |
| 293 | } |
| 294 | |
| 295 | # Here is a '['; If not in a bracketed class, treat as the |
| 296 | # beginning of one. |
| 297 | if (! $in_brackets) { |
| 298 | $in_brackets = 1; |
| 299 | $modified .= "(?[ [ "; |
| 300 | |
| 301 | # An immediately following ']' or '^]' is not the ending |
| 302 | # of the class, but is to be treated literally. |
| 303 | if ($i < $pat_len - 1 |
| 304 | && substr($pat, $i+1, 1) eq ']') |
| 305 | { |
| 306 | $i ++; |
| 307 | $modified .= " ] "; |
| 308 | } |
| 309 | elsif ($i < $pat_len - 2 |
| 310 | && substr($pat, $i+1, 1) eq '^' |
| 311 | && substr($pat, $i+2, 1) eq ']') |
| 312 | { |
| 313 | $i += 2; |
| 314 | $modified .= " ^ ] "; |
| 315 | } |
| 316 | next; |
| 317 | } |
| 318 | |
| 319 | # Here is a plain '[' within [ ]. Could mean wants to |
| 320 | # match a '[', or it could be a posix class that has a |
| 321 | # corresponding ']'. Absorb either |
| 322 | |
| 323 | $modified .= ' ['; |
| 324 | last if $i >= $pat_len - 1; |
| 325 | |
| 326 | $i++; |
| 327 | $curchar = substr($pat, $i, 1); |
| 328 | if ($curchar =~ /[:=.]/) { |
| 329 | for (my $j = $i + 1; $j < $pat_len; $j++) { |
| 330 | next unless substr($pat, $j, 1) eq ']'; |
| 331 | last if $j - $i < 2; |
| 332 | if (substr($pat, $j - 1, 1) eq $curchar) { |
| 333 | # Here, is a posix class |
| 334 | $modified .= substr($pat, $i, $j - $i + 1) . " "; |
| 335 | $i = $j; |
| 336 | next CHAR; |
| 337 | } |
| 338 | } |
| 339 | } |
| 340 | |
| 341 | # Here wasn't a posix class, just process normally |
| 342 | $modified .= " $curchar "; |
| 343 | } |
| 344 | |
| 345 | if ($in_brackets && ! $skip) { |
| 346 | if ($result eq 'c') { |
| 347 | $skip++; |
| 348 | $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; |
| 349 | } |
| 350 | else { |
| 351 | print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; |
| 352 | next TEST; |
| 353 | } |
| 354 | } |
| 355 | |
| 356 | # Use our modified pattern instead of the original |
| 357 | $pat = $modified; |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | for my $study ('', 'study $subject', 'utf8::upgrade($subject)', |
| 362 | 'utf8::upgrade($subject); study $subject') { |
| 363 | # Need to make a copy, else the utf8::upgrade of an already studied |
| 364 | # scalar confuses things. |
| 365 | my $subject = $subject; |
| 366 | my $c = $iters; |
| 367 | my ($code, $match, $got); |
| 368 | if ($repl eq 'pos') { |
| 369 | $code= <<EOFCODE; |
| 370 | $study; |
| 371 | pos(\$subject)=0; |
| 372 | \$match = ( \$subject =~ m${pat}g ); |
| 373 | \$got = pos(\$subject); |
| 374 | EOFCODE |
| 375 | } |
| 376 | elsif ($qr_embed) { |
| 377 | $code= <<EOFCODE; |
| 378 | my \$RE = qr$pat; |
| 379 | $study; |
| 380 | \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; |
| 381 | \$got = "$repl"; |
| 382 | EOFCODE |
| 383 | } |
| 384 | elsif ($qr_embed_thr) { |
| 385 | $code= <<EOFCODE; |
| 386 | # Can't run the match in a subthread, but can do this and |
| 387 | # clone the pattern the other way. |
| 388 | my \$RE = threads->new(sub {qr$pat})->join(); |
| 389 | $study; |
| 390 | \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; |
| 391 | \$got = "$repl"; |
| 392 | EOFCODE |
| 393 | } |
| 394 | else { |
| 395 | $code= <<EOFCODE; |
| 396 | $study; |
| 397 | \$match = (\$subject =~ $OP$pat) while \$c--; |
| 398 | \$got = "$repl"; |
| 399 | EOFCODE |
| 400 | } |
| 401 | $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; |
| 402 | #$code.=qq[\n\$expect="$expect";\n]; |
| 403 | #use Devel::Peek; |
| 404 | #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; |
| 405 | { |
| 406 | # Probably we should annotate specific tests with which warnings |
| 407 | # categories they're known to trigger, and hence should be |
| 408 | # disabled just for that test |
| 409 | no warnings qw(uninitialized regexp deprecated); |
| 410 | eval $code; |
| 411 | } |
| 412 | chomp( my $err = $@ ); |
| 413 | if ( $skip ) { |
| 414 | print "ok $testname # skipped", length($reason) ? ". $reason" : '', "\n"; |
| 415 | next TEST; |
| 416 | } |
| 417 | elsif ($result eq 'c') { |
| 418 | if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST } |
| 419 | last; # no need to study a syntax error |
| 420 | } |
| 421 | elsif ( $todo_qr ) { |
| 422 | print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n"; |
| 423 | next TEST; |
| 424 | } |
| 425 | elsif ($@) { |
| 426 | print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; |
| 427 | } |
| 428 | elsif ($result =~ /^n/) { |
| 429 | if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST } |
| 430 | } |
| 431 | else { |
| 432 | if (!$match || $got ne $expect) { |
| 433 | eval { require Data::Dumper }; |
| 434 | no warnings "utf8"; # But handle should be utf8 |
| 435 | if ($@ || !defined &DynaLoader::boot_DynaLoader) { |
| 436 | # Data::Dumper will load on miniperl, but fail when used in |
| 437 | # anger as it tries to load B. I'd prefer to keep the |
| 438 | # regular calls below outside of an eval so that real |
| 439 | # (unknown) failures get spotted, not ignored. |
| 440 | print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); |
| 441 | } |
| 442 | else { # better diagnostics |
| 443 | my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; |
| 444 | my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; |
| 445 | my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump; |
| 446 | print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n"); |
| 447 | } |
| 448 | next TEST; |
| 449 | } |
| 450 | } |
| 451 | } |
| 452 | print "ok $testname$todo\n"; |
| 453 | } |
| 454 | |
| 455 | 1; |