| 1 | #!./perl |
| 2 | |
| 3 | # The tests are in a separate file 't/re/re_tests'. |
| 4 | # Each line in that file is a separate test. |
| 5 | # There are five columns, separated by tabs. |
| 6 | # |
| 7 | # Column 1 contains the pattern, optionally enclosed in C<''>. |
| 8 | # Modifiers can be put after the closing C<'>. |
| 9 | # |
| 10 | # Column 2 contains the string to be matched. |
| 11 | # |
| 12 | # Column 3 contains the expected result: |
| 13 | # y expect a match |
| 14 | # n expect no match |
| 15 | # c expect an error |
| 16 | # T the test is a TODO (can be combined with y/n/c) |
| 17 | # M skip test on miniperl (combine with y/n/c/T) |
| 18 | # B test exposes a known bug in Perl, should be skipped |
| 19 | # b test exposes a known bug in Perl, should be skipped if noamp |
| 20 | # t test exposes a bug with threading, TODO if qr_embed_thr |
| 21 | # s test should only be run for regex_sets_compat.t |
| 22 | # S test should not be run for regex_sets_compat.t |
| 23 | # |
| 24 | # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. |
| 25 | # |
| 26 | # Column 4 contains a string, usually C<$&>. |
| 27 | # |
| 28 | # Column 5 contains the expected result of double-quote |
| 29 | # interpolating that string after the match, or start of error message. |
| 30 | # |
| 31 | # Column 6, if present, contains a reason why the test is skipped. |
| 32 | # This is printed with "skipped", for harness to pick up. |
| 33 | # |
| 34 | # \n in the tests are interpolated, as are variables of the form ${\w+}. |
| 35 | # |
| 36 | # Blanks lines are treated as PASSING tests to keep the line numbers |
| 37 | # linked to the test number. |
| 38 | # |
| 39 | # If you want to add a regular expression test that can't be expressed |
| 40 | # in this format, don't add it here: put it in re/pat.t instead. |
| 41 | # |
| 42 | # Note that the inputs get passed on as "m're'", so the re bypasses the lexer. |
| 43 | # This means this file cannot be used for testing anything that the lexer |
| 44 | # handles; in 5.12 this means just \N{NAME} and \N{U+...}. |
| 45 | # |
| 46 | # Note that columns 2,3 and 5 are all enclosed in double quotes and then |
| 47 | # evalled; so something like a\"\x{100}$1 has length 3+length($1). |
| 48 | |
| 49 | my ($file, $iters); |
| 50 | BEGIN { |
| 51 | $iters = shift || 1; # Poor man performance suite, 10000 is OK. |
| 52 | |
| 53 | # Do this open before any chdir |
| 54 | $file = shift; |
| 55 | if (defined $file) { |
| 56 | open TESTS, $file or die "Can't open $file"; |
| 57 | } |
| 58 | |
| 59 | chdir 't' if -d 't'; |
| 60 | @INC = '../lib'; |
| 61 | |
| 62 | } |
| 63 | |
| 64 | sub _comment { |
| 65 | return map { /^#/ ? "$_\n" : "# $_\n" } |
| 66 | map { split /\n/ } @_; |
| 67 | } |
| 68 | |
| 69 | use strict; |
| 70 | use warnings FATAL=>"all"; |
| 71 | use vars qw($bang $ffff $nulnul); # used by the tests |
| 72 | use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers |
| 73 | |
| 74 | |
| 75 | |
| 76 | if (!defined $file) { |
| 77 | open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; |
| 78 | } |
| 79 | |
| 80 | my @tests = <TESTS>; |
| 81 | |
| 82 | close TESTS; |
| 83 | |
| 84 | $bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. |
| 85 | $ffff = chr(0xff) x 2; |
| 86 | $nulnul = "\0" x 2; |
| 87 | my $OP = $qr ? 'qr' : 'm'; |
| 88 | |
| 89 | $| = 1; |
| 90 | printf "1..%d\n# $iters iterations\n", scalar @tests; |
| 91 | |
| 92 | my $test; |
| 93 | TEST: |
| 94 | foreach (@tests) { |
| 95 | $test++; |
| 96 | if (!/\S/ || /^\s*#/ || /^__END__$/) { |
| 97 | print "ok $test # (Blank line or comment)\n"; |
| 98 | if (/#/) { print $_ }; |
| 99 | next; |
| 100 | } |
| 101 | chomp; |
| 102 | s/\\n/\n/g unless $regex_sets; |
| 103 | my ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6); |
| 104 | if (!defined $subject) { |
| 105 | die "Bad test definition on line $test: $_\n"; |
| 106 | } |
| 107 | $reason = '' unless defined $reason; |
| 108 | my $input = join(':',$pat,$subject,$result,$repl,$expect); |
| 109 | # the double '' below keeps simple syntax highlighters from going crazy |
| 110 | $pat = "'$pat'" unless $pat =~ /^[:''\/]/; |
| 111 | $pat =~ s/(\$\{\w+\})/$1/eeg; |
| 112 | $pat =~ s/\\n/\n/g unless $regex_sets; |
| 113 | $subject = eval qq("$subject"); die $@ if $@; |
| 114 | $expect = eval qq("$expect"); die $@ if $@; |
| 115 | $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; |
| 116 | my $todo_qr = $qr_embed_thr && ($result =~ s/t//); |
| 117 | my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); |
| 118 | ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; |
| 119 | if ($result =~ s/ ( [Ss] ) //x) { |
| 120 | if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { |
| 121 | $skip++; |
| 122 | $reason = "Test not valid for $0"; |
| 123 | } |
| 124 | } |
| 125 | $reason = 'skipping $&' if $reason eq '' && $skip_amp; |
| 126 | $result =~ s/B//i unless $skip; |
| 127 | my $todo= $result =~ s/T// ? " # TODO" : ""; |
| 128 | if (! $skip && $regex_sets) { |
| 129 | |
| 130 | # If testing regex sets, change the [bracketed] classes into |
| 131 | # (?[bracketed]). |
| 132 | |
| 133 | if ($pat !~ / \[ /x) { |
| 134 | |
| 135 | $skip++; |
| 136 | $reason = "Pattern doesn't contain [brackets]"; |
| 137 | } |
| 138 | else { # Use non-regex features of Perl to accomplish this. |
| 139 | my $modified = ""; |
| 140 | my $in_brackets = 0; |
| 141 | |
| 142 | # Go through the pattern character-by-character. We also add |
| 143 | # blanks around each token to test the /x parts of (?[ ]) |
| 144 | my $pat_len = length($pat); |
| 145 | CHAR: for (my $i = 0; $i < $pat_len; $i++) { |
| 146 | my $curchar = substr($pat, $i, 1); |
| 147 | if ($curchar eq '\\') { |
| 148 | $modified .= " " if $in_brackets; |
| 149 | $modified .= $curchar; |
| 150 | $i++; |
| 151 | |
| 152 | # Get the character the backslash is escaping |
| 153 | $curchar = substr($pat, $i, 1); |
| 154 | $modified .= $curchar; |
| 155 | |
| 156 | # If the character following that is a '{}', treat the |
| 157 | # entire amount as a single token |
| 158 | if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { |
| 159 | my $j = index($pat, '}', $i+2); |
| 160 | if ($j < 0) { |
| 161 | last unless $in_brackets; |
| 162 | if ($result eq 'c') { |
| 163 | $skip++; |
| 164 | $reason = "Can't handle compilation errors with unmatched '{'"; |
| 165 | } |
| 166 | else { |
| 167 | print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n"; |
| 168 | next TEST; |
| 169 | } |
| 170 | } |
| 171 | $modified .= substr($pat, $i+1, $j - $i); |
| 172 | $i = $j; |
| 173 | } |
| 174 | elsif ($curchar eq 'x') { |
| 175 | |
| 176 | # \x without brackets is supposed to be followed by 2 |
| 177 | # hex digits. Take up to 2, and then add a blank |
| 178 | # after the last one. This avoids getting errors from |
| 179 | # (?[ ]) for run-ons, like \xabc |
| 180 | my $j = $i + 1; |
| 181 | for (; $j < $i + 3 && $j < $pat_len; $j++) { |
| 182 | my $curord = ord(substr($pat, $j, 1)); |
| 183 | if (!(($curord >= ord("A") && $curord <= ord("F")) |
| 184 | || ($curord >= ord("a") && $curord <= ord("f")) |
| 185 | || ($curord >= ord("0") && $curord <= ord("9")))) |
| 186 | { |
| 187 | $j++; |
| 188 | last; |
| 189 | } |
| 190 | } |
| 191 | $j--; |
| 192 | $modified .= substr($pat, $i + 1, $j - $i) . " "; |
| 193 | $i = $j; |
| 194 | } |
| 195 | elsif (ord($curchar) >= ord('0') |
| 196 | && (ord($curchar) <= ord('7'))) |
| 197 | { |
| 198 | # Similarly, octal constants have up to 3 digits. |
| 199 | my $j = $i + 1; |
| 200 | for (; $j < $i + 3 && $j < $pat_len; $j++) { |
| 201 | my $curord = ord(substr($pat, $j, 1)); |
| 202 | if (! ($curord >= ord("0") && $curord <= ord("7"))) { |
| 203 | $j++; |
| 204 | last; |
| 205 | } |
| 206 | } |
| 207 | $j--; |
| 208 | $modified .= substr($pat, $i + 1, $j - $i); |
| 209 | $i = $j; |
| 210 | } |
| 211 | |
| 212 | next; |
| 213 | } # End of processing a backslash sequence |
| 214 | |
| 215 | if (! $in_brackets # Skip (?{ }) |
| 216 | && $curchar eq '(' |
| 217 | && $i < $pat_len - 2 |
| 218 | && substr($pat, $i+1, 1) eq '?' |
| 219 | && substr($pat, $i+2, 1) eq '{') |
| 220 | { |
| 221 | $skip++; |
| 222 | $reason = "Pattern contains '(?{'"; |
| 223 | last; |
| 224 | } |
| 225 | |
| 226 | # Closing ']' |
| 227 | if ($curchar eq ']' && $in_brackets) { |
| 228 | $modified .= " ] ])"; |
| 229 | $in_brackets = 0; |
| 230 | next; |
| 231 | } |
| 232 | |
| 233 | # A regular character. |
| 234 | if ($curchar ne '[') { |
| 235 | if (! $in_brackets) { |
| 236 | $modified .= $curchar; |
| 237 | } |
| 238 | else { |
| 239 | $modified .= " $curchar "; |
| 240 | } |
| 241 | next; |
| 242 | } |
| 243 | |
| 244 | # Here is a '['; If not in a bracketed class, treat as the |
| 245 | # beginning of one. |
| 246 | if (! $in_brackets) { |
| 247 | $in_brackets = 1; |
| 248 | $modified .= "(?[ [ "; |
| 249 | |
| 250 | # An immediately following ']' or '^]' is not the ending |
| 251 | # of the class, but is to be treated literally. |
| 252 | if ($i < $pat_len - 1 |
| 253 | && substr($pat, $i+1, 1) eq ']') |
| 254 | { |
| 255 | $i ++; |
| 256 | $modified .= " ] "; |
| 257 | } |
| 258 | elsif ($i < $pat_len - 2 |
| 259 | && substr($pat, $i+1, 1) eq '^' |
| 260 | && substr($pat, $i+2, 1) eq ']') |
| 261 | { |
| 262 | $i += 2; |
| 263 | $modified .= " ^ ] "; |
| 264 | } |
| 265 | next; |
| 266 | } |
| 267 | |
| 268 | # Here is a plain '[' within [ ]. Could mean wants to |
| 269 | # match a '[', or it could be a posix class that has a |
| 270 | # corresponding ']'. Absorb either |
| 271 | |
| 272 | $modified .= ' ['; |
| 273 | last if $i >= $pat_len - 1; |
| 274 | |
| 275 | $i++; |
| 276 | $curchar = substr($pat, $i, 1); |
| 277 | if ($curchar =~ /[:=.]/) { |
| 278 | for (my $j = $i + 1; $j < $pat_len; $j++) { |
| 279 | next unless substr($pat, $j, 1) eq ']'; |
| 280 | last if $j - $i < 2; |
| 281 | if (substr($pat, $j - 1, 1) eq $curchar) { |
| 282 | # Here, is a posix class |
| 283 | $modified .= substr($pat, $i, $j - $i + 1) . " "; |
| 284 | $i = $j; |
| 285 | next CHAR; |
| 286 | } |
| 287 | } |
| 288 | } |
| 289 | |
| 290 | # Here wasn't a posix class, just process normally |
| 291 | $modified .= " $curchar "; |
| 292 | } |
| 293 | |
| 294 | if ($in_brackets && ! $skip) { |
| 295 | if ($result eq 'c') { |
| 296 | $skip++; |
| 297 | $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; |
| 298 | } |
| 299 | else { |
| 300 | print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n"; |
| 301 | next TEST; |
| 302 | } |
| 303 | } |
| 304 | |
| 305 | # Use our modified pattern instead of the original |
| 306 | $pat = $modified; |
| 307 | } |
| 308 | } |
| 309 | |
| 310 | for my $study ('', 'study $subject', 'utf8::upgrade($subject)', |
| 311 | 'utf8::upgrade($subject); study $subject') { |
| 312 | # Need to make a copy, else the utf8::upgrade of an already studied |
| 313 | # scalar confuses things. |
| 314 | my $subject = $subject; |
| 315 | my $c = $iters; |
| 316 | my ($code, $match, $got); |
| 317 | if ($repl eq 'pos') { |
| 318 | $code= <<EOFCODE; |
| 319 | $study; |
| 320 | pos(\$subject)=0; |
| 321 | \$match = ( \$subject =~ m${pat}g ); |
| 322 | \$got = pos(\$subject); |
| 323 | EOFCODE |
| 324 | } |
| 325 | elsif ($qr_embed) { |
| 326 | $code= <<EOFCODE; |
| 327 | my \$RE = qr$pat; |
| 328 | $study; |
| 329 | \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; |
| 330 | \$got = "$repl"; |
| 331 | EOFCODE |
| 332 | } |
| 333 | elsif ($qr_embed_thr) { |
| 334 | $code= <<EOFCODE; |
| 335 | # Can't run the match in a subthread, but can do this and |
| 336 | # clone the pattern the other way. |
| 337 | my \$RE = threads->new(sub {qr$pat})->join(); |
| 338 | $study; |
| 339 | \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; |
| 340 | \$got = "$repl"; |
| 341 | EOFCODE |
| 342 | } |
| 343 | else { |
| 344 | $code= <<EOFCODE; |
| 345 | $study; |
| 346 | \$match = (\$subject =~ $OP$pat) while \$c--; |
| 347 | \$got = "$repl"; |
| 348 | EOFCODE |
| 349 | } |
| 350 | $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; |
| 351 | #$code.=qq[\n\$expect="$expect";\n]; |
| 352 | #use Devel::Peek; |
| 353 | #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; |
| 354 | { |
| 355 | # Probably we should annotate specific tests with which warnings |
| 356 | # categories they're known to trigger, and hence should be |
| 357 | # disabled just for that test |
| 358 | no warnings qw(uninitialized regexp); |
| 359 | eval $code; |
| 360 | } |
| 361 | chomp( my $err = $@ ); |
| 362 | if ( $skip ) { |
| 363 | print "ok $test # skipped", length($reason) ? ". $reason" : '', "\n"; |
| 364 | next TEST; |
| 365 | } |
| 366 | elsif ($result eq 'c') { |
| 367 | if ($err !~ m!^\Q$expect!) { print "not ok $test$todo (compile) $input => '$err'\n"; next TEST } |
| 368 | last; # no need to study a syntax error |
| 369 | } |
| 370 | elsif ( $todo_qr ) { |
| 371 | print "not ok $test # TODO", length($reason) ? " - $reason" : '', "\n"; |
| 372 | next TEST; |
| 373 | } |
| 374 | elsif ($@) { |
| 375 | print "not ok $test$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; |
| 376 | } |
| 377 | elsif ($result =~ /^n/) { |
| 378 | if ($match) { print "not ok $test$todo ($study) $input => false positive\n"; next TEST } |
| 379 | } |
| 380 | else { |
| 381 | if (!$match || $got ne $expect) { |
| 382 | eval { require Data::Dumper }; |
| 383 | no warnings "utf8"; # But handle should be utf8 |
| 384 | if ($@ || !defined &DynaLoader::boot_DynaLoader) { |
| 385 | # Data::Dumper will load on miniperl, but fail when used in |
| 386 | # anger as it tries to load B. I'd prefer to keep the |
| 387 | # regular calls below outside of an eval so that real |
| 388 | # (unknown) failures get spotted, not ignored. |
| 389 | print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); |
| 390 | } |
| 391 | else { # better diagnostics |
| 392 | my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; |
| 393 | my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; |
| 394 | print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n"); |
| 395 | } |
| 396 | next TEST; |
| 397 | } |
| 398 | } |
| 399 | } |
| 400 | print "ok $test$todo\n"; |
| 401 | } |
| 402 | |
| 403 | 1; |