t/re/regexp.t

   1 #!./perl
   2
   3 # The tests are in a separate file 't/re/re_tests'.
   4 # Each line in that file is a separate test.
   5 # There are five columns, separated by tabs.
   6 # An optional sixth column is used to give a reason, only when skipping tests
   7 #
   8 # Column 1 contains the pattern, optionally enclosed in C<''> C<::> or
   9 # C<//>.  Modifiers can be put after the closing delimiter.  C<''> will
  10 # automatically be added to any other patterns.
  11 #
  12 # Column 2 contains the string to be matched.
  13 #
  14 # Column 3 contains the expected result:
  15 #       y       expect a match
  16 #       n       expect no match
  17 #       c       expect an error
  18 #       T       the test is a TODO (can be combined with y/n/c)
  19 #       M       skip test on miniperl (combine with y/n/c/T)
  20 #       B       test exposes a known bug in Perl, should be skipped
  21 #       b       test exposes a known bug in Perl, should be skipped if noamp
  22 #       t       test exposes a bug with threading, TODO if qr_embed_thr
  23 #       s       test should only be run for regex_sets_compat.t
  24 #       S       test should not be run for regex_sets_compat.t
  25 #       a       test should only be run on ASCII platforms
  26 #       e       test should only be run on EBCDIC platforms
  27 #
  28 # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
  29 #
  30 # Column 4 contains a string, usually C<$&>.
  31 #
  32 # Column 5 contains the expected result of double-quote
  33 # interpolating that string after the match, or start of error message.
  34 #
  35 # Column 6, if present, contains a reason why the test is skipped.
  36 # This is printed with "skipped", for harness to pick up.
  37 #
  38 # Column 7 can be used for comments
  39 #
  40 # \n in the tests are interpolated, as are variables of the form ${\w+}.
  41 #
  42 # Blanks lines are treated as PASSING tests to keep the line numbers
  43 # linked to the test number.
  44 #
  45 # If you want to add a regular expression test that can't be expressed
  46 # in this format, don't add it here: put it in re/pat.t instead.
  47 #
  48 # Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
  49 # This means this file cannot be used for testing anything that the lexer
  50 # handles; in 5.12 this means just \N{NAME} and \N{U+...}.
  51 #
  52 # Note that columns 2,3 and 5 are all enclosed in double quotes and then
  53 # evalled; so something like a\"\x{100}$1 has length 3+length($1).
  54 #
  55 # \x... and \o{...} constants are automatically converted to the native
  56 # character set if necessary.  \[0-7] constants aren't
  57
  58 my ($file, $iters);
  59 BEGIN {
  60     $iters = shift || 1;        # Poor man performance suite, 10000 is OK.
  61
  62     # Do this open before any chdir
  63     $file = shift;
  64     if (defined $file) {
  65         open TESTS, $file or die "Can't open $file";
  66     }
  67
  68     chdir 't' if -d 't';
  69     @INC = qw '../lib ../ext/re';
  70     if (!defined &DynaLoader::boot_DynaLoader) { # miniperl
  71         print("1..0 # Skip Unicode tables not built yet\n"), exit
  72             unless eval 'require "unicore/Heavy.pl"';
  73     }
  74
  75     # Some of the tests need a locale; which one doesn't much matter, except
  76     # that it be valid.  Make sure of that
  77     eval { require POSIX;
  78             POSIX->import(qw(LC_ALL setlocale));
  79             POSIX::setlocale(&LC_ALL, "C");
  80     };
  81 }
  82
  83 sub _comment {
  84     return map { /^#/ ? "$_\n" : "# $_\n" }
  85            map { split /\n/ } @_;
  86 }
  87
  88 sub convert_from_ascii {
  89     my $string = shift;
  90
  91     #my $save = $string;
  92     # Convert \x{...}, \o{...}
  93     $string =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) .  "}" /gex;
  94     $string =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) .  "}" /gex;
  95
  96     # Convert \xAB
  97     $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex;
  98
  99     # Convert \xA
 100     $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex;
 101
 102     #print STDERR __LINE__, ": $save\n$string\n" if $save ne $string;
 103     return $string;
 104 }
 105
 106 use strict;
 107 use warnings FATAL=>"all";
 108 no warnings 'experimental::vlb';
 109 our ($bang, $ffff, $nulnul); # used by the tests
 110 our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers
 111
 112 my $expanded_text = "expanded name from original test number";
 113 my $expanded_text_re = qr/$expanded_text/;
 114
 115 if (!defined $file) {
 116     open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
 117 }
 118
 119 my @tests = <TESTS>;
 120
 121 close TESTS;
 122
 123 $bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
 124 $ffff  = chr(0xff) x 2;
 125 $nulnul = "\0" x 2;
 126 my $OP = $qr ? 'qr' : 'm';
 127
 128 $| = 1;
 129
 130 my $test;
 131 TEST:
 132 foreach (@tests) {
 133     $test++;
 134     if (!/\S/ || /^\s*#/ || /^__END__$/) {
 135         chomp;
 136         my ($not,$comment)= split /\s*#\s*/, $_, 2;
 137         $comment ||= "(blank line)";
 138         print "ok $test # $comment\n";
 139         next;
 140     }
 141     chomp;
 142     s/\\n/\n/g unless $regex_sets;
 143     my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7);
 144     $comment = "" unless defined $comment;
 145     if (!defined $subject) {
 146         die "Bad test definition on line $test: $_\n";
 147     }
 148     $reason = '' unless defined $reason;
 149     my $input = join(':',$pat,$subject,$result,$repl,$expect);
 150
 151     # the double '' below keeps simple syntax highlighters from going crazy
 152     $pat = "'$pat'" unless $pat =~ /^[:''\/]/;
 153     $pat =~ s/(\$\{\w+\})/$1/eeg;
 154     $pat =~ s/\\n/\n/g unless $regex_sets;
 155     $pat = convert_from_ascii($pat) if ord("A") != 65;
 156
 157     my $no_null_pat;
 158     if ($no_null && $pat =~ /^'(.*)'\z/) {
 159        $no_null_pat = XS::APItest::string_without_null($1);
 160     }
 161
 162     $subject = convert_from_ascii($subject) if ord("A") != 65;
 163     $subject = eval qq("$subject"); die $@ if $@;
 164
 165     $expect = convert_from_ascii($expect) if ord("A") != 65;
 166     $expect  = eval qq("$expect"); die $@ if $@;
 167     $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;
 168
 169     my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
 170     my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
 171     ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
 172     if ($result =~ s/ ( [Ss] ) //x) {
 173         if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
 174             $skip++;
 175             $reason = "Test not valid for $0";
 176         }
 177     }
 178     if ($result =~ s/a// && ord("A") != 65) {
 179         $skip++;
 180         $reason = "Test is only valid for ASCII platforms.  $reason";
 181     }
 182     if ($result =~ s/e// && ord("A") != 193) {
 183         $skip++;
 184         $reason = "Test is only valid for EBCDIC platforms.  $reason";
 185     }
 186     $reason = 'skipping $&' if $reason eq  '' && $skip_amp;
 187     $result =~ s/B//i unless $skip;
 188     my $todo= $result =~ s/T// ? " # TODO" : "";
 189     my $testname= $test;
 190     if ($comment) {
 191         $comment=~s/^\s*(?:#\s*)?//;
 192         $testname .= " - $comment" if $comment;
 193     }
 194     if (! $skip && $alpha_assertions) {
 195         my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x;
 196         if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) {
 197             $skip++;
 198             $reason = "Pattern doesn't contain assertions";
 199         }
 200         elsif ($comment !~ $expanded_text_re) {
 201             my $expanded_pat = $pat;
 202
 203             $pat =~ s/\( \? > /(*atomic:/xg;
 204
 205             if ($pat =~ s/\( \? = /(*pla:/xg) {
 206                 $expanded_pat =~ s//(*positive_lookahead:/g;
 207             }
 208             if ($pat =~ s/\( \? ! /(*nla:/xg) {
 209                 $expanded_pat =~ s//(*negative_lookahead:/g;
 210             }
 211             if ($pat =~ s/\( \? <= /(*plb:/xg) {
 212                 $expanded_pat =~ s//(*positive_lookbehind:/g;
 213             }
 214             if ($pat =~ s/\( \? <! /(*nlb:/xg) {
 215                 $expanded_pat =~ s//(*negative_lookbehind:/g;
 216             }
 217             if ($expanded_pat ne $pat) {
 218                 $comment .= " $expanded_text $test";
 219                 push @tests, join "\t", $expanded_pat,
 220                                         $subject // "",
 221                                         $result // "",
 222                                         $repl // "",
 223                                         $expect // "",
 224                                         $reason // "",
 225                                         $comment;
 226             }
 227         }
 228     }
 229     elsif (! $skip && $regex_sets) {
 230
 231         # If testing regex sets, change the [bracketed] classes into
 232         # (?[bracketed]).  But note that '\[' and '\c[' don't introduce such a
 233         # class.  (We don't bother looking for an odd number of backslashes,
 234         # as this hasn't been needed so far.)
 235         if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) {
 236             $skip++;
 237             $reason = "Pattern doesn't contain [brackets]";
 238         }
 239         else { # Use non-regex features of Perl to accomplish this.
 240             my $modified = "";
 241             my $in_brackets = 0;
 242
 243             # Go through the pattern character-by-character.  We also add
 244             # blanks around each token to test the /x parts of (?[ ])
 245             my $pat_len = length($pat);
 246       CHAR: for (my $i = 0; $i < $pat_len; $i++) {
 247                 my $curchar = substr($pat, $i, 1);
 248                 if ($curchar eq '\\') {
 249                     $modified .= " " if $in_brackets;
 250                     $modified .= $curchar;
 251                     $i++;
 252
 253                     # Get the character the backslash is escaping
 254                     $curchar = substr($pat, $i, 1);
 255                     $modified .= $curchar;
 256
 257                     # If the character following that is a '{}', treat the
 258                     # entire amount as a single token
 259                     if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
 260                         my $j = index($pat, '}', $i+2);
 261                         if ($j < 0) {
 262                             last unless $in_brackets;
 263                             if ($result eq 'c') {
 264                                 $skip++;
 265                                 $reason = "Can't handle compilation errors with unmatched '{'";
 266                             }
 267                             else {
 268                                 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
 269                                 next TEST;
 270                             }
 271                         }
 272                         $modified .= substr($pat, $i+1, $j - $i);
 273                         $i = $j;
 274                     }
 275                     elsif ($curchar eq 'x') {
 276
 277                         # \x without brackets is supposed to be followed by 2
 278                         # hex digits.  Take up to 2, and then add a blank
 279                         # after the last one.  This avoids getting errors from
 280                         # (?[ ]) for run-ons, like \xabc
 281                         my $j = $i + 1;
 282                         for (; $j < $i + 3 && $j < $pat_len; $j++) {
 283                             my $curord = ord(substr($pat, $j, 1));
 284                             if (!(($curord >= ord("A") && $curord <= ord("F"))
 285                                  || ($curord >= ord("a") && $curord <= ord("f"))
 286                                  || ($curord >= ord("0") && $curord <= ord("9"))))
 287                             {
 288                                 $j++;
 289                                 last;
 290                             }
 291                         }
 292                         $j--;
 293                         $modified .= substr($pat, $i + 1, $j - $i);
 294                         $modified .= " " if $in_brackets;
 295                         $i = $j;
 296                     }
 297                     elsif (ord($curchar) >= ord('0')
 298                            && (ord($curchar) <= ord('7')))
 299                     {
 300                         # Similarly, octal constants have up to 3 digits.
 301                         my $j = $i + 1;
 302                         for (; $j < $i + 3 && $j < $pat_len; $j++) {
 303                             my $curord = ord(substr($pat, $j, 1));
 304                             if (! ($curord >= ord("0") &&  $curord <= ord("7"))) {
 305                                 $j++;
 306                                 last;
 307                             }
 308                         }
 309                         $j--;
 310                         $modified .= substr($pat, $i + 1, $j - $i);
 311                         $i = $j;
 312                     }
 313
 314                     next;
 315                 } # End of processing a backslash sequence
 316
 317                 if (! $in_brackets  # Skip (?{ })
 318                     && $curchar eq '('
 319                     && $i < $pat_len - 2
 320                     && substr($pat, $i+1, 1) eq '?'
 321                     && substr($pat, $i+2, 1) eq '{')
 322                 {
 323                     $skip++;
 324                     $reason = "Pattern contains '(?{'";
 325                     last;
 326                 }
 327
 328                 # Closing ']'
 329                 if ($curchar eq ']' && $in_brackets) {
 330                     $modified .= " ] ])";
 331                     $in_brackets = 0;
 332                     next;
 333                 }
 334
 335                 # A regular character.
 336                 if ($curchar ne '[') {
 337                     $modified .= " " if  $in_brackets;
 338                     $modified .= $curchar;
 339                     next;
 340                 }
 341
 342                 # Here is a '['; If not in a bracketed class, treat as the
 343                 # beginning of one.
 344                 if (! $in_brackets) {
 345                     $in_brackets = 1;
 346                     $modified .= "(?[ [ ";
 347
 348                     # An immediately following ']' or '^]' is not the ending
 349                     # of the class, but is to be treated literally.
 350                     if ($i < $pat_len - 1
 351                         && substr($pat, $i+1, 1) eq ']')
 352                     {
 353                         $i ++;
 354                         $modified .= " ] ";
 355                     }
 356                     elsif ($i < $pat_len - 2
 357                             && substr($pat, $i+1, 1) eq '^'
 358                             && substr($pat, $i+2, 1) eq ']')
 359                     {
 360                         $i += 2;
 361                         $modified .= " ^ ] ";
 362                     }
 363                     next;
 364                 }
 365
 366                 # Here is a plain '[' within [ ].  Could mean wants to
 367                 # match a '[', or it could be a posix class that has a
 368                 # corresponding ']'.  Absorb either
 369
 370                 $modified .= ' [';
 371                 last if $i >= $pat_len - 1;
 372
 373                 $i++;
 374                 $curchar = substr($pat, $i, 1);
 375                 if ($curchar =~ /[:=.]/) {
 376                     for (my $j = $i + 1; $j < $pat_len; $j++) {
 377                         next unless substr($pat, $j, 1) eq ']';
 378                         last if $j - $i < 2;
 379                         if (substr($pat, $j - 1, 1) eq $curchar) {
 380                             # Here, is a posix class
 381                             $modified .= substr($pat, $i, $j - $i + 1) . " ";
 382                             $i = $j;
 383                             next CHAR;
 384                         }
 385                     }
 386                 }
 387
 388                 # Here wasn't a posix class, just process normally
 389                 $modified .= " $curchar ";
 390             }
 391
 392             if ($in_brackets && ! $skip) {
 393                 if ($result eq 'c') {
 394                     $skip++;
 395                     $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
 396                 }
 397                 else {
 398                     print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
 399                     next TEST;
 400                 }
 401             }
 402
 403             # Use our modified pattern instead of the original
 404             $pat = $modified;
 405         }
 406     }
 407
 408     for my $study ('', 'study $subject;', 'utf8::upgrade($subject);',
 409                    'utf8::upgrade($subject); study $subject;') {
 410         # Need to make a copy, else the utf8::upgrade of an already studied
 411         # scalar confuses things.
 412         my $subject = $subject;
 413         $subject = XS::APItest::string_without_null($subject) if $no_null;
 414         my $c = $iters;
 415         my ($code, $match, $got);
 416         if ($repl eq 'pos') {
 417             my $patcode = defined $no_null_pat ? '/$no_null_pat/g'
 418                                                : "m${pat}g";
 419             $code= <<EOFCODE;
 420                 $study
 421                 pos(\$subject)=0;
 422                 \$match = ( \$subject =~ $patcode );
 423                 \$got = pos(\$subject);
 424 EOFCODE
 425         }
 426         elsif ($qr_embed) {
 427             $code= <<EOFCODE;
 428                 my \$RE = qr$pat;
 429                 $study
 430                 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
 431                 \$got = "$repl";
 432 EOFCODE
 433         }
 434         elsif ($qr_embed_thr) {
 435             $code= <<EOFCODE;
 436                 # Can't run the match in a subthread, but can do this and
 437                 # clone the pattern the other way.
 438                 my \$RE = threads->new(sub {qr$pat})->join();
 439                 $study
 440                 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
 441                 \$got = "$repl";
 442 EOFCODE
 443         }
 444         elsif ($no_null) {
 445             my $patcode = defined $no_null_pat ? '/$no_null_pat/'
 446                                                :  $pat;
 447             $code= <<EOFCODE;
 448                 $study
 449                 \$match = (\$subject =~ $OP$pat) while \$c--;
 450                 \$got = "$repl";
 451 EOFCODE
 452         }
 453         else {
 454             $code= <<EOFCODE;
 455                 $study
 456                 \$match = (\$subject =~ $OP$pat) while \$c--;
 457                 \$got = "$repl";
 458 EOFCODE
 459         }
 460         $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
 461         $code = "no warnings 'experimental::alpha_assertions';$code" if $alpha_assertions;
 462         #$code.=qq[\n\$expect="$expect";\n];
 463         #use Devel::Peek;
 464         #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
 465         {
 466             # Probably we should annotate specific tests with which warnings
 467             # categories they're known to trigger, and hence should be
 468             # disabled just for that test
 469             no warnings qw(uninitialized regexp deprecated);
 470             eval $code;
 471         }
 472         chomp( my $err = $@ );
 473         if ( $skip ) {
 474             print "ok $testname # skipped", length($reason) ? ".  $reason" : '', "\n";
 475             next TEST;
 476         }
 477         elsif ($result eq 'c') {
 478             if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST }
 479             last;  # no need to study a syntax error
 480         }
 481         elsif ( $todo_qr ) {
 482             print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n";
 483             next TEST;
 484         }
 485         elsif ($@) {
 486             print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
 487         }
 488         elsif ($result =~ /^n/) {
 489             if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST }
 490         }
 491         else {
 492             if (!$match || $got ne $expect) {
 493                 eval { require Data::Dumper };
 494                 no warnings "utf8"; # But handle should be utf8
 495                 if ($@ || !defined &DynaLoader::boot_DynaLoader) {
 496                     # Data::Dumper will load on miniperl, but fail when used in
 497                     # anger as it tries to load B. I'd prefer to keep the
 498                     # regular calls below outside of an eval so that real
 499                     # (unknown) failures get spotted, not ignored.
 500                     print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
 501                 }
 502                 else { # better diagnostics
 503                     my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
 504                     my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
 505                     my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump;
 506                     print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n");
 507                 }
 508                 next TEST;
 509             }
 510         }
 511     }
 512     print "ok $testname$todo\n";
 513 }
 514
 515 printf "1..%d\n# $iters iterations\n", scalar @tests;
 516
 517 1;