t/re/regexp.t

   1 #!./perl
   2
   3 # The tests are in a separate file 't/re/re_tests'.
   4 # Each line in that file is a separate test.
   5 # There are five columns, separated by tabs.
   6 # An optional sixth column is used to give a reason, only when skipping tests
   7 #
   8 # Column 1 contains the pattern, optionally enclosed in C<''> C<::> or
   9 # C<//>.  Modifiers can be put after the closing delimiter.  C<''> will
  10 # automatically be added to any other patterns.
  11 #
  12 # Column 2 contains the string to be matched.
  13 #
  14 # Column 3 contains the expected result:
  15 #       y       expect a match
  16 #       n       expect no match
  17 #       c       expect an error
  18 #       T       the test is a TODO (can be combined with y/n/c)
  19 #       M       skip test on miniperl (combine with y/n/c/T)
  20 #       B       test exposes a known bug in Perl, should be skipped
  21 #       b       test exposes a known bug in Perl, should be skipped if noamp
  22 #       t       test exposes a bug with threading, TODO if qr_embed_thr
  23 #       s       test should only be run for regex_sets_compat.t
  24 #       S       test should not be run for regex_sets_compat.t
  25 #       a       test should only be run on ASCII platforms
  26 #       e       test should only be run on EBCDIC platforms
  27 #
  28 # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
  29 #
  30 # Column 4 contains a string, usually C<$&>.
  31 #
  32 # Column 5 contains the expected result of double-quote
  33 # interpolating that string after the match, or start of error message.
  34 #
  35 # Column 6, if present, contains a reason why the test is skipped.
  36 # This is printed with "skipped", for harness to pick up.
  37 #
  38 # Column 7 can be used for comments
  39 #
  40 # \n in the tests are interpolated, as are variables of the form ${\w+}.
  41 #
  42 # Blanks lines are treated as PASSING tests to keep the line numbers
  43 # linked to the test number.
  44 #
  45 # If you want to add a regular expression test that can't be expressed
  46 # in this format, don't add it here: put it in re/pat.t instead.
  47 #
  48 # Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
  49 # This means this file cannot be used for testing anything that the lexer
  50 # handles; in 5.12 this means just \N{NAME} and \N{U+...}.
  51 #
  52 # Note that columns 2,3 and 5 are all enclosed in double quotes and then
  53 # evalled; so something like a\"\x{100}$1 has length 3+length($1).
  54 #
  55 # \x... and \o{...} constants are automatically converted to the native
  56 # character set if necessary.  \[0-7] constants aren't
  57
  58 my ($file, $iters);
  59 BEGIN {
  60     $iters = shift || 1;        # Poor man performance suite, 10000 is OK.
  61
  62     # Do this open before any chdir
  63     $file = shift;
  64     if (defined $file) {
  65         open TESTS, $file or die "Can't open $file";
  66     }
  67
  68     chdir 't' if -d 't';
  69     @INC = qw '../lib ../ext/re';
  70     if (!defined &DynaLoader::boot_DynaLoader) { # miniperl
  71         print("1..0 # Skip Unicode tables not built yet\n"), exit
  72             unless eval 'require "unicore/Heavy.pl"';
  73     }
  74
  75     # Some of the tests need a locale; which one doesn't much matter, except
  76     # that it be valid.  Make sure of that
  77     eval { require POSIX;
  78             POSIX->import(qw(LC_ALL setlocale));
  79             POSIX::setlocale(&LC_ALL, "C");
  80     };
  81 }
  82
  83 sub _comment {
  84     return map { /^#/ ? "$_\n" : "# $_\n" }
  85            map { split /\n/ } @_;
  86 }
  87
  88 sub convert_from_ascii {
  89     my $string = shift;
  90
  91     #my $save = $string;
  92     # Convert \x{...}, \o{...}
  93     $string =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) .  "}" /gex;
  94     $string =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) .  "}" /gex;
  95
  96     # Convert \xAB
  97     $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex;
  98
  99     # Convert \xA
 100     $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex;
 101
 102     #print STDERR __LINE__, ": $save\n$string\n" if $save ne $string;
 103     return $string;
 104 }
 105
 106 use strict;
 107 use warnings FATAL=>"all";
 108 our ($bang, $ffff, $nulnul); # used by the tests
 109 our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers
 110
 111 my $expanded_text = "expanded name from original test number";
 112 my $expanded_text_re = qr/$expanded_text/;
 113
 114 if (!defined $file) {
 115     open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
 116 }
 117
 118 my @tests = <TESTS>;
 119
 120 close TESTS;
 121
 122 $bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
 123 $ffff  = chr(0xff) x 2;
 124 $nulnul = "\0" x 2;
 125 my $OP = $qr ? 'qr' : 'm';
 126
 127 $| = 1;
 128
 129 my $test;
 130 TEST:
 131 foreach (@tests) {
 132     $test++;
 133     if (!/\S/ || /^\s*#/ || /^__END__$/) {
 134         chomp;
 135         my ($not,$comment)= split /\s*#\s*/, $_, 2;
 136         $comment ||= "(blank line)";
 137         print "ok $test # $comment\n";
 138         next;
 139     }
 140     chomp;
 141     s/\\n/\n/g unless $regex_sets;
 142     my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7);
 143     $comment = "" unless defined $comment;
 144     if (!defined $subject) {
 145         die "Bad test definition on line $test: $_\n";
 146     }
 147     $reason = '' unless defined $reason;
 148     my $input = join(':',$pat,$subject,$result,$repl,$expect);
 149
 150     # the double '' below keeps simple syntax highlighters from going crazy
 151     $pat = "'$pat'" unless $pat =~ /^[:''\/]/;
 152     $pat =~ s/(\$\{\w+\})/$1/eeg;
 153     $pat =~ s/\\n/\n/g unless $regex_sets;
 154     $pat = convert_from_ascii($pat) if ord("A") != 65;
 155
 156     my $no_null_pat;
 157     if ($no_null && $pat =~ /^'(.*)'\z/) {
 158        $no_null_pat = XS::APItest::string_without_null($1);
 159     }
 160
 161     $subject = convert_from_ascii($subject) if ord("A") != 65;
 162     $subject = eval qq("$subject"); die $@ if $@;
 163
 164     $expect = convert_from_ascii($expect) if ord("A") != 65;
 165     $expect  = eval qq("$expect"); die $@ if $@;
 166     $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;
 167
 168     my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
 169     my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
 170     ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
 171     if ($result =~ s/ ( [Ss] ) //x) {
 172         if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
 173             $skip++;
 174             $reason = "Test not valid for $0";
 175         }
 176     }
 177     if ($result =~ s/a// && ord("A") != 65) {
 178         $skip++;
 179         $reason = "Test is only valid for ASCII platforms.  $reason";
 180     }
 181     if ($result =~ s/e// && ord("A") != 193) {
 182         $skip++;
 183         $reason = "Test is only valid for EBCDIC platforms.  $reason";
 184     }
 185     $reason = 'skipping $&' if $reason eq  '' && $skip_amp;
 186     $result =~ s/B//i unless $skip;
 187     my $todo= $result =~ s/T// ? " # TODO" : "";
 188     my $testname= $test;
 189     if ($comment) {
 190         $comment=~s/^\s*(?:#\s*)?//;
 191         $testname .= " - $comment" if $comment;
 192     }
 193     if (! $skip && $alpha_assertions) {
 194         my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x;
 195         if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) {
 196             $skip++;
 197             $reason = "Pattern doesn't contain assertions";
 198         }
 199         elsif ($comment !~ $expanded_text_re) {
 200             my $expanded_pat = $pat;
 201
 202             $pat =~ s/\( \? > /(*atomic:/xg;
 203
 204             if ($pat =~ s/\( \? = /(*pla:/xg) {
 205                 $expanded_pat =~ s//(*positive_lookahead:/g;
 206             }
 207             if ($pat =~ s/\( \? ! /(*nla:/xg) {
 208                 $expanded_pat =~ s//(*negative_lookahead:/g;
 209             }
 210             if ($pat =~ s/\( \? <= /(*plb:/xg) {
 211                 $expanded_pat =~ s//(*positive_lookbehind:/g;
 212             }
 213             if ($pat =~ s/\( \? <! /(*nlb:/xg) {
 214                 $expanded_pat =~ s//(*negative_lookbehind:/g;
 215             }
 216             if ($expanded_pat ne $pat) {
 217                 $comment .= " $expanded_text $test";
 218                 push @tests, join "\t", $expanded_pat,
 219                                         $subject // "",
 220                                         $result // "",
 221                                         $repl // "",
 222                                         $expect // "",
 223                                         $reason // "",
 224                                         $comment;
 225             }
 226         }
 227     }
 228     elsif (! $skip && $regex_sets) {
 229
 230         # If testing regex sets, change the [bracketed] classes into
 231         # (?[bracketed]).  But note that '\[' and '\c[' don't introduce such a
 232         # class.  (We don't bother looking for an odd number of backslashes,
 233         # as this hasn't been needed so far.)
 234         if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) {
 235             $skip++;
 236             $reason = "Pattern doesn't contain [brackets]";
 237         }
 238         else { # Use non-regex features of Perl to accomplish this.
 239             my $modified = "";
 240             my $in_brackets = 0;
 241
 242             # Go through the pattern character-by-character.  We also add
 243             # blanks around each token to test the /x parts of (?[ ])
 244             my $pat_len = length($pat);
 245       CHAR: for (my $i = 0; $i < $pat_len; $i++) {
 246                 my $curchar = substr($pat, $i, 1);
 247                 if ($curchar eq '\\') {
 248                     $modified .= " " if $in_brackets;
 249                     $modified .= $curchar;
 250                     $i++;
 251
 252                     # Get the character the backslash is escaping
 253                     $curchar = substr($pat, $i, 1);
 254                     $modified .= $curchar;
 255
 256                     # If the character following that is a '{}', treat the
 257                     # entire amount as a single token
 258                     if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
 259                         my $j = index($pat, '}', $i+2);
 260                         if ($j < 0) {
 261                             last unless $in_brackets;
 262                             if ($result eq 'c') {
 263                                 $skip++;
 264                                 $reason = "Can't handle compilation errors with unmatched '{'";
 265                             }
 266                             else {
 267                                 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
 268                                 next TEST;
 269                             }
 270                         }
 271                         $modified .= substr($pat, $i+1, $j - $i);
 272                         $i = $j;
 273                     }
 274                     elsif ($curchar eq 'x') {
 275
 276                         # \x without brackets is supposed to be followed by 2
 277                         # hex digits.  Take up to 2, and then add a blank
 278                         # after the last one.  This avoids getting errors from
 279                         # (?[ ]) for run-ons, like \xabc
 280                         my $j = $i + 1;
 281                         for (; $j < $i + 3 && $j < $pat_len; $j++) {
 282                             my $curord = ord(substr($pat, $j, 1));
 283                             if (!(($curord >= ord("A") && $curord <= ord("F"))
 284                                  || ($curord >= ord("a") && $curord <= ord("f"))
 285                                  || ($curord >= ord("0") && $curord <= ord("9"))))
 286                             {
 287                                 $j++;
 288                                 last;
 289                             }
 290                         }
 291                         $j--;
 292                         $modified .= substr($pat, $i + 1, $j - $i);
 293                         $modified .= " " if $in_brackets;
 294                         $i = $j;
 295                     }
 296                     elsif (ord($curchar) >= ord('0')
 297                            && (ord($curchar) <= ord('7')))
 298                     {
 299                         # Similarly, octal constants have up to 3 digits.
 300                         my $j = $i + 1;
 301                         for (; $j < $i + 3 && $j < $pat_len; $j++) {
 302                             my $curord = ord(substr($pat, $j, 1));
 303                             if (! ($curord >= ord("0") &&  $curord <= ord("7"))) {
 304                                 $j++;
 305                                 last;
 306                             }
 307                         }
 308                         $j--;
 309                         $modified .= substr($pat, $i + 1, $j - $i);
 310                         $i = $j;
 311                     }
 312
 313                     next;
 314                 } # End of processing a backslash sequence
 315
 316                 if (! $in_brackets  # Skip (?{ })
 317                     && $curchar eq '('
 318                     && $i < $pat_len - 2
 319                     && substr($pat, $i+1, 1) eq '?'
 320                     && substr($pat, $i+2, 1) eq '{')
 321                 {
 322                     $skip++;
 323                     $reason = "Pattern contains '(?{'";
 324                     last;
 325                 }
 326
 327                 # Closing ']'
 328                 if ($curchar eq ']' && $in_brackets) {
 329                     $modified .= " ] ])";
 330                     $in_brackets = 0;
 331                     next;
 332                 }
 333
 334                 # A regular character.
 335                 if ($curchar ne '[') {
 336                     $modified .= " " if  $in_brackets;
 337                     $modified .= $curchar;
 338                     next;
 339                 }
 340
 341                 # Here is a '['; If not in a bracketed class, treat as the
 342                 # beginning of one.
 343                 if (! $in_brackets) {
 344                     $in_brackets = 1;
 345                     $modified .= "(?[ [ ";
 346
 347                     # An immediately following ']' or '^]' is not the ending
 348                     # of the class, but is to be treated literally.
 349                     if ($i < $pat_len - 1
 350                         && substr($pat, $i+1, 1) eq ']')
 351                     {
 352                         $i ++;
 353                         $modified .= " ] ";
 354                     }
 355                     elsif ($i < $pat_len - 2
 356                             && substr($pat, $i+1, 1) eq '^'
 357                             && substr($pat, $i+2, 1) eq ']')
 358                     {
 359                         $i += 2;
 360                         $modified .= " ^ ] ";
 361                     }
 362                     next;
 363                 }
 364
 365                 # Here is a plain '[' within [ ].  Could mean wants to
 366                 # match a '[', or it could be a posix class that has a
 367                 # corresponding ']'.  Absorb either
 368
 369                 $modified .= ' [';
 370                 last if $i >= $pat_len - 1;
 371
 372                 $i++;
 373                 $curchar = substr($pat, $i, 1);
 374                 if ($curchar =~ /[:=.]/) {
 375                     for (my $j = $i + 1; $j < $pat_len; $j++) {
 376                         next unless substr($pat, $j, 1) eq ']';
 377                         last if $j - $i < 2;
 378                         if (substr($pat, $j - 1, 1) eq $curchar) {
 379                             # Here, is a posix class
 380                             $modified .= substr($pat, $i, $j - $i + 1) . " ";
 381                             $i = $j;
 382                             next CHAR;
 383                         }
 384                     }
 385                 }
 386
 387                 # Here wasn't a posix class, just process normally
 388                 $modified .= " $curchar ";
 389             }
 390
 391             if ($in_brackets && ! $skip) {
 392                 if ($result eq 'c') {
 393                     $skip++;
 394                     $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
 395                 }
 396                 else {
 397                     print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
 398                     next TEST;
 399                 }
 400             }
 401
 402             # Use our modified pattern instead of the original
 403             $pat = $modified;
 404         }
 405     }
 406
 407     for my $study ('', 'study $subject;', 'utf8::upgrade($subject);',
 408                    'utf8::upgrade($subject); study $subject;') {
 409         # Need to make a copy, else the utf8::upgrade of an already studied
 410         # scalar confuses things.
 411         my $subject = $subject;
 412         $subject = XS::APItest::string_without_null($subject) if $no_null;
 413         my $c = $iters;
 414         my ($code, $match, $got);
 415         if ($repl eq 'pos') {
 416             my $patcode = defined $no_null_pat ? '/$no_null_pat/g'
 417                                                : "m${pat}g";
 418             $code= <<EOFCODE;
 419                 $study
 420                 pos(\$subject)=0;
 421                 \$match = ( \$subject =~ $patcode );
 422                 \$got = pos(\$subject);
 423 EOFCODE
 424         }
 425         elsif ($qr_embed) {
 426             $code= <<EOFCODE;
 427                 my \$RE = qr$pat;
 428                 $study
 429                 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
 430                 \$got = "$repl";
 431 EOFCODE
 432         }
 433         elsif ($qr_embed_thr) {
 434             $code= <<EOFCODE;
 435                 # Can't run the match in a subthread, but can do this and
 436                 # clone the pattern the other way.
 437                 my \$RE = threads->new(sub {qr$pat})->join();
 438                 $study
 439                 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
 440                 \$got = "$repl";
 441 EOFCODE
 442         }
 443         elsif ($no_null) {
 444             my $patcode = defined $no_null_pat ? '/$no_null_pat/'
 445                                                :  $pat;
 446             $code= <<EOFCODE;
 447                 $study
 448                 \$match = (\$subject =~ $OP$pat) while \$c--;
 449                 \$got = "$repl";
 450 EOFCODE
 451         }
 452         else {
 453             $code= <<EOFCODE;
 454                 $study
 455                 \$match = (\$subject =~ $OP$pat) while \$c--;
 456                 \$got = "$repl";
 457 EOFCODE
 458         }
 459         $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
 460         $code = "no warnings 'experimental::alpha_assertions';$code" if $alpha_assertions;
 461         #$code.=qq[\n\$expect="$expect";\n];
 462         #use Devel::Peek;
 463         #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
 464         {
 465             # Probably we should annotate specific tests with which warnings
 466             # categories they're known to trigger, and hence should be
 467             # disabled just for that test
 468             no warnings qw(uninitialized regexp deprecated);
 469             eval $code;
 470         }
 471         chomp( my $err = $@ );
 472         if ( $skip ) {
 473             print "ok $testname # skipped", length($reason) ? ".  $reason" : '', "\n";
 474             next TEST;
 475         }
 476         elsif ($result eq 'c') {
 477             if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST }
 478             last;  # no need to study a syntax error
 479         }
 480         elsif ( $todo_qr ) {
 481             print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n";
 482             next TEST;
 483         }
 484         elsif ($@) {
 485             print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
 486         }
 487         elsif ($result =~ /^n/) {
 488             if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST }
 489         }
 490         else {
 491             if (!$match || $got ne $expect) {
 492                 eval { require Data::Dumper };
 493                 no warnings "utf8"; # But handle should be utf8
 494                 if ($@ || !defined &DynaLoader::boot_DynaLoader) {
 495                     # Data::Dumper will load on miniperl, but fail when used in
 496                     # anger as it tries to load B. I'd prefer to keep the
 497                     # regular calls below outside of an eval so that real
 498                     # (unknown) failures get spotted, not ignored.
 499                     print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
 500                 }
 501                 else { # better diagnostics
 502                     my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
 503                     my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
 504                     my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump;
 505                     print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n");
 506                 }
 507                 next TEST;
 508             }
 509         }
 510     }
 511     print "ok $testname$todo\n";
 512 }
 513
 514 printf "1..%d\n# $iters iterations\n", scalar @tests;
 515
 516 1;