t/re/regexp.t

   1 #!./perl
   2
   3 # The tests are in a separate file 't/re/re_tests'.
   4 # Each line in that file is a separate test.
   5 # There are five columns, separated by tabs.
   6 #
   7 # Column 1 contains the pattern, optionally enclosed in C<''>.
   8 # Modifiers can be put after the closing C<'>.
   9 #
  10 # Column 2 contains the string to be matched.
  11 #
  12 # Column 3 contains the expected result:
  13 #       y       expect a match
  14 #       n       expect no match
  15 #       c       expect an error
  16 #       T       the test is a TODO (can be combined with y/n/c)
  17 #       M       skip test on miniperl (combine with y/n/c/T)
  18 #       B       test exposes a known bug in Perl, should be skipped
  19 #       b       test exposes a known bug in Perl, should be skipped if noamp
  20 #       t       test exposes a bug with threading, TODO if qr_embed_thr
  21 #       s       test should only be run for regex_sets_compat.t
  22 #       S       test should not be run for regex_sets_compat.t
  23 #
  24 # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
  25 #
  26 # Column 4 contains a string, usually C<$&>.
  27 #
  28 # Column 5 contains the expected result of double-quote
  29 # interpolating that string after the match, or start of error message.
  30 #
  31 # Column 6, if present, contains a reason why the test is skipped.
  32 # This is printed with "skipped", for harness to pick up.
  33 #
  34 # Column 7 can be used for comments
  35 #
  36 # \n in the tests are interpolated, as are variables of the form ${\w+}.
  37 #
  38 # Blanks lines are treated as PASSING tests to keep the line numbers
  39 # linked to the test number.
  40 #
  41 # If you want to add a regular expression test that can't be expressed
  42 # in this format, don't add it here: put it in re/pat.t instead.
  43 #
  44 # Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
  45 # This means this file cannot be used for testing anything that the lexer
  46 # handles; in 5.12 this means just \N{NAME} and \N{U+...}.
  47 #
  48 # Note that columns 2,3 and 5 are all enclosed in double quotes and then
  49 # evalled; so something like a\"\x{100}$1 has length 3+length($1).
  50
  51 my ($file, $iters);
  52 BEGIN {
  53     $iters = shift || 1;        # Poor man performance suite, 10000 is OK.
  54
  55     # Do this open before any chdir
  56     $file = shift;
  57     if (defined $file) {
  58         open TESTS, $file or die "Can't open $file";
  59     }
  60
  61     chdir 't' if -d 't';
  62     @INC = qw '../lib ../ext/re';
  63
  64 }
  65
  66 sub _comment {
  67     return map { /^#/ ? "$_\n" : "# $_\n" }
  68            map { split /\n/ } @_;
  69 }
  70
  71 use strict;
  72 use warnings FATAL=>"all";
  73 use vars qw($bang $ffff $nulnul); # used by the tests
  74 use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers
  75
  76
  77
  78 if (!defined $file) {
  79     open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
  80 }
  81
  82 my @tests = <TESTS>;
  83
  84 close TESTS;
  85
  86 $bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
  87 $ffff  = chr(0xff) x 2;
  88 $nulnul = "\0" x 2;
  89 my $OP = $qr ? 'qr' : 'm';
  90
  91 $| = 1;
  92 printf "1..%d\n# $iters iterations\n", scalar @tests;
  93
  94 my $test;
  95 TEST:
  96 foreach (@tests) {
  97     $test++;
  98     if (!/\S/ || /^\s*#/ || /^__END__$/) {
  99         print "ok $test # (Blank line or comment)\n";
 100         if (/#/) { print $_ };
 101         next;
 102     }
 103     chomp;
 104     s/\\n/\n/g unless $regex_sets;
 105     my ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6);
 106     if (!defined $subject) {
 107         die "Bad test definition on line $test: $_\n";
 108     }
 109     $reason = '' unless defined $reason;
 110     my $input = join(':',$pat,$subject,$result,$repl,$expect);
 111     # the double '' below keeps simple syntax highlighters from going crazy
 112     $pat = "'$pat'" unless $pat =~ /^[:''\/]/;
 113     $pat =~ s/(\$\{\w+\})/$1/eeg;
 114     $pat =~ s/\\n/\n/g unless $regex_sets;
 115     $subject = eval qq("$subject"); die $@ if $@;
 116     $expect  = eval qq("$expect"); die $@ if $@;
 117     $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;
 118     my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
 119     my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
 120     ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
 121     if ($result =~ s/ ( [Ss] ) //x) {
 122         if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
 123             $skip++;
 124             $reason = "Test not valid for $0";
 125         }
 126     }
 127     $reason = 'skipping $&' if $reason eq  '' && $skip_amp;
 128     $result =~ s/B//i unless $skip;
 129     my $todo= $result =~ s/T// ? " # TODO" : "";
 130     if (! $skip && $regex_sets) {
 131
 132         # If testing regex sets, change the [bracketed] classes into
 133         # (?[bracketed]).
 134
 135         if ($pat !~ / \[ /x) {
 136
 137             $skip++;
 138             $reason = "Pattern doesn't contain [brackets]";
 139         }
 140         else { # Use non-regex features of Perl to accomplish this.
 141             my $modified = "";
 142             my $in_brackets = 0;
 143
 144             # Go through the pattern character-by-character.  We also add
 145             # blanks around each token to test the /x parts of (?[ ])
 146             my $pat_len = length($pat);
 147       CHAR: for (my $i = 0; $i < $pat_len; $i++) {
 148                 my $curchar = substr($pat, $i, 1);
 149                 if ($curchar eq '\\') {
 150                     $modified .= " " if $in_brackets;
 151                     $modified .= $curchar;
 152                     $i++;
 153
 154                     # Get the character the backslash is escaping
 155                     $curchar = substr($pat, $i, 1);
 156                     $modified .= $curchar;
 157
 158                     # If the character following that is a '{}', treat the
 159                     # entire amount as a single token
 160                     if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
 161                         my $j = index($pat, '}', $i+2);
 162                         if ($j < 0) {
 163                             last unless $in_brackets;
 164                             if ($result eq 'c') {
 165                                 $skip++;
 166                                 $reason = "Can't handle compilation errors with unmatched '{'";
 167                             }
 168                             else {
 169                                 print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n";
 170                                 next TEST;
 171                             }
 172                         }
 173                         $modified .= substr($pat, $i+1, $j - $i);
 174                         $i = $j;
 175                     }
 176                     elsif ($curchar eq 'x') {
 177
 178                         # \x without brackets is supposed to be followed by 2
 179                         # hex digits.  Take up to 2, and then add a blank
 180                         # after the last one.  This avoids getting errors from
 181                         # (?[ ]) for run-ons, like \xabc
 182                         my $j = $i + 1;
 183                         for (; $j < $i + 3 && $j < $pat_len; $j++) {
 184                             my $curord = ord(substr($pat, $j, 1));
 185                             if (!(($curord >= ord("A") && $curord <= ord("F"))
 186                                  || ($curord >= ord("a") && $curord <= ord("f"))
 187                                  || ($curord >= ord("0") && $curord <= ord("9"))))
 188                             {
 189                                 $j++;
 190                                 last;
 191                             }
 192                         }
 193                         $j--;
 194                         $modified .= substr($pat, $i + 1, $j - $i) . " ";
 195                         $i = $j;
 196                     }
 197                     elsif (ord($curchar) >= ord('0')
 198                            && (ord($curchar) <= ord('7')))
 199                     {
 200                         # Similarly, octal constants have up to 3 digits.
 201                         my $j = $i + 1;
 202                         for (; $j < $i + 3 && $j < $pat_len; $j++) {
 203                             my $curord = ord(substr($pat, $j, 1));
 204                             if (! ($curord >= ord("0") &&  $curord <= ord("7"))) {
 205                                 $j++;
 206                                 last;
 207                             }
 208                         }
 209                         $j--;
 210                         $modified .= substr($pat, $i + 1, $j - $i);
 211                         $i = $j;
 212                     }
 213
 214                     next;
 215                 } # End of processing a backslash sequence
 216
 217                 if (! $in_brackets  # Skip (?{ })
 218                     && $curchar eq '('
 219                     && $i < $pat_len - 2
 220                     && substr($pat, $i+1, 1) eq '?'
 221                     && substr($pat, $i+2, 1) eq '{')
 222                 {
 223                     $skip++;
 224                     $reason = "Pattern contains '(?{'";
 225                     last;
 226                 }
 227
 228                 # Closing ']'
 229                 if ($curchar eq ']' && $in_brackets) {
 230                     $modified .= " ] ])";
 231                     $in_brackets = 0;
 232                     next;
 233                 }
 234
 235                 # A regular character.
 236                 if ($curchar ne '[') {
 237                     if (! $in_brackets) {
 238                         $modified .= $curchar;
 239                     }
 240                     else {
 241                         $modified .= " $curchar ";
 242                     }
 243                     next;
 244                 }
 245
 246                 # Here is a '['; If not in a bracketed class, treat as the
 247                 # beginning of one.
 248                 if (! $in_brackets) {
 249                     $in_brackets = 1;
 250                     $modified .= "(?[ [ ";
 251
 252                     # An immediately following ']' or '^]' is not the ending
 253                     # of the class, but is to be treated literally.
 254                     if ($i < $pat_len - 1
 255                         && substr($pat, $i+1, 1) eq ']')
 256                     {
 257                         $i ++;
 258                         $modified .= " ] ";
 259                     }
 260                     elsif ($i < $pat_len - 2
 261                             && substr($pat, $i+1, 1) eq '^'
 262                             && substr($pat, $i+2, 1) eq ']')
 263                     {
 264                         $i += 2;
 265                         $modified .= " ^ ] ";
 266                     }
 267                     next;
 268                 }
 269
 270                 # Here is a plain '[' within [ ].  Could mean wants to
 271                 # match a '[', or it could be a posix class that has a
 272                 # corresponding ']'.  Absorb either
 273
 274                 $modified .= ' [';
 275                 last if $i >= $pat_len - 1;
 276
 277                 $i++;
 278                 $curchar = substr($pat, $i, 1);
 279                 if ($curchar =~ /[:=.]/) {
 280                     for (my $j = $i + 1; $j < $pat_len; $j++) {
 281                         next unless substr($pat, $j, 1) eq ']';
 282                         last if $j - $i < 2;
 283                         if (substr($pat, $j - 1, 1) eq $curchar) {
 284                             # Here, is a posix class
 285                             $modified .= substr($pat, $i, $j - $i + 1) . " ";
 286                             $i = $j;
 287                             next CHAR;
 288                         }
 289                     }
 290                 }
 291
 292                 # Here wasn't a posix class, just process normally
 293                 $modified .= " $curchar ";
 294             }
 295
 296             if ($in_brackets && ! $skip) {
 297                 if ($result eq 'c') {
 298                     $skip++;
 299                     $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
 300                 }
 301                 else {
 302                     print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n";
 303                     next TEST;
 304                 }
 305             }
 306
 307             # Use our modified pattern instead of the original
 308             $pat = $modified;
 309         }
 310     }
 311
 312     for my $study ('', 'study $subject', 'utf8::upgrade($subject)',
 313                    'utf8::upgrade($subject); study $subject') {
 314         # Need to make a copy, else the utf8::upgrade of an already studied
 315         # scalar confuses things.
 316         my $subject = $subject;
 317         my $c = $iters;
 318         my ($code, $match, $got);
 319         if ($repl eq 'pos') {
 320             $code= <<EOFCODE;
 321                 $study;
 322                 pos(\$subject)=0;
 323                 \$match = ( \$subject =~ m${pat}g );
 324                 \$got = pos(\$subject);
 325 EOFCODE
 326         }
 327         elsif ($qr_embed) {
 328             $code= <<EOFCODE;
 329                 my \$RE = qr$pat;
 330                 $study;
 331                 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
 332                 \$got = "$repl";
 333 EOFCODE
 334         }
 335         elsif ($qr_embed_thr) {
 336             $code= <<EOFCODE;
 337                 # Can't run the match in a subthread, but can do this and
 338                 # clone the pattern the other way.
 339                 my \$RE = threads->new(sub {qr$pat})->join();
 340                 $study;
 341                 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
 342                 \$got = "$repl";
 343 EOFCODE
 344         }
 345         else {
 346             $code= <<EOFCODE;
 347                 $study;
 348                 \$match = (\$subject =~ $OP$pat) while \$c--;
 349                 \$got = "$repl";
 350 EOFCODE
 351         }
 352         $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
 353         #$code.=qq[\n\$expect="$expect";\n];
 354         #use Devel::Peek;
 355         #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
 356         {
 357             # Probably we should annotate specific tests with which warnings
 358             # categories they're known to trigger, and hence should be
 359             # disabled just for that test
 360             no warnings qw(uninitialized regexp);
 361             eval $code;
 362         }
 363         chomp( my $err = $@ );
 364         if ( $skip ) {
 365             print "ok $test # skipped", length($reason) ? ".  $reason" : '', "\n";
 366             next TEST;
 367         }
 368         elsif ($result eq 'c') {
 369             if ($err !~ m!^\Q$expect!) { print "not ok $test$todo (compile) $input => '$err'\n"; next TEST }
 370             last;  # no need to study a syntax error
 371         }
 372         elsif ( $todo_qr ) {
 373             print "not ok $test # TODO", length($reason) ? " - $reason" : '', "\n";
 374             next TEST;
 375         }
 376         elsif ($@) {
 377             print "not ok $test$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
 378         }
 379         elsif ($result =~ /^n/) {
 380             if ($match) { print "not ok $test$todo ($study) $input => false positive\n"; next TEST }
 381         }
 382         else {
 383             if (!$match || $got ne $expect) {
 384                 eval { require Data::Dumper };
 385                 no warnings "utf8"; # But handle should be utf8
 386                 if ($@ || !defined &DynaLoader::boot_DynaLoader) {
 387                     # Data::Dumper will load on miniperl, but fail when used in
 388                     # anger as it tries to load B. I'd prefer to keep the
 389                     # regular calls below outside of an eval so that real
 390                     # (unknown) failures get spotted, not ignored.
 391                     print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
 392                 }
 393                 else { # better diagnostics
 394                     my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
 395                     my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
 396                     print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n");
 397                 }
 398                 next TEST;
 399             }
 400         }
 401     }
 402     print "ok $test$todo\n";
 403 }
 404
 405 1;