This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
t/re/regexp.t: Speed up many regex tests on ASCII platform
[perl5.git] / t / re / regexp.t
CommitLineData
378cc40b
LW
1#!./perl
2
ae34ee58 3# The tests are in a separate file 't/re/re_tests'.
ad4f75a6
HM
4# Each line in that file is a separate test.
5# There are five columns, separated by tabs.
d6395ff9 6# An optional sixth column is used to give a reason, only when skipping tests
ad4f75a6 7#
71c89c82
FC
8# Column 1 contains the pattern, optionally enclosed in C<''> C<::> or
9# C<//>. Modifiers can be put after the closing delimiter. C<''> will
10# automatically be added to any other patterns.
ad4f75a6
HM
11#
12# Column 2 contains the string to be matched.
13#
14# Column 3 contains the expected result:
15# y expect a match
16# n expect no match
17# c expect an error
24d786f4 18# T the test is a TODO (can be combined with y/n/c)
cb6fa888 19# M skip test on miniperl (combine with y/n/c/T)
cf93c79d
IZ
20# B test exposes a known bug in Perl, should be skipped
21# b test exposes a known bug in Perl, should be skipped if noamp
e3faa678 22# t test exposes a bug with threading, TODO if qr_embed_thr
073b366a
KW
23# s test should only be run for regex_sets_compat.t
24# S test should not be run for regex_sets_compat.t
c46c4601
KW
25# a test should only be run on ASCII platforms
26# e test should only be run on EBCDIC platforms
ad4f75a6 27#
1b1626e4 28# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
ad4f75a6
HM
29#
30# Column 4 contains a string, usually C<$&>.
31#
32# Column 5 contains the expected result of double-quote
c277df42
IZ
33# interpolating that string after the match, or start of error message.
34#
ee595aa6
LC
35# Column 6, if present, contains a reason why the test is skipped.
36# This is printed with "skipped", for harness to pick up.
37#
3238b147
KW
38# Column 7 can be used for comments
39#
9d116dd7 40# \n in the tests are interpolated, as are variables of the form ${\w+}.
83e898de 41#
b9b4dddf
YO
42# Blanks lines are treated as PASSING tests to keep the line numbers
43# linked to the test number.
44#
8d37f932 45# If you want to add a regular expression test that can't be expressed
67a2b8c6 46# in this format, don't add it here: put it in re/pat.t instead.
b2a156bd 47#
ff3f963a
KW
48# Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
49# This means this file cannot be used for testing anything that the lexer
50# handles; in 5.12 this means just \N{NAME} and \N{U+...}.
51#
b2a156bd
DM
52# Note that columns 2,3 and 5 are all enclosed in double quotes and then
53# evalled; so something like a\"\x{100}$1 has length 3+length($1).
d6395ff9
KW
54#
55# \x... and \o{...} constants are automatically converted to the native
56# character set if necessary. \[0-7] constants aren't
c277df42 57
7e1dab6a 58my ($file, $iters);
e4d48cc9 59BEGIN {
1a610890
NC
60 $iters = shift || 1; # Poor man performance suite, 10000 is OK.
61
62 # Do this open before any chdir
63 $file = shift;
64 if (defined $file) {
65 open TESTS, $file or die "Can't open $file";
66 }
67
e4d48cc9 68 chdir 't' if -d 't';
cc7e6304 69 @INC = qw '../lib ../ext/re';
87709a14
FC
70 if (!defined &DynaLoader::boot_DynaLoader) { # miniperl
71 print("1..0 # Skip Unicode tables not built yet\n"), exit
048bdb72 72 unless eval 'require "unicore/UCD.pl"';
87709a14 73 }
52c17dc6
KW
74
75 # Some of the tests need a locale; which one doesn't much matter, except
76 # that it be valid. Make sure of that
77 eval { require POSIX;
78 POSIX->import(qw(LC_ALL setlocale));
79 POSIX::setlocale(&LC_ALL, "C");
80 };
e4d48cc9 81}
1a610890 82
1b7228c9
KW
83sub _comment {
84 return map { /^#/ ? "$_\n" : "# $_\n" }
85 map { split /\n/ } @_;
86}
87
0cd59ee9
KW
88use strict;
89use warnings FATAL=>"all";
90no warnings 'experimental::vlb';
91our ($bang, $ffff, $nulnul); # used by the tests
92our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers
93
94my $expanded_text = "expanded name from original test number";
95my $expanded_text_re = qr/$expanded_text/;
96
97if (!defined $file) {
98 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
99}
100
101my @tests = <TESTS>;
102
103close TESTS;
104
105my $test_num = 0;
106
107# Some scenarios add extra tests to those just read in. For those where there
108# is a character set translation, the added test will already have been
109# translated, so any test number beginning with this one shouldn't be
110# translated again.
111my $first_already_converted_test_num = @tests + 1;
112
2b7f9bbf
KW
113sub convert_from_ascii_guts {
114 my $string_ref = shift;
d6395ff9 115
2b7f9bbf 116 return if $test_num >= $first_already_converted_test_num;
0cd59ee9 117
2b7f9bbf 118 #my $save = $string_ref;
d6395ff9 119 # Convert \x{...}, \o{...}
2b7f9bbf
KW
120 $$string_ref =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) . "}" /gex;
121 $$string_ref =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) . "}" /gex;
d6395ff9
KW
122
123 # Convert \xAB
2b7f9bbf 124 $$string_ref =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex;
d6395ff9
KW
125
126 # Convert \xA
2b7f9bbf 127 $$string_ref =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex;
d6395ff9 128
2b7f9bbf
KW
129 #print STDERR __LINE__, ": $save\n$string_ref\n" if $save ne $string_ref;
130 return;
d6395ff9
KW
131}
132
2b7f9bbf
KW
133*convert_from_ascii = (ord("A") == 65)
134 ? sub { 1; }
135 : convert_from_ascii_guts;
136
9d116dd7 137$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
b8c5462f
JH
138$ffff = chr(0xff) x 2;
139$nulnul = "\0" x 2;
7e1dab6a 140my $OP = $qr ? 'qr' : 'm';
9d116dd7 141
1462b684 142$| = 1;
e3faa678 143
cfa4f241 144TEST:
1a610890 145foreach (@tests) {
52a3aca8 146 $test_num++;
5a51db05 147 if (!/\S/ || /^\s*#/ || /^__END__$/) {
92b05f28
YO
148 chomp;
149 my ($not,$comment)= split /\s*#\s*/, $_, 2;
150 $comment ||= "(blank line)";
52a3aca8 151 print "ok $test_num # $comment\n";
b9b4dddf
YO
152 next;
153 }
b85d18e9 154 chomp;
073b366a 155 s/\\n/\n/g unless $regex_sets;
92b05f28 156 my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7);
e7206367 157 $comment = "" unless defined $comment;
b8f6efdd 158 if (!defined $subject) {
52a3aca8 159 die "Bad test definition on line $test_num: $_\n";
b8f6efdd 160 }
66fb63c1 161 $reason = '' unless defined $reason;
1286eaeb 162 my $input = join(':',$pat,$subject,$result,$repl,$expect);
d6395ff9 163
24d786f4 164 # the double '' below keeps simple syntax highlighters from going crazy
2b7f9bbf 165 $pat = "'$pat'" unless $pat =~ /^[:''\/]/;
9d116dd7 166 $pat =~ s/(\$\{\w+\})/$1/eeg;
073b366a 167 $pat =~ s/\\n/\n/g unless $regex_sets;
2b7f9bbf 168 convert_from_ascii(\$pat);
d6395ff9 169
933f1527
FC
170 my $no_null_pat;
171 if ($no_null && $pat =~ /^'(.*)'\z/) {
172 $no_null_pat = XS::APItest::string_without_null($1);
173 }
174
2b7f9bbf 175 convert_from_ascii(\$subject);
1a610890 176 $subject = eval qq("$subject"); die $@ if $@;
d6395ff9 177
2b7f9bbf 178 convert_from_ascii(\$expect);
1a610890 179 $expect = eval qq("$expect"); die $@ if $@;
c277df42 180 $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;
d6395ff9 181
24d786f4 182 my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
1286eaeb 183 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
cb6fa888 184 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
073b366a
KW
185 if ($result =~ s/ ( [Ss] ) //x) {
186 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
187 $skip++;
188 $reason = "Test not valid for $0";
189 }
190 }
c46c4601
KW
191 if ($result =~ s/a// && ord("A") != 65) {
192 $skip++;
193 $reason = "Test is only valid for ASCII platforms. $reason";
194 }
195 if ($result =~ s/e// && ord("A") != 193) {
196 $skip++;
197 $reason = "Test is only valid for EBCDIC platforms. $reason";
198 }
906e884f 199 $reason = 'skipping $&' if $reason eq '' && $skip_amp;
cf93c79d 200 $result =~ s/B//i unless $skip;
24d786f4 201 my $todo= $result =~ s/T// ? " # TODO" : "";
52a3aca8 202 my $testname= $test_num;
92b05f28
YO
203 if ($comment) {
204 $comment=~s/^\s*(?:#\s*)?//;
205 $testname .= " - $comment" if $comment;
206 }
e7206367
KW
207 if (! $skip && $alpha_assertions) {
208 my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x;
209 if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) {
210 $skip++;
211 $reason = "Pattern doesn't contain assertions";
212 }
213 elsif ($comment !~ $expanded_text_re) {
214 my $expanded_pat = $pat;
215
216 $pat =~ s/\( \? > /(*atomic:/xg;
217
218 if ($pat =~ s/\( \? = /(*pla:/xg) {
219 $expanded_pat =~ s//(*positive_lookahead:/g;
220 }
221 if ($pat =~ s/\( \? ! /(*nla:/xg) {
222 $expanded_pat =~ s//(*negative_lookahead:/g;
223 }
224 if ($pat =~ s/\( \? <= /(*plb:/xg) {
225 $expanded_pat =~ s//(*positive_lookbehind:/g;
226 }
227 if ($pat =~ s/\( \? <! /(*nlb:/xg) {
228 $expanded_pat =~ s//(*negative_lookbehind:/g;
229 }
230 if ($expanded_pat ne $pat) {
52a3aca8 231 $comment .= " $expanded_text $test_num";
e7206367
KW
232 push @tests, join "\t", $expanded_pat,
233 $subject // "",
234 $result // "",
235 $repl // "",
236 $expect // "",
237 $reason // "",
238 $comment;
239 }
240 }
241 }
242 elsif (! $skip && $regex_sets) {
073b366a
KW
243
244 # If testing regex sets, change the [bracketed] classes into
58c435ec
KW
245 # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a
246 # class. (We don't bother looking for an odd number of backslashes,
247 # as this hasn't been needed so far.)
248 if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) {
073b366a
KW
249 $skip++;
250 $reason = "Pattern doesn't contain [brackets]";
251 }
252 else { # Use non-regex features of Perl to accomplish this.
253 my $modified = "";
254 my $in_brackets = 0;
255
256 # Go through the pattern character-by-character. We also add
257 # blanks around each token to test the /x parts of (?[ ])
258 my $pat_len = length($pat);
259 CHAR: for (my $i = 0; $i < $pat_len; $i++) {
260 my $curchar = substr($pat, $i, 1);
261 if ($curchar eq '\\') {
262 $modified .= " " if $in_brackets;
263 $modified .= $curchar;
264 $i++;
265
266 # Get the character the backslash is escaping
267 $curchar = substr($pat, $i, 1);
268 $modified .= $curchar;
269
270 # If the character following that is a '{}', treat the
271 # entire amount as a single token
272 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
273 my $j = index($pat, '}', $i+2);
274 if ($j < 0) {
275 last unless $in_brackets;
276 if ($result eq 'c') {
277 $skip++;
278 $reason = "Can't handle compilation errors with unmatched '{'";
279 }
280 else {
92b05f28 281 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
073b366a
KW
282 next TEST;
283 }
284 }
285 $modified .= substr($pat, $i+1, $j - $i);
286 $i = $j;
287 }
288 elsif ($curchar eq 'x') {
289
290 # \x without brackets is supposed to be followed by 2
291 # hex digits. Take up to 2, and then add a blank
292 # after the last one. This avoids getting errors from
293 # (?[ ]) for run-ons, like \xabc
294 my $j = $i + 1;
295 for (; $j < $i + 3 && $j < $pat_len; $j++) {
296 my $curord = ord(substr($pat, $j, 1));
297 if (!(($curord >= ord("A") && $curord <= ord("F"))
298 || ($curord >= ord("a") && $curord <= ord("f"))
299 || ($curord >= ord("0") && $curord <= ord("9"))))
300 {
301 $j++;
302 last;
303 }
304 }
305 $j--;
e4eb6481
KW
306 $modified .= substr($pat, $i + 1, $j - $i);
307 $modified .= " " if $in_brackets;
073b366a
KW
308 $i = $j;
309 }
310 elsif (ord($curchar) >= ord('0')
311 && (ord($curchar) <= ord('7')))
312 {
313 # Similarly, octal constants have up to 3 digits.
314 my $j = $i + 1;
315 for (; $j < $i + 3 && $j < $pat_len; $j++) {
316 my $curord = ord(substr($pat, $j, 1));
317 if (! ($curord >= ord("0") && $curord <= ord("7"))) {
318 $j++;
319 last;
320 }
321 }
322 $j--;
323 $modified .= substr($pat, $i + 1, $j - $i);
324 $i = $j;
325 }
326
327 next;
328 } # End of processing a backslash sequence
329
330 if (! $in_brackets # Skip (?{ })
331 && $curchar eq '('
332 && $i < $pat_len - 2
333 && substr($pat, $i+1, 1) eq '?'
334 && substr($pat, $i+2, 1) eq '{')
335 {
336 $skip++;
337 $reason = "Pattern contains '(?{'";
338 last;
339 }
340
341 # Closing ']'
342 if ($curchar eq ']' && $in_brackets) {
343 $modified .= " ] ])";
344 $in_brackets = 0;
345 next;
346 }
347
348 # A regular character.
349 if ($curchar ne '[') {
e4eb6481
KW
350 $modified .= " " if $in_brackets;
351 $modified .= $curchar;
073b366a
KW
352 next;
353 }
354
355 # Here is a '['; If not in a bracketed class, treat as the
356 # beginning of one.
357 if (! $in_brackets) {
358 $in_brackets = 1;
359 $modified .= "(?[ [ ";
360
361 # An immediately following ']' or '^]' is not the ending
362 # of the class, but is to be treated literally.
363 if ($i < $pat_len - 1
364 && substr($pat, $i+1, 1) eq ']')
365 {
366 $i ++;
367 $modified .= " ] ";
368 }
369 elsif ($i < $pat_len - 2
370 && substr($pat, $i+1, 1) eq '^'
371 && substr($pat, $i+2, 1) eq ']')
372 {
373 $i += 2;
374 $modified .= " ^ ] ";
375 }
376 next;
377 }
378
379 # Here is a plain '[' within [ ]. Could mean wants to
380 # match a '[', or it could be a posix class that has a
381 # corresponding ']'. Absorb either
382
383 $modified .= ' [';
384 last if $i >= $pat_len - 1;
385
386 $i++;
387 $curchar = substr($pat, $i, 1);
388 if ($curchar =~ /[:=.]/) {
389 for (my $j = $i + 1; $j < $pat_len; $j++) {
390 next unless substr($pat, $j, 1) eq ']';
391 last if $j - $i < 2;
392 if (substr($pat, $j - 1, 1) eq $curchar) {
393 # Here, is a posix class
394 $modified .= substr($pat, $i, $j - $i + 1) . " ";
395 $i = $j;
396 next CHAR;
397 }
398 }
399 }
400
401 # Here wasn't a posix class, just process normally
402 $modified .= " $curchar ";
403 }
404
405 if ($in_brackets && ! $skip) {
406 if ($result eq 'c') {
407 $skip++;
408 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
409 }
410 else {
92b05f28 411 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
073b366a
KW
412 next TEST;
413 }
414 }
415
416 # Use our modified pattern instead of the original
417 $pat = $modified;
418 }
419 }
1de06328 420
cf549b97
DC
421 for my $study ('', 'study $subject;', 'utf8::upgrade($subject);',
422 'utf8::upgrade($subject); study $subject;') {
93f09d7b 423 # Need to make a copy, else the utf8::upgrade of an already studied
52e33015
NC
424 # scalar confuses things.
425 my $subject = $subject;
e426a4af 426 $subject = XS::APItest::string_without_null($subject) if $no_null;
1286eaeb
NC
427 my $c = $iters;
428 my ($code, $match, $got);
1de06328 429 if ($repl eq 'pos') {
933f1527
FC
430 my $patcode = defined $no_null_pat ? '/$no_null_pat/g'
431 : "m${pat}g";
1de06328 432 $code= <<EOFCODE;
cf549b97 433 $study
1de06328 434 pos(\$subject)=0;
933f1527 435 \$match = ( \$subject =~ $patcode );
1de06328
YO
436 \$got = pos(\$subject);
437EOFCODE
438 }
439 elsif ($qr_embed) {
440 $code= <<EOFCODE;
441 my \$RE = qr$pat;
cf549b97 442 $study
1de06328
YO
443 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
444 \$got = "$repl";
445EOFCODE
446 }
e3faa678
NC
447 elsif ($qr_embed_thr) {
448 $code= <<EOFCODE;
449 # Can't run the match in a subthread, but can do this and
450 # clone the pattern the other way.
451 my \$RE = threads->new(sub {qr$pat})->join();
cf549b97 452 $study
e3faa678
NC
453 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
454 \$got = "$repl";
455EOFCODE
456 }
933f1527
FC
457 elsif ($no_null) {
458 my $patcode = defined $no_null_pat ? '/$no_null_pat/'
459 : $pat;
460 $code= <<EOFCODE;
461 $study
462 \$match = (\$subject =~ $OP$pat) while \$c--;
463 \$got = "$repl";
464EOFCODE
465 }
1de06328
YO
466 else {
467 $code= <<EOFCODE;
cf549b97 468 $study
1286eaeb 469 \$match = (\$subject =~ $OP$pat) while \$c--;
1de06328
YO
470 \$got = "$repl";
471EOFCODE
472 }
073b366a 473 $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
e1d1eefb
YO
474 #$code.=qq[\n\$expect="$expect";\n];
475 #use Devel::Peek;
476 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
66fb63c1
NC
477 {
478 # Probably we should annotate specific tests with which warnings
479 # categories they're known to trigger, and hence should be
480 # disabled just for that test
d8d26cac 481 no warnings qw(uninitialized regexp deprecated);
66fb63c1
NC
482 eval $code;
483 }
1286eaeb 484 chomp( my $err = $@ );
565b86e2 485 if ( $skip ) {
92b05f28 486 print "ok $testname # skipped", length($reason) ? ". $reason" : '', "\n";
ee595aa6 487 next TEST;
cf93c79d 488 }
565b86e2 489 elsif ($result eq 'c') {
92b05f28 490 if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST }
565b86e2
KW
491 last; # no need to study a syntax error
492 }
24d786f4 493 elsif ( $todo_qr ) {
92b05f28 494 print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n";
e3faa678
NC
495 next TEST;
496 }
c277df42 497 elsif ($@) {
92b05f28 498 print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
c277df42 499 }
e3faa678 500 elsif ($result =~ /^n/) {
92b05f28 501 if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST }
378cc40b
LW
502 }
503 else {
cfa4f241 504 if (!$match || $got ne $expect) {
cde0cee5 505 eval { require Data::Dumper };
969c44e7 506 no warnings "utf8"; # But handle should be utf8
65016092
NC
507 if ($@ || !defined &DynaLoader::boot_DynaLoader) {
508 # Data::Dumper will load on miniperl, but fail when used in
509 # anger as it tries to load B. I'd prefer to keep the
510 # regular calls below outside of an eval so that real
511 # (unknown) failures get spotted, not ignored.
92b05f28 512 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
cde0cee5
YO
513 }
514 else { # better diagnostics
515 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
516 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
6ea2424c
DC
517 my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump;
518 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n");
cde0cee5 519 }
cfa4f241
CS
520 next TEST;
521 }
378cc40b
LW
522 }
523 }
92b05f28 524 print "ok $testname$todo\n";
378cc40b 525}
cfa4f241 526
e7206367
KW
527printf "1..%d\n# $iters iterations\n", scalar @tests;
528
1a610890 5291;