This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
customise Pod::Perldoc to fix output misbehaviour
[perl5.git] / t / re / regexp.t
CommitLineData
378cc40b
LW
1#!./perl
2
ae34ee58 3# The tests are in a separate file 't/re/re_tests'.
ad4f75a6
HM
4# Each line in that file is a separate test.
5# There are five columns, separated by tabs.
d6395ff9 6# An optional sixth column is used to give a reason, only when skipping tests
ad4f75a6 7#
71c89c82
FC
8# Column 1 contains the pattern, optionally enclosed in C<''> C<::> or
9# C<//>. Modifiers can be put after the closing delimiter. C<''> will
10# automatically be added to any other patterns.
ad4f75a6
HM
11#
12# Column 2 contains the string to be matched.
13#
14# Column 3 contains the expected result:
15# y expect a match
16# n expect no match
17# c expect an error
24d786f4 18# T the test is a TODO (can be combined with y/n/c)
cb6fa888 19# M skip test on miniperl (combine with y/n/c/T)
cf93c79d
IZ
20# B test exposes a known bug in Perl, should be skipped
21# b test exposes a known bug in Perl, should be skipped if noamp
e3faa678 22# t test exposes a bug with threading, TODO if qr_embed_thr
073b366a
KW
23# s test should only be run for regex_sets_compat.t
24# S test should not be run for regex_sets_compat.t
c46c4601
KW
25# a test should only be run on ASCII platforms
26# e test should only be run on EBCDIC platforms
ad4f75a6 27#
1b1626e4 28# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
ad4f75a6
HM
29#
30# Column 4 contains a string, usually C<$&>.
31#
32# Column 5 contains the expected result of double-quote
c277df42
IZ
33# interpolating that string after the match, or start of error message.
34#
ee595aa6
LC
35# Column 6, if present, contains a reason why the test is skipped.
36# This is printed with "skipped", for harness to pick up.
37#
3238b147
KW
38# Column 7 can be used for comments
39#
9d116dd7 40# \n in the tests are interpolated, as are variables of the form ${\w+}.
83e898de 41#
b9b4dddf
YO
42# Blanks lines are treated as PASSING tests to keep the line numbers
43# linked to the test number.
44#
8d37f932 45# If you want to add a regular expression test that can't be expressed
67a2b8c6 46# in this format, don't add it here: put it in re/pat.t instead.
b2a156bd 47#
ff3f963a
KW
48# Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
49# This means this file cannot be used for testing anything that the lexer
50# handles; in 5.12 this means just \N{NAME} and \N{U+...}.
51#
b2a156bd
DM
52# Note that columns 2,3 and 5 are all enclosed in double quotes and then
53# evalled; so something like a\"\x{100}$1 has length 3+length($1).
d6395ff9
KW
54#
55# \x... and \o{...} constants are automatically converted to the native
56# character set if necessary. \[0-7] constants aren't
c277df42 57
7e1dab6a 58my ($file, $iters);
e4d48cc9 59BEGIN {
1a610890
NC
60 $iters = shift || 1; # Poor man performance suite, 10000 is OK.
61
62 # Do this open before any chdir
63 $file = shift;
64 if (defined $file) {
65 open TESTS, $file or die "Can't open $file";
66 }
67
e4d48cc9 68 chdir 't' if -d 't';
cc7e6304 69 @INC = qw '../lib ../ext/re';
87709a14
FC
70 if (!defined &DynaLoader::boot_DynaLoader) { # miniperl
71 print("1..0 # Skip Unicode tables not built yet\n"), exit
72 unless eval 'require "unicore/Heavy.pl"';
73 }
e4d48cc9 74}
1a610890 75
1b7228c9
KW
76sub _comment {
77 return map { /^#/ ? "$_\n" : "# $_\n" }
78 map { split /\n/ } @_;
79}
80
d6395ff9
KW
81sub convert_from_ascii {
82 my $string = shift;
83
84 #my $save = $string;
85 # Convert \x{...}, \o{...}
86 $string =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) . "}" /gex;
87 $string =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) . "}" /gex;
88
89 # Convert \xAB
90 $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex;
91
92 # Convert \xA
93 $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex;
94
95 #print STDERR __LINE__, ": $save\n$string\n" if $save ne $string;
96 return $string;
97}
98
1286eaeb 99use strict;
66fb63c1 100use warnings FATAL=>"all";
7e1dab6a 101use vars qw($bang $ffff $nulnul); # used by the tests
e426a4af
FC
102use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets
103 $no_null); # set by our callers
073b366a 104
e4d48cc9 105
ad4f75a6 106
1a610890 107if (!defined $file) {
7e1dab6a 108 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
1a610890
NC
109}
110
111my @tests = <TESTS>;
cfa4f241 112
1a610890 113close TESTS;
378cc40b 114
9d116dd7 115$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
b8c5462f
JH
116$ffff = chr(0xff) x 2;
117$nulnul = "\0" x 2;
7e1dab6a 118my $OP = $qr ? 'qr' : 'm';
9d116dd7 119
1462b684 120$| = 1;
1a610890 121printf "1..%d\n# $iters iterations\n", scalar @tests;
e3faa678 122
1a610890 123my $test;
cfa4f241 124TEST:
1a610890
NC
125foreach (@tests) {
126 $test++;
5a51db05 127 if (!/\S/ || /^\s*#/ || /^__END__$/) {
92b05f28
YO
128 chomp;
129 my ($not,$comment)= split /\s*#\s*/, $_, 2;
130 $comment ||= "(blank line)";
131 print "ok $test # $comment\n";
b9b4dddf
YO
132 next;
133 }
b85d18e9 134 chomp;
073b366a 135 s/\\n/\n/g unless $regex_sets;
92b05f28 136 my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7);
b8f6efdd
YO
137 if (!defined $subject) {
138 die "Bad test definition on line $test: $_\n";
139 }
66fb63c1 140 $reason = '' unless defined $reason;
1286eaeb 141 my $input = join(':',$pat,$subject,$result,$repl,$expect);
d6395ff9 142
24d786f4
YO
143 # the double '' below keeps simple syntax highlighters from going crazy
144 $pat = "'$pat'" unless $pat =~ /^[:''\/]/;
9d116dd7 145 $pat =~ s/(\$\{\w+\})/$1/eeg;
073b366a 146 $pat =~ s/\\n/\n/g unless $regex_sets;
d6395ff9
KW
147 $pat = convert_from_ascii($pat) if ord("A") != 65;
148
933f1527
FC
149 my $no_null_pat;
150 if ($no_null && $pat =~ /^'(.*)'\z/) {
151 $no_null_pat = XS::APItest::string_without_null($1);
152 }
153
d6395ff9 154 $subject = convert_from_ascii($subject) if ord("A") != 65;
1a610890 155 $subject = eval qq("$subject"); die $@ if $@;
d6395ff9
KW
156
157 $expect = convert_from_ascii($expect) if ord("A") != 65;
1a610890 158 $expect = eval qq("$expect"); die $@ if $@;
c277df42 159 $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;
d6395ff9 160
24d786f4 161 my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
1286eaeb 162 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
cb6fa888 163 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
073b366a
KW
164 if ($result =~ s/ ( [Ss] ) //x) {
165 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
166 $skip++;
167 $reason = "Test not valid for $0";
168 }
169 }
c46c4601
KW
170 if ($result =~ s/a// && ord("A") != 65) {
171 $skip++;
172 $reason = "Test is only valid for ASCII platforms. $reason";
173 }
174 if ($result =~ s/e// && ord("A") != 193) {
175 $skip++;
176 $reason = "Test is only valid for EBCDIC platforms. $reason";
177 }
906e884f 178 $reason = 'skipping $&' if $reason eq '' && $skip_amp;
cf93c79d 179 $result =~ s/B//i unless $skip;
24d786f4 180 my $todo= $result =~ s/T// ? " # TODO" : "";
92b05f28
YO
181 my $testname= $test;
182 if ($comment) {
183 $comment=~s/^\s*(?:#\s*)?//;
184 $testname .= " - $comment" if $comment;
185 }
073b366a
KW
186 if (! $skip && $regex_sets) {
187
188 # If testing regex sets, change the [bracketed] classes into
58c435ec
KW
189 # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a
190 # class. (We don't bother looking for an odd number of backslashes,
191 # as this hasn't been needed so far.)
192 if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) {
073b366a
KW
193 $skip++;
194 $reason = "Pattern doesn't contain [brackets]";
195 }
196 else { # Use non-regex features of Perl to accomplish this.
197 my $modified = "";
198 my $in_brackets = 0;
199
200 # Go through the pattern character-by-character. We also add
201 # blanks around each token to test the /x parts of (?[ ])
202 my $pat_len = length($pat);
203 CHAR: for (my $i = 0; $i < $pat_len; $i++) {
204 my $curchar = substr($pat, $i, 1);
205 if ($curchar eq '\\') {
206 $modified .= " " if $in_brackets;
207 $modified .= $curchar;
208 $i++;
209
210 # Get the character the backslash is escaping
211 $curchar = substr($pat, $i, 1);
212 $modified .= $curchar;
213
214 # If the character following that is a '{}', treat the
215 # entire amount as a single token
216 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
217 my $j = index($pat, '}', $i+2);
218 if ($j < 0) {
219 last unless $in_brackets;
220 if ($result eq 'c') {
221 $skip++;
222 $reason = "Can't handle compilation errors with unmatched '{'";
223 }
224 else {
92b05f28 225 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
073b366a
KW
226 next TEST;
227 }
228 }
229 $modified .= substr($pat, $i+1, $j - $i);
230 $i = $j;
231 }
232 elsif ($curchar eq 'x') {
233
234 # \x without brackets is supposed to be followed by 2
235 # hex digits. Take up to 2, and then add a blank
236 # after the last one. This avoids getting errors from
237 # (?[ ]) for run-ons, like \xabc
238 my $j = $i + 1;
239 for (; $j < $i + 3 && $j < $pat_len; $j++) {
240 my $curord = ord(substr($pat, $j, 1));
241 if (!(($curord >= ord("A") && $curord <= ord("F"))
242 || ($curord >= ord("a") && $curord <= ord("f"))
243 || ($curord >= ord("0") && $curord <= ord("9"))))
244 {
245 $j++;
246 last;
247 }
248 }
249 $j--;
250 $modified .= substr($pat, $i + 1, $j - $i) . " ";
251 $i = $j;
252 }
253 elsif (ord($curchar) >= ord('0')
254 && (ord($curchar) <= ord('7')))
255 {
256 # Similarly, octal constants have up to 3 digits.
257 my $j = $i + 1;
258 for (; $j < $i + 3 && $j < $pat_len; $j++) {
259 my $curord = ord(substr($pat, $j, 1));
260 if (! ($curord >= ord("0") && $curord <= ord("7"))) {
261 $j++;
262 last;
263 }
264 }
265 $j--;
266 $modified .= substr($pat, $i + 1, $j - $i);
267 $i = $j;
268 }
269
270 next;
271 } # End of processing a backslash sequence
272
273 if (! $in_brackets # Skip (?{ })
274 && $curchar eq '('
275 && $i < $pat_len - 2
276 && substr($pat, $i+1, 1) eq '?'
277 && substr($pat, $i+2, 1) eq '{')
278 {
279 $skip++;
280 $reason = "Pattern contains '(?{'";
281 last;
282 }
283
284 # Closing ']'
285 if ($curchar eq ']' && $in_brackets) {
286 $modified .= " ] ])";
287 $in_brackets = 0;
288 next;
289 }
290
291 # A regular character.
292 if ($curchar ne '[') {
293 if (! $in_brackets) {
294 $modified .= $curchar;
295 }
296 else {
297 $modified .= " $curchar ";
298 }
299 next;
300 }
301
302 # Here is a '['; If not in a bracketed class, treat as the
303 # beginning of one.
304 if (! $in_brackets) {
305 $in_brackets = 1;
306 $modified .= "(?[ [ ";
307
308 # An immediately following ']' or '^]' is not the ending
309 # of the class, but is to be treated literally.
310 if ($i < $pat_len - 1
311 && substr($pat, $i+1, 1) eq ']')
312 {
313 $i ++;
314 $modified .= " ] ";
315 }
316 elsif ($i < $pat_len - 2
317 && substr($pat, $i+1, 1) eq '^'
318 && substr($pat, $i+2, 1) eq ']')
319 {
320 $i += 2;
321 $modified .= " ^ ] ";
322 }
323 next;
324 }
325
326 # Here is a plain '[' within [ ]. Could mean wants to
327 # match a '[', or it could be a posix class that has a
328 # corresponding ']'. Absorb either
329
330 $modified .= ' [';
331 last if $i >= $pat_len - 1;
332
333 $i++;
334 $curchar = substr($pat, $i, 1);
335 if ($curchar =~ /[:=.]/) {
336 for (my $j = $i + 1; $j < $pat_len; $j++) {
337 next unless substr($pat, $j, 1) eq ']';
338 last if $j - $i < 2;
339 if (substr($pat, $j - 1, 1) eq $curchar) {
340 # Here, is a posix class
341 $modified .= substr($pat, $i, $j - $i + 1) . " ";
342 $i = $j;
343 next CHAR;
344 }
345 }
346 }
347
348 # Here wasn't a posix class, just process normally
349 $modified .= " $curchar ";
350 }
351
352 if ($in_brackets && ! $skip) {
353 if ($result eq 'c') {
354 $skip++;
355 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
356 }
357 else {
92b05f28 358 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
073b366a
KW
359 next TEST;
360 }
361 }
362
363 # Use our modified pattern instead of the original
364 $pat = $modified;
365 }
366 }
1de06328 367
cf549b97
DC
368 for my $study ('', 'study $subject;', 'utf8::upgrade($subject);',
369 'utf8::upgrade($subject); study $subject;') {
93f09d7b 370 # Need to make a copy, else the utf8::upgrade of an already studied
52e33015
NC
371 # scalar confuses things.
372 my $subject = $subject;
e426a4af 373 $subject = XS::APItest::string_without_null($subject) if $no_null;
1286eaeb
NC
374 my $c = $iters;
375 my ($code, $match, $got);
1de06328 376 if ($repl eq 'pos') {
933f1527
FC
377 my $patcode = defined $no_null_pat ? '/$no_null_pat/g'
378 : "m${pat}g";
1de06328 379 $code= <<EOFCODE;
cf549b97 380 $study
1de06328 381 pos(\$subject)=0;
933f1527 382 \$match = ( \$subject =~ $patcode );
1de06328
YO
383 \$got = pos(\$subject);
384EOFCODE
385 }
386 elsif ($qr_embed) {
387 $code= <<EOFCODE;
388 my \$RE = qr$pat;
cf549b97 389 $study
1de06328
YO
390 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
391 \$got = "$repl";
392EOFCODE
393 }
e3faa678
NC
394 elsif ($qr_embed_thr) {
395 $code= <<EOFCODE;
396 # Can't run the match in a subthread, but can do this and
397 # clone the pattern the other way.
398 my \$RE = threads->new(sub {qr$pat})->join();
cf549b97 399 $study
e3faa678
NC
400 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
401 \$got = "$repl";
402EOFCODE
403 }
933f1527
FC
404 elsif ($no_null) {
405 my $patcode = defined $no_null_pat ? '/$no_null_pat/'
406 : $pat;
407 $code= <<EOFCODE;
408 $study
409 \$match = (\$subject =~ $OP$pat) while \$c--;
410 \$got = "$repl";
411EOFCODE
412 }
1de06328
YO
413 else {
414 $code= <<EOFCODE;
cf549b97 415 $study
1286eaeb 416 \$match = (\$subject =~ $OP$pat) while \$c--;
1de06328
YO
417 \$got = "$repl";
418EOFCODE
419 }
073b366a 420 $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
e1d1eefb
YO
421 #$code.=qq[\n\$expect="$expect";\n];
422 #use Devel::Peek;
423 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
66fb63c1
NC
424 {
425 # Probably we should annotate specific tests with which warnings
426 # categories they're known to trigger, and hence should be
427 # disabled just for that test
d8d26cac 428 no warnings qw(uninitialized regexp deprecated);
66fb63c1
NC
429 eval $code;
430 }
1286eaeb 431 chomp( my $err = $@ );
565b86e2 432 if ( $skip ) {
92b05f28 433 print "ok $testname # skipped", length($reason) ? ". $reason" : '', "\n";
ee595aa6 434 next TEST;
cf93c79d 435 }
565b86e2 436 elsif ($result eq 'c') {
92b05f28 437 if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST }
565b86e2
KW
438 last; # no need to study a syntax error
439 }
24d786f4 440 elsif ( $todo_qr ) {
92b05f28 441 print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n";
e3faa678
NC
442 next TEST;
443 }
c277df42 444 elsif ($@) {
92b05f28 445 print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
c277df42 446 }
e3faa678 447 elsif ($result =~ /^n/) {
92b05f28 448 if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST }
378cc40b
LW
449 }
450 else {
cfa4f241 451 if (!$match || $got ne $expect) {
cde0cee5 452 eval { require Data::Dumper };
969c44e7 453 no warnings "utf8"; # But handle should be utf8
65016092
NC
454 if ($@ || !defined &DynaLoader::boot_DynaLoader) {
455 # Data::Dumper will load on miniperl, but fail when used in
456 # anger as it tries to load B. I'd prefer to keep the
457 # regular calls below outside of an eval so that real
458 # (unknown) failures get spotted, not ignored.
92b05f28 459 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
cde0cee5
YO
460 }
461 else { # better diagnostics
462 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
463 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
6ea2424c
DC
464 my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump;
465 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n");
cde0cee5 466 }
cfa4f241
CS
467 next TEST;
468 }
378cc40b
LW
469 }
470 }
92b05f28 471 print "ok $testname$todo\n";
378cc40b 472}
cfa4f241 473
1a610890 4741;