This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
t/re/regexp.t: Better formatting for test failures
[perl5.git] / t / re / regexp.t
CommitLineData
378cc40b
LW
1#!./perl
2
ae34ee58 3# The tests are in a separate file 't/re/re_tests'.
ad4f75a6
HM
4# Each line in that file is a separate test.
5# There are five columns, separated by tabs.
d6395ff9 6# An optional sixth column is used to give a reason, only when skipping tests
ad4f75a6
HM
7#
8# Column 1 contains the pattern, optionally enclosed in C<''>.
9# Modifiers can be put after the closing C<'>.
10#
11# Column 2 contains the string to be matched.
12#
13# Column 3 contains the expected result:
14# y expect a match
15# n expect no match
16# c expect an error
24d786f4 17# T the test is a TODO (can be combined with y/n/c)
cb6fa888 18# M skip test on miniperl (combine with y/n/c/T)
cf93c79d
IZ
19# B test exposes a known bug in Perl, should be skipped
20# b test exposes a known bug in Perl, should be skipped if noamp
e3faa678 21# t test exposes a bug with threading, TODO if qr_embed_thr
073b366a
KW
22# s test should only be run for regex_sets_compat.t
23# S test should not be run for regex_sets_compat.t
c46c4601
KW
24# a test should only be run on ASCII platforms
25# e test should only be run on EBCDIC platforms
ad4f75a6 26#
1b1626e4 27# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
ad4f75a6
HM
28#
29# Column 4 contains a string, usually C<$&>.
30#
31# Column 5 contains the expected result of double-quote
c277df42
IZ
32# interpolating that string after the match, or start of error message.
33#
ee595aa6
LC
34# Column 6, if present, contains a reason why the test is skipped.
35# This is printed with "skipped", for harness to pick up.
36#
3238b147
KW
37# Column 7 can be used for comments
38#
9d116dd7 39# \n in the tests are interpolated, as are variables of the form ${\w+}.
83e898de 40#
b9b4dddf
YO
41# Blank lines are treated as PASSING tests to keep the line numbers
42# linked to the test number.
43#
8d37f932 44# If you want to add a regular expression test that can't be expressed
67a2b8c6 45# in this format, don't add it here: put it in re/pat.t instead.
b2a156bd 46#
ff3f963a
KW
47# Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
48# This means this file cannot be used for testing anything that the lexer
49# handles; in 5.12 this means just \N{NAME} and \N{U+...}.
50#
b2a156bd
DM
51# Note that columns 2, 4 and 5 are all enclosed in double quotes and then
52# evalled; so something like a\"\x{100}$1 has length 3+length($1).
d6395ff9
KW
53#
54# \x... and \o{...} constants are automatically converted to the native
55# character set if necessary. \[0-7] constants aren't converted.
c277df42 56
# $file  - optional path to an alternative tests file (second command-line
#          argument); left undefined when not given.
# $iters - number of times each match is repeated (first command-line
#          argument, defaulting to 1).
my ($file, $iters);
BEGIN {
    $iters = shift || 1;	# Poor man performance suite, 10000 is OK.

    # Do this open before any chdir, so that a relative path given on the
    # command line is resolved against the directory we were started in.
    $file = shift;
    if (defined $file) {
        # Three-argument open: the name is always treated as a plain file
        # to read, whatever characters it contains.
        open TESTS, '<', $file or die "Can't open $file: $!";
    }

    chdir 't' if -d 't';
    @INC = qw '../lib ../ext/re';
    if (!defined &DynaLoader::boot_DynaLoader) { # miniperl
        # Without the Unicode tables there is nothing useful to run, so
        # emit a skip-all plan and stop.
        print("1..0 # Skip Unicode tables not built yet\n"), exit
            unless eval 'require "unicore/Heavy.pl"';
    }
}
1a610890 74
1b7228c9
KW
# Turn the given strings into a list of diagnostic comment lines.
#
# Every argument is split on newlines; each resulting line is returned with
# a trailing "\n", and is prefixed with "# " unless it already starts with
# a "#".  Empty trailing lines are dropped by split, so an empty argument
# contributes nothing.
sub _comment {
    return map { /^#/ ? "$_\n" : "# $_\n" }
           map { split /\n/ } @_;
}
79
d6395ff9
KW
# Rewrite the hex and octal escapes in a test string so that the code
# points they name are the platform-native ones.
#
# Takes one string and returns the converted copy.  Each numeric value is
# passed through utf8::unicode_to_native() and written back in the same
# escape form.  An escape whose backslash is itself preceded by a backslash
# is left untouched.  Handled forms: \x{...}, \o{...}, \xAB and \xA.
sub convert_from_ascii {
    my $string = shift;

    #my $save = $string;
    # Convert \x{...}, \o{...}
    $string =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) .  "}" /gex;
    $string =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) .  "}" /gex;

    # Convert \xAB
    $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex;

    # Convert \xA (a single hex digit not followed by another one, so the
    # two-digit results produced just above are not converted a second time)
    $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex;

    #print STDERR __LINE__, ": $save\n$string\n" if $save ne $string;
    return $string;
}
97
use strict;
use warnings FATAL=>"all";
use vars qw($bang $ffff $nulnul);       # used by the tests
use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers

# When no tests file was given on the command line (and so none was opened
# in the BEGIN block), fall back to the default one.
if (!defined $file) {
    open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
}

# One element per line of the tests file; the index (plus one) is the test
# number.
my @tests = <TESTS>;

close TESTS;

$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
$ffff  = chr(0xff) x 2;
$nulnul = "\0" x 2;
my $OP = $qr ? 'qr' : 'm';      # operator used to build each match expression

$| = 1;
# $iters is passed as an argument rather than interpolated into the format,
# so that a '%' in it cannot be taken as a conversion.
printf "1..%d\n# %s iterations\n", scalar @tests, $iters;
e3faa678 120
# Main driver: one TAP result is printed per line of the tests file.
#
# For each line the columns are split out, the result flags (column 3) are
# decoded into skip/TODO state, the pattern is optionally rewritten into
# (?[ ... ]) form when $regex_sets is set, and then the match is run four
# times (plain, studied, utf8-upgraded, and upgraded + studied).  The first
# failure prints "not ok" and moves on to the next line.
my $test;
TEST:
foreach (@tests) {
    $test++;

    # Blank lines, comment lines and __END__ count as passing tests so that
    # test numbers stay equal to line numbers.
    if (!/\S/ || /^\s*#/ || /^__END__$/) {
        chomp;
        my ($not,$comment)= split /\s*#\s*/, $_, 2;
        $comment ||= "(blank line)";
        print "ok $test # $comment\n";
        next;
    }
    chomp;
    s/\\n/\n/g unless $regex_sets;
    my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7);
    if (!defined $subject) {
        die "Bad test definition on line $test: $_\n";
    }
    $reason = '' unless defined $reason;
    my $input = join(':',$pat,$subject,$result,$repl,$expect);

    # the double '' below keeps simple syntax highlighters from going crazy
    $pat = "'$pat'" unless $pat =~ /^[:''\/]/;
    $pat =~ s/(\$\{\w+\})/$1/eeg;
    $pat =~ s/\\n/\n/g unless $regex_sets;
    $pat = convert_from_ascii($pat) if ord("A") != 65;

    $subject = convert_from_ascii($subject) if ord("A") != 65;
    $subject = eval qq("$subject"); die $@ if $@;

    $expect = convert_from_ascii($expect) if ord("A") != 65;
    $expect  = eval qq("$expect"); die $@ if $@;
    $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;

    # Decode the flag letters in the result column.  Each flag is removed
    # from $result as it is recognised, leaving just y, n or c.
    my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
    my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
    ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
    if ($result =~ s/ ( [Ss] ) //x) {
        if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
            $skip++;
            $reason = "Test not valid for $0";
        }
    }
    if ($result =~ s/a// && ord("A") != 65) {
        $skip++;
        $reason = "Test is only valid for ASCII platforms. $reason";
    }
    if ($result =~ s/e// && ord("A") != 193) {
        $skip++;
        $reason = "Test is only valid for EBCDIC platforms. $reason";
    }
    $reason = 'skipping $&' if $reason eq '' && $skip_amp;
    $result =~ s/B//i unless $skip;
    my $todo= $result =~ s/T// ? " # TODO" : "";
    my $testname= $test;
    if ($comment) {
        $comment=~s/^\s*(?:#\s*)?//;
        $testname .= " - $comment" if $comment;
    }
    if (! $skip && $regex_sets) {

        # If testing regex sets, change the [bracketed] classes into
        # (?[bracketed]).  But note that '\[' and '\c[' don't introduce such a
        # class.  (We don't bother looking for an odd number of backslashes,
        # as this hasn't been needed so far.)
        if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) {
            $skip++;
            $reason = "Pattern doesn't contain [brackets]";
        }
        else { # Use non-regex features of Perl to accomplish this.
            my $modified = "";
            my $in_brackets = 0;

            # Go through the pattern character-by-character.  We also add
            # blanks around each token to test the /x parts of (?[ ])
            my $pat_len = length($pat);
      CHAR: for (my $i = 0; $i < $pat_len; $i++) {
                my $curchar = substr($pat, $i, 1);
                if ($curchar eq '\\') {
                    $modified .= " " if $in_brackets;
                    $modified .= $curchar;
                    $i++;

                    # Get the character the backslash is escaping
                    $curchar = substr($pat, $i, 1);
                    $modified .= $curchar;

                    # If the character following that is a '{}', treat the
                    # entire amount as a single token
                    if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
                        my $j = index($pat, '}', $i+2);
                        if ($j < 0) {
                            last unless $in_brackets;
                            if ($result eq 'c') {
                                $skip++;
                                $reason = "Can't handle compilation errors with unmatched '{'";

                                # Stop scanning here.  Falling through
                                # would set $i to $j (-1) and restart the
                                # scan from the beginning of the pattern,
                                # reaching this same spot again, forever.
                                last CHAR;
                            }
                            else {
                                print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
                                next TEST;
                            }
                        }
                        $modified .= substr($pat, $i+1, $j - $i);
                        $i = $j;
                    }
                    elsif ($curchar eq 'x') {

                        # \x without brackets is supposed to be followed by 2
                        # hex digits.  Take up to 2, and then add a blank
                        # after the last one.  This avoids getting errors from
                        # (?[ ]) for run-ons, like \xabc
                        my $j = $i + 1;
                        for (; $j < $i + 3 && $j < $pat_len; $j++) {
                            my $curord = ord(substr($pat, $j, 1));
                            if (!(($curord >= ord("A") && $curord <= ord("F"))
                                 || ($curord >= ord("a") && $curord <= ord("f"))
                                 || ($curord >= ord("0") && $curord <= ord("9"))))
                            {
                                $j++;
                                last;
                            }
                        }
                        $j--;
                        $modified .= substr($pat, $i + 1, $j - $i) . " ";
                        $i = $j;
                    }
                    elsif (ord($curchar) >= ord('0')
                           && (ord($curchar) <= ord('7')))
                    {
                        # Similarly, octal constants have up to 3 digits.
                        my $j = $i + 1;
                        for (; $j < $i + 3 && $j < $pat_len; $j++) {
                            my $curord = ord(substr($pat, $j, 1));
                            if (! ($curord >= ord("0") &&  $curord <= ord("7"))) {
                                $j++;
                                last;
                            }
                        }
                        $j--;
                        $modified .= substr($pat, $i + 1, $j - $i);
                        $i = $j;
                    }

                    next;
                } # End of processing a backslash sequence

                if (! $in_brackets  # Skip (?{ })
                    && $curchar eq '('
                    && $i < $pat_len - 2
                    && substr($pat, $i+1, 1) eq '?'
                    && substr($pat, $i+2, 1) eq '{')
                {
                    $skip++;
                    $reason = "Pattern contains '(?{'";
                    last;
                }

                # Closing ']'
                if ($curchar eq ']' && $in_brackets) {
                    $modified .= " ] ])";
                    $in_brackets = 0;
                    next;
                }

                # A regular character.
                if ($curchar ne '[') {
                    if (! $in_brackets) {
                        $modified .= $curchar;
                    }
                    else {
                        $modified .= " $curchar ";
                    }
                    next;
                }

                # Here is a '['; If not in a bracketed class, treat as the
                # beginning of one.
                if (! $in_brackets) {
                    $in_brackets = 1;
                    $modified .= "(?[ [ ";

                    # An immediately following ']' or '^]' is not the ending
                    # of the class, but is to be treated literally.
                    if ($i < $pat_len - 1
                        && substr($pat, $i+1, 1) eq ']')
                    {
                        $i ++;
                        $modified .= " ] ";
                    }
                    elsif ($i < $pat_len - 2
                            && substr($pat, $i+1, 1) eq '^'
                            && substr($pat, $i+2, 1) eq ']')
                    {
                        $i += 2;
                        $modified .= " ^ ] ";
                    }
                    next;
                }

                # Here is a plain '[' within [ ].  Could mean wants to
                # match a '[', or it could be a posix class that has a
                # corresponding ']'.  Absorb either

                $modified .= ' [';
                last if $i >= $pat_len - 1;

                $i++;
                $curchar = substr($pat, $i, 1);
                if ($curchar =~ /[:=.]/) {
                    for (my $j = $i + 1; $j < $pat_len; $j++) {
                        next unless substr($pat, $j, 1) eq ']';
                        last if $j - $i < 2;
                        if (substr($pat, $j - 1, 1) eq $curchar) {
                            # Here, is a posix class
                            $modified .= substr($pat, $i, $j - $i + 1) . " ";
                            $i = $j;
                            next CHAR;
                        }
                    }
                }

                # Here wasn't a posix class, just process normally
                $modified .= " $curchar ";
            }

            if ($in_brackets && ! $skip) {
                if ($result eq 'c') {
                    $skip++;
                    $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
                }
                else {
                    print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n";
                    next TEST;
                }
            }

            # Use our modified pattern instead of the original
            $pat = $modified;
        }
    }

    for my $study ('', 'study $subject', 'utf8::upgrade($subject)',
                   'utf8::upgrade($subject); study $subject') {
        # Need to make a copy, else the utf8::upgrade of an already studied
        # scalar confuses things.
        my $subject = $subject;
        my $c = $iters;
        my ($code, $match, $got);

        # Build the source text of the match to run.  The escaped variables
        # (\$match, \$got, ...) are resolved when $code is eval'd below;
        # $study, $pat, $repl and $OP are interpolated now.
        if ($repl eq 'pos') {
            $code= <<EOFCODE;
                $study;
                pos(\$subject)=0;
                \$match = ( \$subject =~ m${pat}g );
                \$got = pos(\$subject);
EOFCODE
        }
        elsif ($qr_embed) {
            $code= <<EOFCODE;
                my \$RE = qr$pat;
                $study;
                \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
                \$got = "$repl";
EOFCODE
        }
        elsif ($qr_embed_thr) {
            $code= <<EOFCODE;
                # Can't run the match in a subthread, but can do this and
                # clone the pattern the other way.
                my \$RE = threads->new(sub {qr$pat})->join();
                $study;
                \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
                \$got = "$repl";
EOFCODE
        }
        else {
            $code= <<EOFCODE;
                $study;
                \$match = (\$subject =~ $OP$pat) while \$c--;
                \$got = "$repl";
EOFCODE
        }
        $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
        #$code.=qq[\n\$expect="$expect";\n];
        #use Devel::Peek;
        #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
        {
            # Probably we should annotate specific tests with which warnings
            # categories they're known to trigger, and hence should be
            # disabled just for that test
            no warnings qw(uninitialized regexp deprecated);
            eval $code;
        }
        chomp( my $err = $@ );
        if ( $skip ) {
            print "ok $testname # skipped", length($reason) ? ".  $reason" : '', "\n";
            next TEST;
        }
        elsif ($result eq 'c') {
            if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST }
            last;  # no need to study a syntax error
        }
        elsif ( $todo_qr ) {
            print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n";
            next TEST;
        }
        elsif ($@) {
            print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
        }
        elsif ($result =~ /^n/) {
            if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST }
        }
        else {
            if (!$match || $got ne $expect) {
                eval { require Data::Dumper };
                no warnings "utf8"; # But handle should be utf8
                if ($@ || !defined &DynaLoader::boot_DynaLoader) {
                    # Data::Dumper will load on miniperl, but fail when used in
                    # anger as it tries to load B. I'd prefer to keep the
                    # regular calls below outside of an eval so that real
                    # (unknown) failures get spotted, not ignored.
                    print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
                }
                else { # better diagnostics
                    my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
                    my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
                    my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump;
                    print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n");
                }
                next TEST;
            }
        }
    }
    print "ok $testname$todo\n";
}

1;