Commit | Line | Data |
---|---|---|
378cc40b LW |
1 | #!./perl |
2 | ||
ae34ee58 | 3 | # The tests are in a separate file 't/re/re_tests'. |
ad4f75a6 HM |
4 | # Each line in that file is a separate test. |
5 | # There are five columns, separated by tabs. | |
6 | # | |
7 | # Column 1 contains the pattern, optionally enclosed in C<''>. | |
8 | # Modifiers can be put after the closing C<'>. | |
9 | # | |
10 | # Column 2 contains the string to be matched. | |
11 | # | |
12 | # Column 3 contains the expected result: | |
13 | # y expect a match | |
14 | # n expect no match | |
15 | # c expect an error | |
24d786f4 | 16 | # T the test is a TODO (can be combined with y/n/c) |
cb6fa888 | 17 | # M skip test on miniperl (combine with y/n/c/T) |
cf93c79d IZ |
18 | # B test exposes a known bug in Perl, should be skipped |
19 | # b test exposes a known bug in Perl, should be skipped if noamp | |
e3faa678 | 20 | # t test exposes a bug with threading, TODO if qr_embed_thr |
073b366a KW |
21 | # s test should only be run for regex_sets_compat.t |
22 | # S test should not be run for regex_sets_compat.t | |
ad4f75a6 | 23 | # |
1b1626e4 | 24 | # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. |
ad4f75a6 HM |
25 | # |
26 | # Column 4 contains a string, usually C<$&>. | |
27 | # | |
28 | # Column 5 contains the expected result of double-quote | |
c277df42 IZ |
29 | # interpolating that string after the match, or start of error message. |
30 | # | |
ee595aa6 LC |
31 | # Column 6, if present, contains a reason why the test is skipped. |
32 | # This is printed with "skipped", for harness to pick up. | |
33 | # | |
3238b147 KW |
34 | # Column 7 can be used for comments |
35 | # | |
9d116dd7 | 36 | # \n in the tests are interpolated, as are variables of the form ${\w+}. |
83e898de | 37 | # |
b9b4dddf YO |
38 | # Blanks lines are treated as PASSING tests to keep the line numbers |
39 | # linked to the test number. | |
40 | # | |
8d37f932 | 41 | # If you want to add a regular expression test that can't be expressed |
67a2b8c6 | 42 | # in this format, don't add it here: put it in re/pat.t instead. |
b2a156bd | 43 | # |
ff3f963a KW |
44 | # Note that the inputs get passed on as "m're'", so the re bypasses the lexer. |
45 | # This means this file cannot be used for testing anything that the lexer | |
46 | # handles; in 5.12 this means just \N{NAME} and \N{U+...}. | |
47 | # | |
b2a156bd DM |
48 | # Note that columns 2,3 and 5 are all enclosed in double quotes and then |
49 | # evalled; so something like a\"\x{100}$1 has length 3+length($1). | |
c277df42 | 50 | |
7e1dab6a | 51 | my ($file, $iters); |
e4d48cc9 | 52 | BEGIN { |
1a610890 NC |
53 | $iters = shift || 1; # Poor man performance suite, 10000 is OK. |
54 | ||
55 | # Do this open before any chdir | |
56 | $file = shift; | |
57 | if (defined $file) { | |
58 | open TESTS, $file or die "Can't open $file"; | |
59 | } | |
60 | ||
e4d48cc9 | 61 | chdir 't' if -d 't'; |
20822f61 | 62 | @INC = '../lib'; |
e3faa678 | 63 | |
e4d48cc9 | 64 | } |
1a610890 | 65 | |
1b7228c9 KW |
66 | sub _comment { |
67 | return map { /^#/ ? "$_\n" : "# $_\n" } | |
68 | map { split /\n/ } @_; | |
69 | } | |
70 | ||
1286eaeb | 71 | use strict; |
66fb63c1 | 72 | use warnings FATAL=>"all"; |
7e1dab6a | 73 | use vars qw($bang $ffff $nulnul); # used by the tests |
073b366a KW |
74 | use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers |
75 | ||
e4d48cc9 | 76 | |
ad4f75a6 | 77 | |
1a610890 | 78 | if (!defined $file) { |
7e1dab6a | 79 | open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; |
1a610890 NC |
80 | } |
81 | ||
82 | my @tests = <TESTS>; | |
cfa4f241 | 83 | |
1a610890 | 84 | close TESTS; |
378cc40b | 85 | |
9d116dd7 | 86 | $bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. |
b8c5462f JH |
87 | $ffff = chr(0xff) x 2; |
88 | $nulnul = "\0" x 2; | |
7e1dab6a | 89 | my $OP = $qr ? 'qr' : 'm'; |
9d116dd7 | 90 | |
1462b684 | 91 | $| = 1; |
1a610890 | 92 | printf "1..%d\n# $iters iterations\n", scalar @tests; |
e3faa678 | 93 | |
1a610890 | 94 | my $test; |
cfa4f241 | 95 | TEST: |
1a610890 NC |
96 | foreach (@tests) { |
97 | $test++; | |
5a51db05 | 98 | if (!/\S/ || /^\s*#/ || /^__END__$/) { |
1a610890 | 99 | print "ok $test # (Blank line or comment)\n"; |
5a51db05 | 100 | if (/#/) { print $_ }; |
b9b4dddf YO |
101 | next; |
102 | } | |
b85d18e9 | 103 | chomp; |
073b366a | 104 | s/\\n/\n/g unless $regex_sets; |
1286eaeb | 105 | my ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6); |
b8f6efdd YO |
106 | if (!defined $subject) { |
107 | die "Bad test definition on line $test: $_\n"; | |
108 | } | |
66fb63c1 | 109 | $reason = '' unless defined $reason; |
1286eaeb | 110 | my $input = join(':',$pat,$subject,$result,$repl,$expect); |
24d786f4 YO |
111 | # the double '' below keeps simple syntax highlighters from going crazy |
112 | $pat = "'$pat'" unless $pat =~ /^[:''\/]/; | |
9d116dd7 | 113 | $pat =~ s/(\$\{\w+\})/$1/eeg; |
073b366a | 114 | $pat =~ s/\\n/\n/g unless $regex_sets; |
1a610890 NC |
115 | $subject = eval qq("$subject"); die $@ if $@; |
116 | $expect = eval qq("$expect"); die $@ if $@; | |
c277df42 | 117 | $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; |
24d786f4 | 118 | my $todo_qr = $qr_embed_thr && ($result =~ s/t//); |
1286eaeb | 119 | my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); |
cb6fa888 | 120 | ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; |
073b366a KW |
121 | if ($result =~ s/ ( [Ss] ) //x) { |
122 | if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { | |
123 | $skip++; | |
124 | $reason = "Test not valid for $0"; | |
125 | } | |
126 | } | |
906e884f | 127 | $reason = 'skipping $&' if $reason eq '' && $skip_amp; |
cf93c79d | 128 | $result =~ s/B//i unless $skip; |
24d786f4 | 129 | my $todo= $result =~ s/T// ? " # TODO" : ""; |
073b366a KW |
130 | if (! $skip && $regex_sets) { |
131 | ||
132 | # If testing regex sets, change the [bracketed] classes into | |
133 | # (?[bracketed]). | |
134 | ||
135 | if ($pat !~ / \[ /x) { | |
136 | ||
137 | $skip++; | |
138 | $reason = "Pattern doesn't contain [brackets]"; | |
139 | } | |
140 | else { # Use non-regex features of Perl to accomplish this. | |
141 | my $modified = ""; | |
142 | my $in_brackets = 0; | |
143 | ||
144 | # Go through the pattern character-by-character. We also add | |
145 | # blanks around each token to test the /x parts of (?[ ]) | |
146 | my $pat_len = length($pat); | |
147 | CHAR: for (my $i = 0; $i < $pat_len; $i++) { | |
148 | my $curchar = substr($pat, $i, 1); | |
149 | if ($curchar eq '\\') { | |
150 | $modified .= " " if $in_brackets; | |
151 | $modified .= $curchar; | |
152 | $i++; | |
153 | ||
154 | # Get the character the backslash is escaping | |
155 | $curchar = substr($pat, $i, 1); | |
156 | $modified .= $curchar; | |
157 | ||
158 | # If the character following that is a '{}', treat the | |
159 | # entire amount as a single token | |
160 | if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { | |
161 | my $j = index($pat, '}', $i+2); | |
162 | if ($j < 0) { | |
163 | last unless $in_brackets; | |
164 | if ($result eq 'c') { | |
165 | $skip++; | |
166 | $reason = "Can't handle compilation errors with unmatched '{'"; | |
167 | } | |
168 | else { | |
169 | print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n"; | |
170 | next TEST; | |
171 | } | |
172 | } | |
173 | $modified .= substr($pat, $i+1, $j - $i); | |
174 | $i = $j; | |
175 | } | |
176 | elsif ($curchar eq 'x') { | |
177 | ||
178 | # \x without brackets is supposed to be followed by 2 | |
179 | # hex digits. Take up to 2, and then add a blank | |
180 | # after the last one. This avoids getting errors from | |
181 | # (?[ ]) for run-ons, like \xabc | |
182 | my $j = $i + 1; | |
183 | for (; $j < $i + 3 && $j < $pat_len; $j++) { | |
184 | my $curord = ord(substr($pat, $j, 1)); | |
185 | if (!(($curord >= ord("A") && $curord <= ord("F")) | |
186 | || ($curord >= ord("a") && $curord <= ord("f")) | |
187 | || ($curord >= ord("0") && $curord <= ord("9")))) | |
188 | { | |
189 | $j++; | |
190 | last; | |
191 | } | |
192 | } | |
193 | $j--; | |
194 | $modified .= substr($pat, $i + 1, $j - $i) . " "; | |
195 | $i = $j; | |
196 | } | |
197 | elsif (ord($curchar) >= ord('0') | |
198 | && (ord($curchar) <= ord('7'))) | |
199 | { | |
200 | # Similarly, octal constants have up to 3 digits. | |
201 | my $j = $i + 1; | |
202 | for (; $j < $i + 3 && $j < $pat_len; $j++) { | |
203 | my $curord = ord(substr($pat, $j, 1)); | |
204 | if (! ($curord >= ord("0") && $curord <= ord("7"))) { | |
205 | $j++; | |
206 | last; | |
207 | } | |
208 | } | |
209 | $j--; | |
210 | $modified .= substr($pat, $i + 1, $j - $i); | |
211 | $i = $j; | |
212 | } | |
213 | ||
214 | next; | |
215 | } # End of processing a backslash sequence | |
216 | ||
217 | if (! $in_brackets # Skip (?{ }) | |
218 | && $curchar eq '(' | |
219 | && $i < $pat_len - 2 | |
220 | && substr($pat, $i+1, 1) eq '?' | |
221 | && substr($pat, $i+2, 1) eq '{') | |
222 | { | |
223 | $skip++; | |
224 | $reason = "Pattern contains '(?{'"; | |
225 | last; | |
226 | } | |
227 | ||
228 | # Closing ']' | |
229 | if ($curchar eq ']' && $in_brackets) { | |
230 | $modified .= " ] ])"; | |
231 | $in_brackets = 0; | |
232 | next; | |
233 | } | |
234 | ||
235 | # A regular character. | |
236 | if ($curchar ne '[') { | |
237 | if (! $in_brackets) { | |
238 | $modified .= $curchar; | |
239 | } | |
240 | else { | |
241 | $modified .= " $curchar "; | |
242 | } | |
243 | next; | |
244 | } | |
245 | ||
246 | # Here is a '['; If not in a bracketed class, treat as the | |
247 | # beginning of one. | |
248 | if (! $in_brackets) { | |
249 | $in_brackets = 1; | |
250 | $modified .= "(?[ [ "; | |
251 | ||
252 | # An immediately following ']' or '^]' is not the ending | |
253 | # of the class, but is to be treated literally. | |
254 | if ($i < $pat_len - 1 | |
255 | && substr($pat, $i+1, 1) eq ']') | |
256 | { | |
257 | $i ++; | |
258 | $modified .= " ] "; | |
259 | } | |
260 | elsif ($i < $pat_len - 2 | |
261 | && substr($pat, $i+1, 1) eq '^' | |
262 | && substr($pat, $i+2, 1) eq ']') | |
263 | { | |
264 | $i += 2; | |
265 | $modified .= " ^ ] "; | |
266 | } | |
267 | next; | |
268 | } | |
269 | ||
270 | # Here is a plain '[' within [ ]. Could mean wants to | |
271 | # match a '[', or it could be a posix class that has a | |
272 | # corresponding ']'. Absorb either | |
273 | ||
274 | $modified .= ' ['; | |
275 | last if $i >= $pat_len - 1; | |
276 | ||
277 | $i++; | |
278 | $curchar = substr($pat, $i, 1); | |
279 | if ($curchar =~ /[:=.]/) { | |
280 | for (my $j = $i + 1; $j < $pat_len; $j++) { | |
281 | next unless substr($pat, $j, 1) eq ']'; | |
282 | last if $j - $i < 2; | |
283 | if (substr($pat, $j - 1, 1) eq $curchar) { | |
284 | # Here, is a posix class | |
285 | $modified .= substr($pat, $i, $j - $i + 1) . " "; | |
286 | $i = $j; | |
287 | next CHAR; | |
288 | } | |
289 | } | |
290 | } | |
291 | ||
292 | # Here wasn't a posix class, just process normally | |
293 | $modified .= " $curchar "; | |
294 | } | |
295 | ||
296 | if ($in_brackets && ! $skip) { | |
297 | if ($result eq 'c') { | |
298 | $skip++; | |
299 | $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; | |
300 | } | |
301 | else { | |
302 | print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n"; | |
303 | next TEST; | |
304 | } | |
305 | } | |
306 | ||
307 | # Use our modified pattern instead of the original | |
308 | $pat = $modified; | |
309 | } | |
310 | } | |
1de06328 | 311 | |
52e33015 NC |
312 | for my $study ('', 'study $subject', 'utf8::upgrade($subject)', |
313 | 'utf8::upgrade($subject); study $subject') { | |
93f09d7b | 314 | # Need to make a copy, else the utf8::upgrade of an already studied |
52e33015 NC |
315 | # scalar confuses things. |
316 | my $subject = $subject; | |
1286eaeb NC |
317 | my $c = $iters; |
318 | my ($code, $match, $got); | |
1de06328 YO |
319 | if ($repl eq 'pos') { |
320 | $code= <<EOFCODE; | |
321 | $study; | |
322 | pos(\$subject)=0; | |
323 | \$match = ( \$subject =~ m${pat}g ); | |
324 | \$got = pos(\$subject); | |
325 | EOFCODE | |
326 | } | |
327 | elsif ($qr_embed) { | |
328 | $code= <<EOFCODE; | |
329 | my \$RE = qr$pat; | |
330 | $study; | |
331 | \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; | |
332 | \$got = "$repl"; | |
333 | EOFCODE | |
334 | } | |
e3faa678 NC |
335 | elsif ($qr_embed_thr) { |
336 | $code= <<EOFCODE; | |
337 | # Can't run the match in a subthread, but can do this and | |
338 | # clone the pattern the other way. | |
339 | my \$RE = threads->new(sub {qr$pat})->join(); | |
340 | $study; | |
341 | \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; | |
342 | \$got = "$repl"; | |
343 | EOFCODE | |
344 | } | |
1de06328 YO |
345 | else { |
346 | $code= <<EOFCODE; | |
347 | $study; | |
1286eaeb | 348 | \$match = (\$subject =~ $OP$pat) while \$c--; |
1de06328 YO |
349 | \$got = "$repl"; |
350 | EOFCODE | |
351 | } | |
073b366a | 352 | $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; |
e1d1eefb YO |
353 | #$code.=qq[\n\$expect="$expect";\n]; |
354 | #use Devel::Peek; | |
355 | #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; | |
66fb63c1 NC |
356 | { |
357 | # Probably we should annotate specific tests with which warnings | |
358 | # categories they're known to trigger, and hence should be | |
359 | # disabled just for that test | |
360 | no warnings qw(uninitialized regexp); | |
361 | eval $code; | |
362 | } | |
1286eaeb | 363 | chomp( my $err = $@ ); |
565b86e2 | 364 | if ( $skip ) { |
3c6cc85e | 365 | print "ok $test # skipped", length($reason) ? ". $reason" : '', "\n"; |
ee595aa6 | 366 | next TEST; |
cf93c79d | 367 | } |
565b86e2 KW |
368 | elsif ($result eq 'c') { |
369 | if ($err !~ m!^\Q$expect!) { print "not ok $test$todo (compile) $input => '$err'\n"; next TEST } | |
370 | last; # no need to study a syntax error | |
371 | } | |
24d786f4 | 372 | elsif ( $todo_qr ) { |
e0892690 | 373 | print "not ok $test # TODO", length($reason) ? " - $reason" : '', "\n"; |
e3faa678 NC |
374 | next TEST; |
375 | } | |
c277df42 | 376 | elsif ($@) { |
2fe1f0f5 | 377 | print "not ok $test$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; |
c277df42 | 378 | } |
e3faa678 | 379 | elsif ($result =~ /^n/) { |
24d786f4 | 380 | if ($match) { print "not ok $test$todo ($study) $input => false positive\n"; next TEST } |
378cc40b LW |
381 | } |
382 | else { | |
cfa4f241 | 383 | if (!$match || $got ne $expect) { |
cde0cee5 | 384 | eval { require Data::Dumper }; |
969c44e7 | 385 | no warnings "utf8"; # But handle should be utf8 |
65016092 NC |
386 | if ($@ || !defined &DynaLoader::boot_DynaLoader) { |
387 | # Data::Dumper will load on miniperl, but fail when used in | |
388 | # anger as it tries to load B. I'd prefer to keep the | |
389 | # regular calls below outside of an eval so that real | |
390 | # (unknown) failures get spotted, not ignored. | |
2fe1f0f5 | 391 | print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); |
cde0cee5 YO |
392 | } |
393 | else { # better diagnostics | |
394 | my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; | |
395 | my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; | |
2fe1f0f5 | 396 | print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n"); |
cde0cee5 | 397 | } |
cfa4f241 CS |
398 | next TEST; |
399 | } | |
378cc40b LW |
400 | } |
401 | } | |
24d786f4 | 402 | print "ok $test$todo\n"; |
378cc40b | 403 | } |
cfa4f241 | 404 | |
1a610890 | 405 | 1; |