3 # This is a home for regular expression tests that don't fit into
4 # the format supported by re/regexp.t. If you want to add a test
5 # that does fit that format, add it to re/re_tests, not here.
14 set_up_inc( '../lib', '.' );
15 skip_all_if_miniperl("miniperl can't load Tie::Hash::NamedCapture, need for %+ and %-");
23 plan tests => 2514; # Update this when adding/deleting tests.
25 run_tests() unless caller;
32 like("A \x{263a} B z C", qr/A . B (??{ "z" }) C/,
33 "Match UTF-8 char in presence of (??{ }); Bug 20000731.001 (#3600)");
36 no warnings 'uninitialized';
37 ok(undef =~ /^([^\/]*)(.*)$/, "Used to cause a SEGV; Bug 20001021.005 (#4492)");
41 my $message = 'bug id 20001008.001 (#4407)';
43 my @x = ("stra\337e 138", "stra\337e 138");
45 ok(s/(\d+)\s*([\w\-]+)/$1 . uc $2/e, $message);
46 ok(my ($latin) = /^(.+)(?:\s+\d)/, $message);
47 is($latin, "stra\337e", $message);
48 ok($latin =~ s/stra\337e/straße/, $message);
50 # Previous code follows, but outcommented - there were no tests.
52 # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
53 # use utf8; # needed for the raw UTF-8
54 # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a
59 # Fist half of the bug.
60 my $message = 'HEBREW ACCENT QADMA matched by .*; Bug 20001028.003 (#4536)';
62 ok(my ($Y) = $X =~ /(.*)/, $message);
63 is($Y, v1448, $message);
64 is(length $Y, 1, $message);
66 # Second half of the bug.
67 $message = 'HEBREW ACCENT QADMA in replacement; Bug 20001028.003 (#4536)';
69 $X =~ s/^/chr(1488)/e;
70 is(length $X, 1, $message);
71 is(ord $X, 1488, $message);
75 my $message = 'Repeated s///; Bug 20001108.001 (#4631)';
76 my $X = "Szab\x{f3},Bal\x{e1}zs";
78 $Y =~ s/(B)/$1/ for 0 .. 3;
80 is($X, "Szab\x{f3},Bal\x{e1}zs", $message);
84 my $message = 's/// on UTF-8 string; Bug 20000517.001 (#3253)';
87 is($x, "\x{100}B", $message);
88 is(length $x, 2, $message);
92 # The original bug report had 'no utf8' here but that was irrelevant.
94 my $message = "Don't dump core; Bug 20010306.008 (#5982)";
96 like($a, qr/\w/, $message); # used to core dump.
100 my $message = '/g in scalar context; Bug 20010410.006 (#6796)';
101 for my $rx ('/(.*?)\{(.*?)\}/csg',
102 '/(.*?)\{(.*?)\}/cg',
103 '/(.*?)\{(.*?)\}/sg',
105 '/(.+?)\{(.+?)\}/csg',) {
107 my $input = "a{b}c{d}";
109 while (eval \$input =~ $rx) {
118 # Amazingly vertical tabulator is the same in ASCII and EBCDIC.
119 for ("\n", "\t", "\014", "\r") {
120 unlike($_, qr/[[:print:]]/, sprintf "\\%03o not in [[:print:]]; Bug 20010619.003 (#7131)", ord $_);
123 like($_, qr/[[:print:]]/, "'$_' in [[:print:]]; Bug 20010619.003 (#7131)");
128 # [ID 20010814.004 (#7526)] pos() doesn't work when using =~m// in list context
131 my $a = join ":", /b./gc;
132 my $b = join ":", /a./gc;
134 is("$a $b $c", 'ba:ba ad:ae 10', "pos() works with () = m//; Bug 20010814.004 (#7526)");
138 # [ID 20010407.006 (#6767)] matching utf8 return values from
139 # functions does not work
141 my $message = 'UTF-8 return values from functions; Bug 20010407.006 (#6767)';
142 package ID_20010407_006;
146 ::ok($x =~ /(..)/, $message);
148 ::ok(length ($y) == 2 && $y eq $x, $message);
149 ::ok(x =~ /(..)/, $message);
151 ::ok(length ($y) == 2 && $y eq $x, $message);
155 # High bit bug -- japhy
157 ok $x =~ /.*?\200/, "High bit fine";
161 my $message = 'UTF-8 hash keys and /$/';
162 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters
163 # /2002-01/msg01327.html
166 my $v = substr ($u, 0, 1);
167 my $w = substr ($u, 1, 1);
168 my %u = ($u => $u, $v => $v, $w => $w);
170 my $m1 = /^\w*$/ ? 1 : 0;
171 my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0;
172 is($m1, $m2, $message);
177 my $message = "s///eg [change 13f46d054db22cf4]; Bug 20020124.005 (#8335)";
179 for my $char ("a", "\x{df}", "\x{100}") {
180 my $x = "$char b $char";
185 is(substr ($x, 0, 1), substr ($x, -1, 1), $message);
190 my $message = "Correct pmop flags checked when empty pattern; Bug 20020412.005 (#8935)";
192 # Requires reuse of last successful pattern.
197 ok($match != $_, $message)
198 or diag(sprintf "'match one' %s on %s iteration" =>
199 $match ? 'succeeded' : 'failed',
200 $_ ? 'second' : 'first');
203 my $result = join "" => $num =~ //g;
204 is($result, $num, $message);
208 my $message = 'UTF-8 regex matches above 32k; Bug 20020630.002 (#10013)';
209 for (['byte', "\x{ff}"], ['utf8', "\x{1ff}"]) {
210 my ($type, $char) = @$_;
211 for my $len (32000, 32768, 33000) {
212 my $s = $char . "f" x $len;
213 my $r = $s =~ /$char([f]*)/gc;
214 ok($r, $message) or diag("<$type x $len>");
215 ok(!$r || pos ($s) == $len + 1, $message)
216 or diag("<$type x $len>; pos = @{[pos $s]}");
222 my $s = "\x{100}" x 5;
223 my $ok = $s =~ /(\x{100}{4})/;
224 my ($ord, $len) = (ord $1, length $1);
225 ok $ok && $ord == 0x100 && $len == 4, "No panic: end_shift [change 0e933229fa758625]";
229 my $message = 'UTF-8 matching; Bug 15397';
230 like("\x{100}", qr/\x{100}/, $message);
231 like("\x{100}", qr/(\x{100})/, $message);
232 like("\x{100}", qr/(\x{100}){1}/, $message);
233 like("\x{100}\x{100}", qr/(\x{100}){2}/, $message);
234 like("\x{100}\x{100}", qr/(\x{100})(\x{100})/, $message);
238 my $message = 'Neither ()* nor ()*? sets $1 when matched 0 times; Bug 7471';
240 ok(/(AB)*?CD/ && !defined $1, $message);
241 ok(/(AB)*CD/ && !defined $1, $message);
245 my $message = "Caching shouldn't prevent match; Bug 3547";
246 my $pattern = "^(b+?|a){1,2}c";
247 ok("bac" =~ /$pattern/ && $1 eq 'a', $message);
248 ok("bbac" =~ /$pattern/ && $1 eq 'a', $message);
249 ok("bbbac" =~ /$pattern/ && $1 eq 'a', $message);
250 ok("bbbbac" =~ /$pattern/ && $1 eq 'a', $message);
254 ok("\x{100}" =~ /(.)/, '$1 should keep UTF-8 ness; Bug 18232');
255 is($1, "\x{100}", '$1 is UTF-8; Bug 18232');
257 is($1, "\x{100}", '$1 is still UTF-8; Bug 18232');
258 isnt($1, "\xC4\x80", '$1 is not non-UTF-8; Bug 18232');
262 my $message = "Optimizer doesn't prematurely reject match; Bug 19767";
266 my $NormalChar = qr /[\p{IsDigit}\p{IsLower}\p{IsUpper}]/;
267 my $NormalWord = qr /${NormalChar}+?/;
268 my $PredNameHyphen = qr /^${NormalWord}(\-${NormalWord})*?$/;
271 like($attr, $PredNameHyphen, $message); # Original test.
274 like("0", qr/\p{N}+\z/, $message); # Variant.
278 my $message = "(??{ }) doesn't return stale values; Bug 20683";
280 foreach (1, 2, 3, 4) {
281 $p ++ if /(??{ $p })/
288 sub TIESCALAR {bless []}
292 foreach (1, 2, 3, 4) {
299 # Subject: Odd regexp behavior
300 # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>
301 # Date: Wed, 26 Feb 2003 16:53:12 +0000
302 # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk>
303 # To: perl-unicode@perl.org
305 my $message = 'Markus Kuhn 2003-02-26';
307 my $x = "\x{2019}\nk";
308 ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message);
309 is($x, "\x{2019} k", $message);
312 ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message);
313 is($x, "b k", $message);
315 like("\x{2019}", qr/\S/, $message);
319 my $message = "(??{ .. }) in split doesn't corrupt its stack; Bug 21411";
321 is('-1-3-5-', join('', split /((??{$i++}))/, '-1-3-5-'), $message);
322 no warnings 'syntax';
323 @_ = split /(?{'WOW'})/, 'abc';
325 is("@_", "a|b|c", $message);
328 is(join('-', split /(?{ split "" })/, "abc"), 'a-b-c', 'nested split');
331 $_ = "code: 'x' { '...' }\n"; study;
332 my @x; push @x, $& while m/'[^\']*'/gx;
334 is("@x", "'x':'...'", "Parse::RecDescent triggered infinite loop; Bug 17757");
339 ok("a\nb" !~ /^b/, "Propagated modifier; $_[0]; Bug 22354");
340 ok("a\nb" =~ /^b/m, "Propagated modifier; $_[0] - with /m; Bug 22354");
343 $_ = "x"; s/x/func "in subst"/e;
344 $_ = "x"; s/x/func "in multiline subst"/em;
345 $_ = "x"; /x(?{func "in regexp"})/;
346 $_ = "x"; /x(?{func "in multiline regexp"})/m;
352 is("abcde", $`, 'Global match sets $`; Bug 19049');
356 # [perl #23769] Unicode regex broken on simple example
357 # regrepeat() didn't handle UTF-8 EXACT case right.
359 my $Mess = 'regrepeat() handles UTF-8 EXACT case right';
360 my $message = "$Mess; Bug 23769";
362 my $s = "\x{a0}\x{a0}\x{a0}\x{100}"; chop $s;
364 like($s, qr/\x{a0}/, $message);
365 like($s, qr/\x{a0}+/, $message);
366 like($s, qr/\x{a0}\x{a0}/, $message);
368 $message = "$Mess (easy variant); Bug 23769";
369 ok("aaa\x{100}" =~ /(a+)/, $message);
370 is($1, "aaa", $message);
372 $message = "$Mess (easy invariant); Bug 23769";
373 ok("aaa\x{100} " =~ /(a+?)/, $message);
374 is($1, "a", $message);
376 $message = "$Mess (regrepeat variant); Bug 23769";
377 ok("\xa0\xa0\xa0\x{100} " =~ /(\xa0+?)/, $message);
378 is($1, "\xa0", $message);
380 $message = "$Mess (regrepeat invariant); Bug 23769";
381 ok("\xa0\xa0\xa0\x{100}" =~ /(\xa0+)/, $message);
382 is($1, "\xa0\xa0\xa0", $message);
384 $message = "$Mess (hard variant); Bug 23769";
385 ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+?)/, $message);
386 is($1, "\xa0\xa1", $message);
388 $message = "$Mess (hard invariant); Bug 23769";
389 ok("ababab\x{100} " =~ /((?:ab)+)/, $message);
390 is($1, 'ababab', $message);
392 ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+)/, $message);
393 is($1, "\xa0\xa1\xa0\xa1\xa0\xa1", $message);
395 ok("ababab\x{100} " =~ /((?:ab)+?)/, $message);
396 is($1, "ab", $message);
398 $message = "Don't match first byte of UTF-8 representation; Bug 23769";
399 unlike("\xc4\xc4\xc4", qr/(\x{100}+)/, $message);
400 unlike("\xc4\xc4\xc4", qr/(\x{100}+?)/, $message);
401 unlike("\xc4\xc4\xc4", qr/(\x{100}++)/, $message);
405 # perl panic: pp_match start/end pointers
407 is(eval {my ($x, $y) = "bca" =~ /^(?=.*(a)).*(bc)/; "$x-$y"}, "a-bc",
408 'Captures can move backwards in string; Bug 25269');
412 # \cA not recognized in character classes
413 like("a\cAb", qr/\cA/, '\cA in pattern; Bug 27940');
414 like("a\cAb", qr/[\cA]/, '\cA in character class; Bug 27940');
415 like("a\cAb", qr/[\cA-\cB]/, '\cA in character class range; Bug 27940');
416 like("abc", qr/[^\cA-\cB]/, '\cA in negated character class range; Bug 27940');
417 like("a\cBb", qr/[\cA-\cC]/, '\cB in character class range; Bug 27940');
418 like("a\cCbc", qr/[^\cA-\cB]/, '\cC in negated character class range; Bug 27940');
419 like("a\cAb", qr/(??{"\cA"})/, '\cA in ??{} pattern; Bug 27940');
420 unlike("ab", qr/a\cIb/x, '\cI in pattern; Bug 27940');
424 # perl #28532: optional zero-width match at end of string is ignored
426 ok("abc" =~ /^abc(\z)?/ && defined($1),
427 'Optional zero-width match at end of string; Bug 28532');
428 ok("abc" =~ /^abc(\z)??/ && !defined($1),
429 'Optional zero-width match at end of string; Bug 28532');
433 my $utf8 = "\xe9\x{100}"; chop $utf8;
436 like($utf8, qr/\xe9/i, "utf8/latin; Bug 36207");
437 like($utf8, qr/$latin1/i, "utf8/latin runtime; Bug 36207");
438 like($utf8, qr/(abc|\xe9)/i, "utf8/latin trie; Bug 36207");
439 like($utf8, qr/(abc|$latin1)/i, "utf8/latin trie runtime; Bug 36207");
441 like("\xe9", qr/$utf8/i, "latin/utf8; Bug 36207");
442 like("\xe9", qr/(abc|$utf8)/i, "latin/utf8 trie; Bug 36207");
443 like($latin1, qr/$utf8/i, "latin/utf8 runtime; Bug 36207");
444 like($latin1, qr/(abc|$utf8)/i, "latin/utf8 trie runtime; Bug 36207");
453 "Assigning to original string does not corrupt match vars; Bug 37038");
461 my $aeek = bless {} => 'wooosh';
462 is(do {$aeek -> gloople () =~ /(.)/g}, 1,
463 "//g match against return value of sub [change e26a497577f3ce7b]");
466 is(do{gloople () =~ /(.)/g}, 1,
467 "change e26a497577f3ce7b didn't affect sub calls for some reason");
472 # See changes 26925-26928, which reverted change 26410
476 sub variable : lvalue {$var}
478 my $o = bless [] => 'lv';
482 $f .= $1 if $o -> variable =~ /(.)/g;
487 is($f, "ab", "pos() retained between calls");
490 ok 0, "Code failed: $@";
494 sub variable : lvalue {$var}
498 $g .= $1 if variable =~ /(.)/g;
503 is($g, "ab", "pos() retained between calls");
506 ok 0, "Code failed: $@";
512 skip "In EBCDIC and unclear what would trigger this bug there" if $::IS_EBCDIC;
515 $_ = pack "U0C2", 0xa2, 0xf8; # Ill-formed UTF-8
517 do {!($ret = s/[a\0]+//g)}',
519 {}, "Ill-formed UTF-8 doesn't match NUL in class; Bug 37836");
523 # chr(65535) should be allowed in regexes
525 no warnings 'utf8'; # To allow non-characters
530 is($c, "", "U+FFFF, parsed as atom; Bug 38293");
535 is($c, "", "U+FFFF backslashed, parsed as atom; Bug 38293");
539 is($c, "", "U+FFFF, parsed in class; Bug 38293");
544 is($c, "", "U+FFFF backslashed, parsed in class; Bug 38293");
548 is($s, "AB", "U+FFFF, EXACTF; Bug 38293");
552 is($s, "\x{ffff}", "U+FFFF, BOUND; Bug 38293");
556 is($s, "\x{ffff}", "U+FFFF, NBOUND; Bug 38293");
561 # The printing characters
562 my @chars = ("A" .. "Z");
564 my $size = 32771 - 4;
567 # Create some random junk. Inefficient, but it works.
568 for (my $i = 0; $i < $size; $ i++) {
569 $str .= $chars [rand @chars];
572 $str .= ($delim x 4);
575 ok($str =~ s/^(.*?)${delim}{4}//s, "Pattern matches; Bug 39583");
576 is($str, "", "Empty string; Bug 39583");
577 ok(defined $1 && length ($1) == $size, '$1 is correct size; Bug 39583');
581 like("\0-A", qr/\c@-A/, '@- should not be interpolated in a pattern; Bug 27940');
582 like("\0\0A", qr/\c@+A/, '@+ should not be interpolated in a pattern; Bug 27940');
583 like("X\@-A", qr/X@-A/, '@- should not be interpolated in a pattern; Bug 27940');
584 like("X\@\@A", qr/X@+A/, '@+ should not be interpolated in a pattern; Bug 27940');
586 like("X\0A", qr/X\c@?A/, '\c@?; Bug 27940');
587 like("X\0A", qr/X\c@*A/, '\c@*; Bug 27940');
588 like("X\0A", qr/X\c@(A)/, '\c@(; Bug 27940');
589 like("X\0A", qr/X(\c@)A/, '\c@); Bug 27940');
590 like("X\0A", qr/X\c@|ZA/, '\c@|; Bug 27940');
592 like("X\@A", qr/X@?A/, '@?; Bug 27940');
593 like("X\@A", qr/X@*A/, '@*; Bug 27940');
594 like("X\@A", qr/X@(A)/, '@(; Bug 27940');
595 like("X\@A", qr/X(@)A/, '@); Bug 27940');
596 like("X\@A", qr/X@|ZA/, '@|; Bug 27940');
598 local $" = ','; # non-whitespace and non-RE-specific
599 like('abc', qr/(.)(.)(.)/, 'The last successful match is bogus; Bug 27940');
600 like("A@+B", qr/A@{+}B/, 'Interpolation of @+ in /@{+}/; Bug 27940');
601 like("A@-B", qr/A@{-}B/, 'Interpolation of @- in /@{-}/; Bug 27940');
602 like("A@+B", qr/A@{+}B/x, 'Interpolation of @+ in /@{+}/x; Bug 27940');
603 like("A@-B", qr/A@{-}B/x, 'Interpolation of @- in /@{-}/x; Bug 27940');
607 my $s = 'foo bar baz';
608 my (@k, @v, @fetch, $res);
610 my @names = qw ($+{A} $+{B} $+{C});
611 if ($s =~ /(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz)/) {
612 while (my ($k, $v) = each (%+)) {
616 @v = sort values (%+);
626 is($fetch[$_][0], $fetch[$_][1], "$names[$_]; Bug 50496");
631 is($res, 1, "'$s' =~ /(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz)/; Bug 50496");
632 is($count, 3, "Got 3 keys in %+ via each; Bug 50496");
633 is(0 + @k, 3, "Got 3 keys in %+ via keys; Bug 50496");
634 is("@k", "A B C", "Got expected keys; Bug 50496");
635 is("@v", "bar baz foo", "Got expected values; Bug 50496");
637 no warnings "uninitialized";
638 print for $+ {this_key_doesnt_exist};
640 is($@, '', 'lvalue $+ {...} should not throw an exception; Bug 50496');
645 # Almost the same as the block above, except that the capture is nested.
648 my $s = 'foo bar baz';
649 my (@k, @v, @fetch, $res);
651 my @names = qw ($+{A} $+{B} $+{C} $+{D});
652 if ($s =~ /(?<D>(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz))/) {
653 while (my ($k,$v) = each(%+)) {
657 @v = sort values (%+);
668 is($fetch[$_][0], $fetch[$_][1], "$names[$_]; Bug 50496");
673 is($res, 1, "'$s' =~ /(?<D>(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz))/; Bug 50496");
674 is($count, 4, "Got 4 keys in %+ via each; Bug 50496");
675 is(@k, 4, "Got 4 keys in %+ via keys; Bug 50496");
676 is("@k", "A B C D", "Got expected keys; Bug 50496");
677 is("@v", "bar baz foo foo bar baz", "Got expected values; Bug 50496");
679 no warnings "uninitialized";
680 print for $+ {this_key_doesnt_exist};
682 is($@, '', 'lvalue $+ {...} should not throw an exception; Bug 50496');
690 while ($str =~ /b/g) {$mval = $#-; $pval = $#+; $count ++}
691 is($mval, 0, '@- should be empty; Bug 36046');
692 is($pval, 0, '@+ should be empty; Bug 36046');
693 is($count, 1, 'Should have matched once only; Bug 36046');
697 my $message = '/m in precompiled regexp; Bug 40684';
699 my $rex = qr'^abc$'m;
700 ok($s =~ m/$rex/, $message);
701 ok($s =~ m/^abc$/m, $message);
705 my $message = '(?: ... )? should not lose $^R; Bug 36909';
709 ok('x foofoo y' =~ m {
710 (foo) # $^R correctly set
711 (?{ "last regexp code result" })
713 is($^R, 'last regexp code result', $message);
715 is($^R, 'Nothing', $message);
720 ok('x foofoo y' =~ m {
721 (?:foo|bar)+ # $^R correctly set
722 (?{ "last regexp code result" })
724 is($^R, 'last regexp code result', $message);
726 is($^R, 'Nothing', $message);
730 ok('x foofoo y' =~ m {
731 (foo|bar)\1+ # $^R undefined
732 (?{ "last regexp code result" })
734 is($^R, 'last regexp code result', $message);
736 is($^R, 'Nothing', $message);
740 ok('x foofoo y' =~ m {
741 (foo|bar)\1 # This time without the +
742 (?{"last regexp code result"})
744 is($^R, 'last regexp code result', $message);
746 is($^R, 'Nothing', $message);
750 my $message = 'Match is quadratic due to eval; See Bug 22395';
752 for my $l (10, 100, 1000) {
754 ('a' x $l) =~ /(.*)(?{ $count++ })[bc]/;
755 is($count, $l*($l+3)/2+1, $message);
759 my $message = 'Match is linear, not quadratic; Bug 22395.';
762 for my $l (10, 100, 1000) {
764 ('a' x $l) =~ /(.*)(*{ $count++ })[bc]/;
765 $ok += is($count, $l + 1, $message);
767 is($ok,3, "Optimistic eval does not disable optimisations");
771 my $message = '@-/@+ should not have undefined values; Bug 22614';
774 /(.){1,}(?{push @len,0+@-})(.){1,}(?{})^/;
775 is("@len", "2 2 2", $message);
779 my $message = '$& set on s///; Bug 18209';
780 my $text = ' word1 word2 word3 word4 word5 word6 ';
782 my @words = ('word1', 'word3', 'word5');
784 foreach my $word (@words) {
785 $text =~ s/$word\s//gi; # Leave a space to separate words
786 # in the resultant str.
787 # The following block is not working.
793 is($count, 3, $message);
794 is($text, ' word2 word4 word6 ', $message);
800 local $_ = qq (A\nB\nC\n);
802 while (m#(\G|\n)([^\n]*)\n#gsx) {
806 is("@res", "A B C", "/g pattern shouldn't infinite loop; Bug 6893");
811 my @tails = ('', '(?(1))', '(|)', '()?');
812 my @quants = ('*','+');
816 for my $pat (@$pats) {
817 for my $quant (@quants) {
818 for my $tail (@tails) {
819 my $re = "($pat$quant\$)$tail";
820 ok(/$re/ && $1 eq $_, "'$_' =~ /$re/; Bug 41010");
821 ok(/$re/m && $1 eq $_, "'$_' =~ /$re/m; Bug 41010");
830 '(1|[23]|4|[56]|[78]|[90])',
831 '(?:1|[23]|4|[56]|[78]|[90])',
832 '(1|2|3|4|5|6|7|8|9|0)',
833 '(?:1|2|3|4|5|6|7|8|9|0)');
834 my @spats = ('[ ]', ' ', '( |\t)', '(?: |\t)', '[ \t]', '\s');
836 my @dstrs = ('12345');
837 $doit -> (\@spats, @sstrs);
838 $doit -> (\@dpats, @dstrs);
842 # [perl #45605] Regexp failure with utf8-flagged and byte-flagged string
844 my $utf_8 = "\xd6schel";
845 utf8::upgrade ($utf_8);
846 $utf_8 =~ m {(\xd6|Ö)schel};
847 is($1, "\xd6", "Upgrade error; Bug 45605");
851 # Regardless of utf8ness any character matches itself when
852 # doing a case insensitive match. See also [perl #36207]
854 for my $o (0 .. 255) {
855 my @ch = (chr ($o), chr ($o));
856 utf8::upgrade ($ch [1]);
857 for my $u_str (0, 1) {
858 for my $u_pat (0, 1) {
859 like($ch[$u_str], qr/\Q$ch[$u_pat]\E/i,
860 "\$c =~ /\$c/i : chr ($o) : u_str = $u_str u_pat = $u_pat; Bug 36207");
861 like($ch[$u_str], qr/\Q$ch[$u_pat]\E|xyz/i,
862 "\$c=~/\$c|xyz/i : chr($o) : u_str = $u_str u_pat = $u_pat; Bug 36207");
869 my $message = '$REGMARK in replacement; Bug 49190';
872 ok(s/(*:B)A/$REGMARK/, $message);
873 is($_, "B", $message);
875 ok(s/(*:X)A+|(*:Y)B+|(*:Z)C+/$REGMARK/g, $message);
876 is($_, "ZYX", $message);
877 # Use a longer name to force reallocation of $REGMARK.
879 ok(s/(*:X)A+|(*:YYYYYYYYYYYYYYYY)B+|(*:Z)C+/$REGMARK/g, $message);
880 is($_, "ZYYYYYYYYYYYYYYYYX", $message);
884 my $message = 'Substitution evaluation in list context; Bug 52658';
886 my @te = ($reg =~ m{^(/?(?:\.\./)*)},
887 $reg =~ s/(x)/'b'/eg > 1 ? '##' : '++');
888 is($reg, '../bbb/', $message);
889 is($te [0], '../', $message);
893 my $a = "xyzt" x 8192;
894 like($a, qr/\A(?>[a-z])*\z/,
895 '(?>) does not cause wrongness on long string; Bug 60034');
896 my $b = $a . chr 256;
898 is($a, $b, 'Bug 60034');
899 like($b, qr/\A(?>[a-z])*\z/,
900 '(?>) does not cause wrongness on long string with UTF-8; Bug 60034');
904 # Keep the following tests last -- they may crash perl
906 print "# Tests that follow may crash perl\n";
909 my $message = 'Pattern in a loop, failure should not ' .
910 'affect previous success; Bug 19049/38869';
912 'ab cdef', # Matches regex
913 ('e' x 40000 ) .'ab c' # Matches not, but 'ab c' matches part of it
918 m/ab(.+)cd/i; # The ignore-case seems to be important
919 $y = $1; # Use $1, which might not be from the last match!
920 $x = substr ($list [0], $- [0], $+ [0] - $- [0]);
922 is($y, ' ', $message);
923 is($x, 'ab cd', $message);
927 skip("Can run out of memory on os390", 1) if $^O eq 'os390';
928 ok (("a" x (2 ** 15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker; Bug 24274");
931 ok ((q(a)x 100) =~ /^(??{'(.)'x 100})/,
932 "Regexp /^(??{'(.)'x 100})/ crashes older perls; Bug 24274");
936 # [perl #45337] utf8 + "[a]a{2}" + /$.../ = panic: sv_len_utf8 cache
938 local ${^UTF8CACHE} = -1;
939 my $message = "Shouldn't panic; Bug 45337";
942 like("aaa", qr/$s/, $message);
945 my $message = "Check if tree logic breaks \$^R; Bug 57042";
953 for my $line ("(A)","(B)") {
954 if ($line =~ m/$cond_re/) {
955 push @res, $^R ? "#$^R" : "UNDEF";
958 is("@res","#1 #2", $message);
961 no warnings 'closure';
962 my $re = qr/A(??{"1"})/;
963 ok "A1B" =~ m/^((??{ $re }))((??{"B"}))$/;
968 # This only works under -DEBUGGING because it relies on an assert().
970 # Check capture offset re-entrancy of utf8 code.
972 sub fswash { $_[0] =~ s/([>X])//g; }
974 my $k1 = "." x 4 . ">>";
977 my $k2 = "\x{f1}\x{2022}";
978 $k2 =~ s/([\360-\362])/>/g;
981 is($k2, "\x{2022}", "utf8::SWASHNEW doesn't cause capture leaks; Bug 60508");
985 # minimal CURLYM limited to 32767 matches
987 qr{a(x|y)*b}, # CURLYM
988 qr{a(x|y)*?b}, # .. with minmod
989 qr{a([wx]|[yz])*b}, # .. and without tries
993 my $s = join '', 'a', 'x' x $len, 'b';
995 like($s, $pat, "$pat; Bug 65372");
1000 local $::TODO = "[perl #38133]";
1002 "A" =~ /(((?:A))?)+/;
1008 is($first, $second);
1013 = 'utf8 =~ /trie/ where trie matches a continuation octet; Bug 70998';
1017 local $SIG{__WARN__} = sub { $w .= shift };
1019 # This bug can be reduced to
1020 qq{\x{30ab}} =~ /\xab|\xa9/;
1021 # but it's nice to have a more 'real-world' test. The original test
1022 # case from the RT ticket follows:
1028 my $conv_rx = '(' . join('|', map { quotemeta } keys %conv) . ')';
1029 $conv_rx = qr{$conv_rx};
1032 = qq{\x{3042}\x{304b}\x{3055}\x{305f}\x{306a}\x{306f}\x{307e}}
1033 . qq{\x{3084}\x{3089}\x{308f}\x{3093}\x{3042}\x{304b}\x{3055}}
1034 . qq{\x{305f}\x{306a}\x{306f}\x{307e}\x{3084}\x{3089}\x{308f}}
1035 . qq{\x{3093}\x{30a2}\x{30ab}\x{30b5}\x{30bf}\x{30ca}\x{30cf}}
1036 . qq{\x{30de}\x{30e4}\x{30e9}\x{30ef}\x{30f3}\x{30a2}\x{30ab}}
1037 . qq{\x{30b5}\x{30bf}\x{30ca}\x{30cf}\x{30de}\x{30e4}\x{30e9}}
1038 . qq{\x{30ef}\x{30f3}\x{30a2}\x{30ab}\x{30b5}\x{30bf}\x{30ca}}
1039 . qq{\x{30cf}\x{30de}\x{30e4}\x{30e9}\x{30ef}\x{30f3}};
1041 $x =~ s{$conv_rx}{$conv{$1}}eg;
1043 is($w, undef, $message);
1047 # minimal CURLYM limited to 32767 matches
1049 is(join("-", " abc def " =~ /(?=(\S+))/g), "abc-bc-c-def-ef-f",
1050 'stclass optimisation does not break + inside (?=); Bug 68564');
1054 use charnames ":full";
1055 # Delayed interpolation of \N'
1056 my $r1 = qr/\N{THAI CHARACTER SARA I}/;
1057 my $r2 = qr'\N{THAI CHARACTER SARA I}';
1058 my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}";
1061 ok $s1 =~ /$r1+/, 'my $r1 = qr/\N{THAI CHARACTER SARA I}/; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ /$r1+/';
1062 ok $s1 =~ /$r2+/, 'my $r2 = qr\'\N{THAI CHARACTER SARA I}\'; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ \'$r2+\'';
1065 ok "${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/, '"${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/';
1067 ok "abbbbc" =~ m/\N{1}/ && $& eq "a", '"abbbbc" =~ m/\N{1}/ && $& eq "a"';
1068 ok "abbbbc" =~ m'\N{1}' && $& eq "a", '"abbbbc" =~ m\'\N{1}\' && $& eq "a"';
1069 ok "abbbbc" =~ m/\N{3,4}/ && $& eq "abbb", '"abbbbc" =~ m/\N{3,4}/ && $& eq "abbb"';
1070 ok "abbbbc" =~ m'\N{3,4}' && $& eq "abbb", '"abbbbc" =~ m\'\N{3,4}\' && $& eq "abbb"';
1074 use charnames ":full";
1075 my $message = '[perl #74982] Period coming after \N{}';
1076 ok("\x{ff08}." =~ m/\N{FULLWIDTH LEFT PARENTHESIS}./ && $& eq "\x{ff08}.", $message);
1077 ok("\x{ff08}." =~ m'\N{FULLWIDTH LEFT PARENTHESIS}.' && $& eq "\x{ff08}.", $message);
1078 ok("\x{ff08}." =~ m/[\N{FULLWIDTH LEFT PARENTHESIS}]./ && $& eq "\x{ff08}.", $message);
1079 ok("\x{ff08}." =~ m'[\N{FULLWIDTH LEFT PARENTHESIS}].' && $& eq "\x{ff08}.", $message);
1083 ######## "Segfault using HTML::Entities", Richard Jolly <richardjolly@mac.com>, <A3C7D27E-C9F4-11D8-B294-003065AE00B6@mac.com> in perl-unicode@perl.org
1085 skip('Perl configured without Encode module', 1)
1086 unless $Config{extensions} =~ / Encode /;
1088 # Test case cut down by jhi
1089 fresh_perl_like(<<'EOP', qr!Malformed UTF-8 character \(unexpected end of string\)!, {}, 'Segfault using HTML::Entities');
1091 my $t = ord('A') == 193 ? "\xEA" : "\xE9";
1092 Encode::_utf8_on($t);
1099 # pattern must be compiled late or we can break the test file
1100 my $message = '[perl #115050] repeated nothings in a trie can cause panic';
1102 $pattern = '[xyz]|||';
1103 ok("blah blah" =~ /$pattern/, $message);
1104 ok("blah blah" =~ /(?:$pattern)h/, $message);
1105 $pattern = '|||[xyz]';
1106 ok("blah blah" =~ /$pattern/, $message);
1107 ok("blah blah" =~ /(?:$pattern)h/, $message);
1111 # [perl #4289] First mention $& after a match
1112 local $::TODO = "these tests fail without Copy-on-Write enabled"
1113 if $Config{ccflags} =~ /PERL_NO_COW/;
1115 '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$&|, "\n"',
1116 "b\n", {}, '$& first mentioned after match');
1118 '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$`|, "\n"',
1119 "a\n", {}, '$` first mentioned after match');
1121 '$_ = "abc"; /b/g; $_ = "hello"; print eval q|$\'|,"\n"',
1122 "c\n", {}, '$\' first mentioned after match');
1126 # [perl #118175] threaded perl-5.18.0 fails pat_rt_report_thr.t
1127 # this tests some related failures
1129 # The tests in the block *only* fail when run on 32-bit systems
1130 # with a malloc that allocates above the 2GB line. On the system
1131 # in the report above that only happened in a thread.
1132 my $s = "\x{1ff}" . "f" x 32;
1133 ok($s =~ /\x{1ff}[[:alpha:]]+/gca, "POSIXA pointer wrap");
1137 # RT #129012 heap-buffer-overflow Perl_fbm_instr.
1138 # This test is unlikely to not pass, but it used to fail
1141 my $s ="\x{100}0000000";
1142 ok($s !~ /00000?\x80\x80\x80/, "RT #129012");
1146 # RT #129085 heap-buffer-overflow Perl_re_intuit_start
1147 # this did fail under ASAN, but didn't under valgrind
1148 my $s = "\x{f2}\x{140}\x{fe}\x{ff}\x{ff}\x{ff}";
1149 ok($s !~ /^0000.\34500\376\377\377\377/, "RT #129085");
1154 'no warnings "regexp"; "foo"=~/((?1)){8,0}/; print "ok"',
1155 "ok", {}, 'RT #130561 - allowing impossible quantifier should not cause SEGVs');
1157 no warnings 'regexp';
1158 ok($s=~/(foo){1,0}|(?1)/,
1159 "RT #130561 - allowing impossible quantifier should not break recursion");
1162 # RT #133892 Coredump in Perl_re_intuit_start
1163 # Second match flips to checking floating substring before fixed
1164 # substring, which triggers a pathway that failed to check there
1165 # was a non-utf8 version of the string before trying to use it
1166 # resulting in a SEGV.
1167 my $result = grep /b\x{1c0}ss0/i, qw{ xxxx xxxx0 };
1171 } # End of sub run_tests