+ {
+ # Suppress warnings, as the non-unicode one comes out even if turn off
+ # warnings here (because the execution is done in another scope).
+ local $SIG{__WARN__} = sub {};
+ my $str = "\x{110000}";
+
+ unlike($str, qr/\p{ASCII_Hex_Digit=True}/, "Non-Unicode doesn't match \\p{AHEX=True}");
+ like($str, qr/\p{ASCII_Hex_Digit=False}/, "Non-Unicode matches \\p{AHEX=False}");
+ like($str, qr/\P{ASCII_Hex_Digit=True}/, "Non-Unicode matches \\P{AHEX=True}");
+ unlike($str, qr/\P{ASCII_Hex_Digit=False}/, "Non-Unicode matches \\P{AHEX=FALSE}");
+ }
+
+ {
+ # Test that IDstart works, but because the author (khw) knows
+ # regexes much better than the rest of the core, it is being done here
+ # in the context of a regex which relies on buffer names beginng with
+ # IDStarts.
+ use utf8;
+ my $str = "abc";
+ like($str, qr/(?<a>abc)/, "'a' is legal IDStart");
+ like($str, qr/(?<_>abc)/, "'_' is legal IDStart");
+ like($str, qr/(?<ß>abc)/, "U+00DF is legal IDStart");
+ like($str, qr/(?<ℕ>abc)/, "U+2115' is legal IDStart");
+
+ # This test works on Unicode 6.0 in which U+2118 and U+212E are legal
+ # IDStarts there, but are not Word characters, and therefore Perl
+ # doesn't allow them to be IDStarts. But there is no guarantee that
+ # Unicode won't change things around in the future so that at some
+ # future Unicode revision these tests would need to be revised.
+ foreach my $char ("%", "×", chr(0x2118), chr(0x212E)) {
+ my $prog = <<"EOP";
+use utf8;;
+"abc" =~ qr/(?<$char>abc)/;
+EOP
+ utf8::encode($prog);
+ fresh_perl_like($prog, qr!Group name must start with a non-digit word character!, {},
+ sprintf("'U+%04X not legal IDFirst'", ord($char)));
+ }
+ }
+
+ { # [perl #101710]
+ my $pat = "b";
+ utf8::upgrade($pat);
+ like("\xffb", qr/$pat/i, "/i: utf8 pattern, non-utf8 string, latin1-char preceding matching char in string");
+ }
+
+ { # Crash with @a =~ // warning
+ local $SIG{__WARN__} = sub {
+ pass 'no crash for @a =~ // warning'
+ };
+ eval ' sub { my @a =~ // } ';
+ }
+
+ { # Concat overloading and qr// thingies
+ my @refs;
+ my $qr = qr//;
+ package Cat {
+ require overload;
+ overload->import(
+ '""' => sub { ${$_[0]} },
+ '.' => sub {
+ push @refs, ref $_[1] if ref $_[1];
+ bless $_[2] ? \"$_[1]${$_[0]}" : \"${$_[0]}$_[1]"
+ }
+ );
+ }
+ my $s = "foo";
+ my $o = bless \$s, Cat::;
+ /$o$qr/;
+ is "@refs", "Regexp", '/$o$qr/ passes qr ref to cat overload meth';
+ }
+
+ {
+ my $count=0;
+ my $str="\n";
+ $count++ while $str=~/.*/g;
+ is $count, 2, 'test that ANCH_MBOL works properly. We should get 2 from $count++ while "\n"=~/.*/g';
+ my $class_count= 0;
+ $class_count++ while $str=~/[^\n]*/g;
+ is $class_count, $count, 'while "\n"=~/.*/g and while "\n"=~/[^\n]*/g should behave the same';
+ my $anch_count= 0;
+ $anch_count++ while $str=~/^.*/mg;
+ is $anch_count, 1, 'while "\n"=~/^.*/mg should match only once';
+ }
+
+ { # [perl #111174]
+ use re '/u';
+ like "\xe0", qr/(?i:\xc0)/, "(?i: shouldn't lose the passed in /u";
+ use re '/a';
+ unlike "\x{100}", qr/(?i:\w)/, "(?i: shouldn't lose the passed in /a";
+ use re '/aa';
+ unlike 'k', qr/(?i:\N{KELVIN SIGN})/, "(?i: shouldn't lose the passed in /aa";
+ }
+
+ {
+ # the test for whether the pattern should be re-compiled should
+ # consider the UTF8ness of the previous and current pattern
+ # string, as well as the physical bytes of the pattern string
+
+ for my $s ("\xc4\x80", "\x{100}") {
+ ok($s =~ /^$s$/, "re-compile check is UTF8-aware");
+ }
+ }
+
+ # #113682 more overloading and qr//
+ # when doing /foo$overloaded/, if $overloaded returns
+ # a qr/(?{})/ via qr or "" overloading, then 'use re 'eval'
+ # shouldn't be required. Via '.', it still is.
+ {
+ package Qr0;
+ use overload 'qr' => sub { qr/(??{50})/ };
+
+ package Qr1;
+ use overload '""' => sub { qr/(??{51})/ };
+
+ package Qr2;
+ use overload '.' => sub { $_[1] . qr/(??{52})/ };
+
+ package Qr3;
+ use overload '""' => sub { qr/(??{7})/ },
+ '.' => sub { $_[1] . qr/(??{53})/ };
+
+ package Qr_indirect;
+ use overload '""' => sub { $_[0][0] };
+
+ package main;
+
+ for my $i (0..3) {
+ my $o = bless [], "Qr$i";
+ if ((0,0,1,1)[$i]) {
+ eval { "A5$i" =~ /^A$o$/ };
+ like($@, qr/Eval-group not allowed/, "Qr$i");
+ eval { "5$i" =~ /$o/ };
+ like($@, ($i == 3 ? qr/^$/ : qr/no method found,/),
+ "Qr$i bare");
+ {
+ use re 'eval';
+ ok("A5$i" =~ /^A$o$/, "Qr$i - with use re eval");
+ eval { "5$i" =~ /$o/ };
+ like($@, ($i == 3 ? qr/^$/ : qr/no method found,/),
+ "Qr$i bare - with use re eval");
+ }
+ }
+ else {
+ ok("A5$i" =~ /^A$o$/, "Qr$i");
+ ok("5$i" =~ /$o/, "Qr$i bare");
+ }
+ }
+
+ my $o = bless [ bless [], "Qr1" ], 'Qr_indirect';
+ ok("A51" =~ /^A$o/, "Qr_indirect");
+ ok("51" =~ /$o/, "Qr_indirect bare");
+ }
+
+ { # Various flags weren't being set when a [] is optimized into an
+ # EXACTish node
+ ;
+ ;
+ ok("\x{017F}\x{017F}" =~ qr/^[\x{00DF}]?$/i, "[] to EXACTish optimization");
+ }
+
+ {
+ for my $char (":", "\x{f7}", "\x{2010}") {
+ my $utf8_char = $char;
+ utf8::upgrade($utf8_char);
+ my $display = $char;
+ $display = display($display);
+ my $utf8_display = "utf8::upgrade(\"$display\")";
+
+ like($char, qr/^$char?$/, "\"$display\" =~ /^$display?\$/");
+ like($char, qr/^$utf8_char?$/, "my \$p = \"$display\"; utf8::upgrade(\$p); \"$display\" =~ /^\$p?\$/");
+ like($utf8_char, qr/^$char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); \"\$c\" =~ /^$display?\$/");
+ like($utf8_char, qr/^$utf8_char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); my \$p = \"$display\"; utf8::upgrade(\$p); \"\$c\" =~ /^\$p?\$/");
+ }
+ }
+
+ {
+ # #116148: Pattern utf8ness sticks around globally
+ # the utf8 in the first match was sticking around for the second
+ # match
+
+ use feature 'unicode_strings';
+
+ my $x = "\x{263a}";
+ $x =~ /$x/;
+
+ my $text = "Perl";
+ ok("Perl" =~ /P.*$/i, '#116148');
+ }
+
+ { # 118297: Mixing up- and down-graded strings in regex
+ utf8::upgrade(my $u = "\x{e5}");
+ utf8::downgrade(my $d = "\x{e5}");
+ my $warned;
+ local $SIG{__WARN__} = sub { $warned++ if $_[0] =~ /\AMalformed UTF-8/ };
+ my $re = qr/$u$d/;
+ ok(!$warned, "no warnings when interpolating mixed up-/downgraded strings in pattern");
+ my $c = "\x{e5}\x{e5}";
+ utf8::downgrade($c);
+ like($c, $re, "mixed up-/downgraded pattern matches downgraded string");
+ utf8::upgrade($c);
+ like($c, $re, "mixed up-/downgraded pattern matches upgraded string");
+ }
+
+ {
+ # if we have 87 capture buffers defined then \87 should refer to the 87th.
+ # test that this is true for 1..100
+ # Note that this test causes the engine to recurse at runtime, and
+ # hence use a lot of C stack.
+ for my $i (1..100) {
+ my $capture= "a";
+ $capture= "($capture)" for 1 .. $i;
+ for my $mid ("","b") {
+ my $str= "a${mid}a";
+ my $backref= "\\$i";
+ eval {
+ ok($str=~/$capture$mid$backref/,"\\$i works with $i buffers '$str'=~/...$mid$backref/");
+ 1;
+ } or do {
+ is("$@","","\\$i works with $i buffers works with $i buffers '$str'=~/...$mid$backref/");
+ };
+ }
+ }
+ }
+
+ # this mixture of readonly (not COWable) and COWable strings
+ # messed up the capture buffers under COW. The actual test results
+ # are incidental; the issue is was an AddressSanitizer failure
+ {
+ my $c ='AB';
+ my $res = '';
+ for ($c, 'C', $c, 'DE') {
+ ok(/(.)/, "COWable match");
+ $res .= $1;
+ }
+ is($res, "ACAD");
+ }
+
+
+ {
+ # RT #45667
+ # /[#$x]/x didn't interpolate the var $x.
+ my $b = 'cd';
+ my $s = 'abcd$%#&';
+ $s =~ s/[a#$b%]/X/g;
+ is ($s, 'XbXX$XX&', 'RT #45667 without /x');
+ $s = 'abcd$%#&';
+ $s =~ s/[a#$b%]/X/gx;
+ is ($s, 'XbXX$XX&', 'RT #45667 with /x');
+ }
+
+ {
+ no warnings "uninitialized";
+ my @a;
+ $a[1]++;
+ /@a/;
+ pass('no crash with /@a/ when array has nonexistent elems');
+ }
+
+ {
+ is runperl(prog => 'delete $::{qq-\cR-}; //; print qq-ok\n-'),
+ "ok\n",
+ 'deleting *^R does not result in crashes';
+ no warnings 'once';
+ *^R = *caretRglobwithnoscalar;
+ "" =~ /(?{42})/;
+ is $^R, 42, 'assigning to *^R does not result in a crash';
+ is runperl(
+ stderr => 1,
+ prog => 'eval q|'
+ .' q-..- =~ /(??{undef *^R;q--})(?{42})/; '
+ .' print qq-$^R\n-'
+ .'|'
+ ),
+ "42\n",
+ 'undefining *^R within (??{}) does not result in a crash';
+ }
+
+ {
+ # [perl #120446]
+ # this code should be virtually instantaneous. If it takes 10s of
+ # seconds, there a bug in intuit_start.
+ # (this test doesn't actually test for slowness - that involves
+ # too much danger of false positives on loaded machines - but by
+ # putting it here, hopefully someone might notice if it suddenly
+ # runs slowly)
+ my $s = ('a' x 1_000_000) . 'b';
+ my $i = 0;
+ for (1..10_000) {
+ pos($s) = $_;
+ $i++ if $s =~/\Gb/g;
+ }
+ is($i, 0, "RT 120446: mustn't run slowly");
+ }
+
+ {
+ # [perl #120692]
+ # these tests should be virtually instantaneous. If they take 10s of
+ # seconds, there's a bug in intuit_start.
+
+ my $s = 'ab' x 1_000_000;
+ utf8::upgrade($s);
+ 1 while $s =~ m/\Ga+ba+b/g;
+ pass("RT#120692 \\G mustn't run slowly");
+
+ $s=~ /^a{1,2}x/ for 1..10_000;
+ pass("RT#120692 a{1,2} mustn't run slowly");
+
+ $s=~ /ab.{1,2}x/;
+ pass("RT#120692 ab.{1,2} mustn't run slowly");
+
+ $s = "-a-bc" x 250_000;
+ $s .= "1a1bc";
+ utf8::upgrade($s);
+ ok($s =~ /\da\d{0,30000}bc/, "\\d{30000}");
+
+ $s = "-ab\n" x 250_000;
+ $s .= "abx";
+ ok($s =~ /^ab.*x/m, "distant float with /m");
+
+ my $r = qr/^abcd/;
+ $s = "abcd-xyz\n" x 500_000;
+ $s =~ /$r\d{1,2}xyz/m for 1..200;
+ pass("BOL within //m mustn't run slowly");
+
+ $s = "abcdefg" x 1_000_000;
+ $s =~ /(?-m:^)abcX?fg/m for 1..100;
+ pass("BOL within //m mustn't skip absolute anchored check");
+
+ $s = "abcdefg" x 1_000_000;
+ $s =~ /^XX\d{1,10}cde/ for 1..100;
+ pass("abs anchored float string should fail quickly");
+
+ }
+
+ # These are based on looking at the code in regcomp.c
+ # We don't look for specific code, just the existence of an SSC
+ foreach my $re (qw( qr/a?c/
+ qr/a?c/i
+ qr/[ab]?c/
+ qr/\R?c/
+ qr/\d?c/d
+ qr/\w?c/l
+ qr/\s?c/a
+ qr/[[:alpha:]]?c/u
+ )) {
+ SKIP: {
+ skip "no re-debug under miniperl" if is_miniperl;
+ my $prog = <<"EOP";
+use re qw(Debug COMPILE);
+$re;
+EOP
+ fresh_perl_like($prog, qr/synthetic stclass/, { stderr=>1 }, "$re generates a synthetic start class");
+ }
+ }
+
+ {
+ like "\x{AA}", qr/a?[\W_]/d, "\\W with /d synthetic start class works";
+ }
+
+ {
+ # Verify that the very last Latin-1 U+00FF
+ # (LATIN SMALL LETTER Y WITH DIAERESIS)
+ # and its UPPER counterpart (U+0178 which is pure Unicode),
+ # and likewise for the very first pure Unicode
+ # (LATIN CAPITAL LETTER A WITH MACRON) fold-match properly,
+ # and there are no off-by-one logic errors in the transition zone.
+
+ ok("\xFF" =~ /\xFF/i, "Y WITH DIAERESIS l =~ l");
+ ok("\xFF" =~ /\x{178}/i, "Y WITH DIAERESIS l =~ u");
+ ok("\x{178}" =~ /\xFF/i, "Y WITH DIAERESIS u =~ l");
+ ok("\x{178}" =~ /\x{178}/i, "Y WITH DIAERESIS u =~ u");
+
+ # U+00FF with U+05D0 (non-casing Hebrew letter).
+ ok("\xFF\x{5D0}" =~ /\xFF\x{5D0}/i, "Y WITH DIAERESIS l =~ l");
+ ok("\xFF\x{5D0}" =~ /\x{178}\x{5D0}/i, "Y WITH DIAERESIS l =~ u");
+ ok("\x{178}\x{5D0}" =~ /\xFF\x{5D0}/i, "Y WITH DIAERESIS u =~ l");
+ ok("\x{178}\x{5D0}" =~ /\x{178}\x{5D0}/i, "Y WITH DIAERESIS u =~ u");
+
+ # U+0100.
+ ok("\x{100}" =~ /\x{100}/i, "A WITH MACRON u =~ u");
+ ok("\x{100}" =~ /\x{101}/i, "A WITH MACRON u =~ l");
+ ok("\x{101}" =~ /\x{100}/i, "A WITH MACRON l =~ u");
+ ok("\x{101}" =~ /\x{101}/i, "A WITH MACRON l =~ l");
+ }
+
+ {
+ use utf8;
+ ok("abc" =~ /a\85b\85c/x, "NEL is white-space under /x");
+ }
+
+ {
+ ok('a(b)c' =~ qr(a\(b\)c), "'\\(' is a literal in qr(...)");
+ ok('a[b]c' =~ qr[a\[b\]c], "'\\[' is a literal in qr[...]");
+ ok('a{3}c' =~ qr{a\{3\}c}, # Only failed when { could be a meta
+ "'\\{' is a literal in qr{...}, where it could be a quantifier");
+
+ # This one is for completeness
+ ok('a<b>c' =~ qr<a\<b\>c>, "'\\<' is a literal in qr<...>)");
+ }
+
+ { # Was getting optimized into EXACT (non-folding node)
+ my $x = qr/[x]/i;
+ utf8::upgrade($x);
+ like("X", qr/$x/, "UTF-8 of /[x]/i matches upper case");
+ }
+