Add test for \p{} failing silently

[perl5.git] / t / re / pat.t
diff --git a/t/re/pat.t b/t/re/pat.t

index 05bb650..ba64d29 100644 (file)
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -15,12 +15,12 @@ $| = 1;
  
  BEGIN {
      chdir 't' if -d 't';
-    @INC = ('../lib','.');
+    @INC = ('../lib','.','../ext/re');
      require Config; import Config;
      require './test.pl';
  }
  
-plan tests => 467;  # Update this when adding/deleting tests.
+plan tests => 739;  # Update this when adding/deleting tests.
  
  run_tests() unless caller;
  
@@ -522,11 +522,7 @@ sub run_tests {
        SKIP: {
              skip 'No locale testing without d_setlocale', 1 if(!$Config{d_setlocale});
  
-            BEGIN {
-                if($Config{d_setlocale}) {
-                    require locale; import locale;
-                }
-            }
+            use locale;
              $locale = qr/\b\v$/;
              is($locale,    '(?^l:\b\v$)', 'Verify has l modifier when compiled under use locale');
              no locale;
@@ -555,11 +551,7 @@ sub run_tests {
        SKIP: {
              skip 'No locale testing without d_setlocale', 2 if(!$Config{d_setlocale});
  
-             BEGIN {
-                if($Config{d_setlocale}) {
-                    require locale; import locale;
-                }
-            }
+             use locale;
              is(qr/abc$dual/,    '(?^l:abc(?^:\b\v$))', 'Verify retains d meaning when interpolated under locale');
              is(qr/abc$unicode/,    '(?^l:abc(?^u:\b\v$))', 'Verify retains u when interpolated under locale');
          }
@@ -703,6 +695,14 @@ sub run_tests {
          /.(a)(ba*)?/;
          is($#+, 2, $message);
          is($#-, 1, $message);
+
+        # Check that values don’t stick
+        "     "=~/()()()(.)(..)/;
+        my($m,$p) = (\$-[5], \$+[5]);
+        () = "$$_" for $m, $p; # FETCH (or eqv.)
+        " " =~ /()/;
+        is $$m, undef, 'values do not stick to @- elements';
+        is $$p, undef, 'values do not stick to @+ elements';
      }
  
      foreach ('$+[0] = 13', '$-[0] = 13', '@+ = (7, 6, 5)',
@@ -726,11 +726,39 @@ sub run_tests {
          like($str, qr/^..\G/, $message);
          unlike($str, qr/^...\G/, $message);
          ok($str =~ /\G../ && $& eq 'cd', $message);
-
-        local $::TODO = $::running_as_thread;
          ok($str =~ /.\G./ && $& eq 'bc', $message);
+
+    }
+
+    {
+        my $message = '\G and intuit and anchoring';
+       $_ = "abcdef";
+       pos = 0;
+       ok($_ =~ /\Gabc/, $message);
+       ok($_ =~ /^\Gabc/, $message);
+
+       pos = 3;
+       ok($_ =~ /\Gdef/, $message);
+       pos = 3;
+       ok($_ =~ /\Gdef$/, $message);
+       pos = 3;
+       ok($_ =~ /abc\Gdef$/, $message);
+       pos = 3;
+       ok($_ =~ /^abc\Gdef$/, $message);
+       pos = 3;
+       ok($_ =~ /c\Gd/, $message);
+       pos = 3;
+       ok($_ =~ /..\GX?def/, $message);
+    }
+
+    {
+        my $s = '123';
+        pos($s) = 1;
+        my @a = $s =~ /(\d)\G/g; # this infinitely looped up till 5.19.1
+        is("@a", "1", '\G looping');
      }
  
+
      {
          my $message = 'pos inside (?{ })';
          my $str = 'abcde';
@@ -799,22 +827,19 @@ sub run_tests {
          my $message = '\G anchor checks';
          my $foo = 'aabbccddeeffgg';
          pos ($foo) = 1;
-        {
-            local $::TODO = $::running_as_thread;
-            no warnings 'uninitialized';
-            ok($foo =~ /.\G(..)/g, $message);
-            is($1, 'ab', $message);
  
-            pos ($foo) += 1;
-            ok($foo =~ /.\G(..)/g, $message);
-            is($1, 'cc', $message);
+       ok($foo =~ /.\G(..)/g, $message);
+       is($1, 'ab', $message);
  
-            pos ($foo) += 1;
-            ok($foo =~ /.\G(..)/g, $message);
-            is($1, 'de', $message);
+       pos ($foo) += 1;
+       ok($foo =~ /.\G(..)/g, $message);
+       is($1, 'cc', $message);
  
-            ok($foo =~ /\Gef/g, $message);
-        }
+       pos ($foo) += 1;
+       ok($foo =~ /.\G(..)/g, $message);
+       is($1, 'de', $message);
+
+       ok($foo =~ /\Gef/g, $message);
  
          undef pos $foo;
          ok($foo =~ /\G(..)/g, $message);
@@ -829,6 +854,36 @@ sub run_tests {
      }
  
      {
+        my $message = 'basic \G floating checks';
+        my $foo = 'aabbccddeeffgg';
+        pos ($foo) = 1;
+
+       ok($foo =~ /a+\G(..)/g, "$message: a+\\G");
+       is($1, 'ab', "$message: ab");
+
+       pos ($foo) += 1;
+       ok($foo =~ /b+\G(..)/g, "$message: b+\\G");
+       is($1, 'cc', "$message: cc");
+
+       pos ($foo) += 1;
+       ok($foo =~ /d+\G(..)/g, "$message: d+\\G");
+       is($1, 'de', "$message: de");
+
+       ok($foo =~ /\Gef/g, "$message: \\Gef");
+
+        pos ($foo) = 1;
+
+       ok($foo =~ /(?=a+\G)(..)/g, "$message: (?a+\\G)");
+       is($1, 'aa', "$message: aa");
+
+        pos ($foo) = 2;
+
+       ok($foo =~ /a(?=a+\G)(..)/g, "$message: a(?=a+\\G)");
+       is($1, 'ab', "$message: ab");
+
+    }
+
+    {
          $_ = '123x123';
          my @res = /(\d*|x)/g;
          local $" = '|';
@@ -1156,12 +1211,10 @@ sub run_tests {
          local $SIG{__WARN__} = sub {};
          my $str = "\x{110000}";
  
-        # No non-unicode code points match any Unicode property, even inverse
-        # ones
-        unlike($str, qr/\p{ASCII_Hex_Digit=True}/, "Non-Unicode doesn't match \\p{}");
-        unlike($str, qr/\p{ASCII_Hex_Digit=False}/, "Non-Unicode doesn't match \\p{}");
-        like($str, qr/\P{ASCII_Hex_Digit=True}/, "Non-Unicode matches \\P{}");
-        like($str, qr/\P{ASCII_Hex_Digit=False}/, "Non-Unicode matches \\P{}");
+        unlike($str, qr/\p{ASCII_Hex_Digit=True}/, "Non-Unicode doesn't match \\p{AHEX=True}");
+        like($str, qr/\p{ASCII_Hex_Digit=False}/, "Non-Unicode matches \\p{AHEX=False}");
+        like($str, qr/\P{ASCII_Hex_Digit=True}/, "Non-Unicode matches \\P{AHEX=True}");
+        unlike($str, qr/\P{ASCII_Hex_Digit=False}/, "Non-Unicode matches \\P{AHEX=FALSE}");
      }
  
      {
@@ -1187,7 +1240,7 @@ use utf8;;
  "abc" =~ qr/(?<$char>abc)/;
  EOP
              utf8::encode($prog);
-            fresh_perl_like($prog, qr!Group name must start with a non-digit word character!, "",
+            fresh_perl_like($prog, qr!Group name must start with a non-digit word character!, {},
                          sprintf("'U+%04X not legal IDFirst'", ord($char)));
          }
      }
@@ -1342,14 +1395,242 @@ EOP
         ok("Perl" =~ /P.*$/i, '#116148');
      }
  
-    { # 117327: Sequence (?#...) not recognized in regex
-      # The space between the '(' and '?' is now deprecated; this test should
-      # be removed when the deprecation is made fatal.
-        no warnings;
-        like("ab", qr/a( ?#foo)b/x);
+    { # 118297: Mixing up- and down-graded strings in regex
+        utf8::upgrade(my $u = "\x{e5}");
+        utf8::downgrade(my $d = "\x{e5}");
+        my $warned;
+        local $SIG{__WARN__} = sub { $warned++ if $_[0] =~ /\AMalformed UTF-8/ };
+        my $re = qr/$u$d/;
+        ok(!$warned, "no warnings when interpolating mixed up-/downgraded strings in pattern");
+        my $c = "\x{e5}\x{e5}";
+        utf8::downgrade($c);
+        like($c, $re, "mixed up-/downgraded pattern matches downgraded string");
+        utf8::upgrade($c);
+        like($c, $re, "mixed up-/downgraded pattern matches upgraded string");
+    }
+
+    {
+        # if we have 87 capture buffers defined then \87 should refer to the 87th.
+        # test that this is true for 1..100
+        # Note that this test causes the engine to recurse at runtime, and
+        # hence use a lot of C stack.
+        for my $i (1..100) {
+            my $capture= "a";
+            $capture= "($capture)" for 1 .. $i;
+            for my $mid ("","b") {
+                my $str= "a${mid}a";
+                my $backref= "\\$i";
+                eval {
+                    ok($str=~/$capture$mid$backref/,"\\$i works with $i buffers '$str'=~/...$mid$backref/");
+                    1;
+                } or do {
+                    is("$@","","\\$i works with $i buffers works with $i buffers '$str'=~/...$mid$backref/");
+                };
+            }
+        }
+    }
+
+    # this mixture of readonly (not COWable) and COWable strings
+    # messed up the capture buffers under COW. The actual test results
+    # are incidental; the issue was an AddressSanitizer failure
+    {
+       my $c ='AB';
+       my $res = '';
+       for ($c, 'C', $c, 'DE') {
+           ok(/(.)/, "COWable match");
+           $res .= $1;
+       }
+       is($res, "ACAD");
+    }
+
+
+    {
+       # RT #45667
+       # /[#$x]/x didn't interpolate the var $x.
+       my $b = 'cd';
+       my $s = 'abcd$%#&';
+       $s =~ s/[a#$b%]/X/g;
+       is ($s, 'XbXX$XX&', 'RT #45667 without /x');
+       $s = 'abcd$%#&';
+       $s =~ s/[a#$b%]/X/gx;
+       is ($s, 'XbXX$XX&', 'RT #45667 with /x');
+    }
+
+    {
+       no warnings "uninitialized";
+       my @a;
+       $a[1]++;
+       /@a/;
+       pass('no crash with /@a/ when array has nonexistent elems');
+    }
+
+    {
+       is runperl(prog => 'delete $::{qq-\cR-}; //; print qq-ok\n-'),
+          "ok\n",
+          'deleting *^R does not result in crashes';
+       no warnings 'once';
+       *^R = *caretRglobwithnoscalar;
+       "" =~ /(?{42})/;
+       is $^R, 42, 'assigning to *^R does not result in a crash';
+       is runperl(
+            stderr => 1,
+            prog => 'eval q|'
+                   .' q-..- =~ /(??{undef *^R;q--})(?{42})/; '
+                    .' print qq-$^R\n-'
+                   .'|'
+          ),
+          "42\n",
+          'undefining *^R within (??{}) does not result in a crash';
+    }
+
+    {
+        # [perl #120446]
+        # this code should be virtually instantaneous. If it takes 10s of
+        # seconds, there's a bug in intuit_start.
+        # (this test doesn't actually test for slowness - that involves
+        # too much danger of false positives on loaded machines - but by
+        # putting it here, hopefully someone might notice if it suddenly
+        # runs slowly)
+        my $s = ('a' x 1_000_000) . 'b';
+        my $i = 0;
+        for (1..10_000) {
+            pos($s) = $_;
+            $i++ if $s =~/\Gb/g;
+        }
+        is($i, 0, "RT 120446: mustn't run slowly");
+    }
+
+    {
+        # [perl #120692]
+        # these tests should be virtually instantaneous. If they take 10s of
+        # seconds, there's a bug in intuit_start.
+
+        my $s = 'ab' x 1_000_000;
+        utf8::upgrade($s);
+        1 while $s =~ m/\Ga+ba+b/g;
+        pass("RT#120692 \\G mustn't run slowly");
+
+        $s=~ /^a{1,2}x/ for  1..10_000;
+        pass("RT#120692 a{1,2} mustn't run slowly");
+
+        $s=~ /ab.{1,2}x/;
+        pass("RT#120692 ab.{1,2} mustn't run slowly");
+
+        $s = "-a-bc" x 250_000;
+        $s .= "1a1bc";
+        utf8::upgrade($s);
+        ok($s =~ /\da\d{0,30000}bc/, "\\d{30000}");
+
+        $s = "-ab\n" x 250_000;
+        $s .= "abx";
+        ok($s =~ /^ab.*x/m, "distant float with /m");
+
+        my $r = qr/^abcd/;
+        $s = "abcd-xyz\n" x 500_000;
+        $s =~ /$r\d{1,2}xyz/m for 1..200;
+        pass("BOL within //m  mustn't run slowly");
+
+        $s = "abcdefg" x 1_000_000;
+        $s =~ /(?-m:^)abcX?fg/m for 1..100;
+        pass("BOL within //m  mustn't skip absolute anchored check");
+
+        $s = "abcdefg" x 1_000_000;
+        $s =~ /^XX\d{1,10}cde/ for 1..100;
+        pass("abs anchored float string should fail quickly");
+
+    }
+
+    # These are based on looking at the code in regcomp.c
+    # We don't look for specific code, just the existence of an SSC
+    foreach my $re (qw(     qr/a?c/
+                            qr/a?c/i
+                            qr/[ab]?c/
+                            qr/\R?c/
+                            qr/\d?c/d
+                            qr/\w?c/l
+                            qr/\s?c/a
+                            qr/[[:alpha:]]?c/u
+    )) {
+      SKIP: {
+        skip "no re-debug under miniperl" if is_miniperl;
+        my $prog = <<"EOP";
+use re qw(Debug COMPILE);
+$re;
+EOP
+        fresh_perl_like($prog, qr/synthetic stclass/, { stderr=>1 }, "$re generates a synthetic start class");
+      }
+    }
+
+    {
+        like "\x{AA}", qr/a?[\W_]/d, "\\W with /d synthetic start class works";
+    }
+
+    {
+        # Verify that the very last Latin-1 U+00FF
+        # (LATIN SMALL LETTER Y WITH DIAERESIS)
+        # and its UPPER counterpart (U+0178 which is pure Unicode),
+        # and likewise for the very first pure Unicode
+        # (LATIN CAPITAL LETTER A WITH MACRON) fold-match properly,
+        # and there are no off-by-one logic errors in the transition zone.
+
+        ok("\xFF" =~ /\xFF/i, "Y WITH DIAERESIS l =~ l");
+        ok("\xFF" =~ /\x{178}/i, "Y WITH DIAERESIS l =~ u");
+        ok("\x{178}" =~ /\xFF/i, "Y WITH DIAERESIS u =~ l");
+        ok("\x{178}" =~ /\x{178}/i, "Y WITH DIAERESIS u =~ u");
+
+        # U+00FF with U+05D0 (non-casing Hebrew letter).
+        ok("\xFF\x{5D0}" =~ /\xFF\x{5D0}/i, "Y WITH DIAERESIS l =~ l");
+        ok("\xFF\x{5D0}" =~ /\x{178}\x{5D0}/i, "Y WITH DIAERESIS l =~ u");
+        ok("\x{178}\x{5D0}" =~ /\xFF\x{5D0}/i, "Y WITH DIAERESIS u =~ l");
+        ok("\x{178}\x{5D0}" =~ /\x{178}\x{5D0}/i, "Y WITH DIAERESIS u =~ u");
+
+        # U+0100.
+        ok("\x{100}" =~ /\x{100}/i, "A WITH MACRON u =~ u");
+        ok("\x{100}" =~ /\x{101}/i, "A WITH MACRON u =~ l");
+        ok("\x{101}" =~ /\x{100}/i, "A WITH MACRON l =~ u");
+        ok("\x{101}" =~ /\x{101}/i, "A WITH MACRON l =~ l");
+    }
+
+    {
+        use utf8;
+        ok("abc" =~ /a\85b\85c/x, "NEL is white-space under /x");
+    }
+
+    {
+        ok('a(b)c' =~ qr(a\(b\)c), "'\\(' is a literal in qr(...)");
+        ok('a[b]c' =~ qr[a\[b\]c], "'\\[' is a literal in qr[...]");
+        ok('a{3}c' =~ qr{a\{3\}c},  # Only failed when { could be a meta
+              "'\\{' is a literal in qr{...}, where it could be a quantifier");
+
+        # This one is for completeness
+        ok('a<b>c' =~ qr<a\<b\>c>, "'\\<' is a literal in qr<...>)");
      }
  
+    {   # Was getting optimized into EXACT (non-folding node)
+        my $x = qr/[x]/i;
+        utf8::upgrade($x);
+        like("X", qr/$x/, "UTF-8 of /[x]/i matches upper case");
+    }
  
+    {   # make sure we get an error when \p{} cannot load Unicode tables
+        fresh_perl_like(<<'        prog that cannot load uni tables',
+            BEGIN {
+                @INC = '../lib';
+                require utf8; require 'utf8_heavy.pl';
+                @INC = ();
+            }
+            $name = 'A B';
+            if ($name =~ /(\p{IsUpper}) (\p{IsUpper})/){
+                print "It's good! >$1< >$2<\n";
+            } else {
+                print "It's not good...\n";
+            }
+        prog that cannot load uni tables
+                  qr/^Can't locate unicore\/Heavy\.pl(?x:
+                   )|^Can't find Unicode property definition/,
+                  undef,
+                 '\p{} should not fail silently when uni tables evanesce');
+    }
  } # End of sub run_tests
  
  1;