This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #123843] fix SEGV reading data->flags
[perl5.git] / t / re / pat.t
index b53853b..137a049 100644 (file)
@@ -15,12 +15,14 @@ $| = 1;
 
 BEGIN {
     chdir 't' if -d 't';
-    @INC = ('../lib','.');
+    @INC = ('../lib','.','../ext/re');
     require Config; import Config;
     require './test.pl';
+    skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader;
+    skip_all_without_unicode_tables();
 }
 
-plan tests => 711;  # Update this when adding/deleting tests.
+plan tests => 772;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -522,11 +524,7 @@ sub run_tests {
       SKIP: {
             skip 'No locale testing without d_setlocale', 1 if(!$Config{d_setlocale});
 
-            BEGIN {
-                if($Config{d_setlocale}) {
-                    require locale; import locale;
-                }
-            }
+            use locale;
             $locale = qr/\b\v$/;
             is($locale,    '(?^l:\b\v$)', 'Verify has l modifier when compiled under use locale');
             no locale;
@@ -555,11 +553,7 @@ sub run_tests {
       SKIP: {
             skip 'No locale testing without d_setlocale', 2 if(!$Config{d_setlocale});
 
-             BEGIN {
-                if($Config{d_setlocale}) {
-                    require locale; import locale;
-                }
-            }
+             use locale;
             is(qr/abc$dual/,    '(?^l:abc(?^:\b\v$))', 'Verify retains d meaning when interpolated under locale');
             is(qr/abc$unicode/,    '(?^l:abc(?^u:\b\v$))', 'Verify retains u when interpolated under locale');
         }
@@ -755,6 +749,8 @@ sub run_tests {
        ok($_ =~ /^abc\Gdef$/, $message);
        pos = 3;
        ok($_ =~ /c\Gd/, $message);
+       pos = 3;
+       ok($_ =~ /..\GX?def/, $message);
     }
 
     {
@@ -1401,13 +1397,6 @@ EOP
        ok("Perl" =~ /P.*$/i, '#116148');
     }
 
-    { # 117327: Sequence (?#...) not recognized in regex
-      # The space between the '(' and '?' is now deprecated; this test should
-      # be removed when the deprecation is made fatal.
-        no warnings;
-        like("ab", qr/a( ?#foo)b/x);
-    }
-
     { # 118297: Mixing up- and down-graded strings in regex
         utf8::upgrade(my $u = "\x{e5}");
         utf8::downgrade(my $d = "\x{e5}");
@@ -1496,21 +1485,43 @@ EOP
           'undefining *^R within (??{}) does not result in a crash';
     }
 
-    {
-        # [perl #120446]
-        # this code should be virtually instantaneous. If it takes 10s of
-        # seconds, there a bug in intuit_start.
-        # (this test doesn't actually test for slowness - that involves
-        # too much danger of false positives on loaded machines - but by
-        # putting it here, hopefully someone might notice if it suddenly
-        # runs slowly)
-        my $s = ('a' x 1_000_000) . 'b';
-        my $i = 0;
-        for (1..10_000) {
-            pos($s) = $_;
-            $i++ if $s =~/\Gb/g;
+    SKIP: {   # Test literal range end point special handling
+        unless ($::IS_EBCDIC) {
+            skip "Valid only for EBCDIC", 24;
         }
-        is($i, 0, "RT 120446: mustn't run slowly");
+
+        like("\x89", qr/[i-j]/, '"\x89" should match [i-j]');
+        unlike("\x8A", qr/[i-j]/, '"\x8A" shouldnt match [i-j]');
+        unlike("\x90", qr/[i-j]/, '"\x90" shouldnt match [i-j]');
+        like("\x91", qr/[i-j]/, '"\x91" should match [i-j]');
+
+        like("\x89", qr/[i-\N{LATIN SMALL LETTER J}]/, '"\x89" should match [i-\N{LATIN SMALL LETTER J}]');
+        unlike("\x8A", qr/[i-\N{LATIN SMALL LETTER J}]/, '"\x8A" shouldnt match [i-\N{LATIN SMALL LETTER J}]');
+        unlike("\x90", qr/[i-\N{LATIN SMALL LETTER J}]/, '"\x90" shouldnt match [i-\N{LATIN SMALL LETTER J}]');
+        like("\x91", qr/[i-\N{LATIN SMALL LETTER J}]/, '"\x91" should match [i-\N{LATIN SMALL LETTER J}]');
+
+        like("\x89", qr/[i-\N{U+6A}]/, '"\x89" should match [i-\N{U+6A}]');
+        unlike("\x8A", qr/[i-\N{U+6A}]/, '"\x8A" shouldnt match [i-\N{U+6A}]');
+        unlike("\x90", qr/[i-\N{U+6A}]/, '"\x90" shouldnt match [i-\N{U+6A}]');
+        like("\x91", qr/[i-\N{U+6A}]/, '"\x91" should match [i-\N{U+6A}]');
+
+        like("\x89", qr/[\N{U+69}-\N{U+6A}]/, '"\x89" should match [\N{U+69}-\N{U+6A}]');
+        unlike("\x8A", qr/[\N{U+69}-\N{U+6A}]/, '"\x8A" shouldnt match [\N{U+69}-\N{U+6A}]');
+        unlike("\x90", qr/[\N{U+69}-\N{U+6A}]/, '"\x90" shouldnt match [\N{U+69}-\N{U+6A}]');
+        like("\x91", qr/[\N{U+69}-\N{U+6A}]/, '"\x91" should match [\N{U+69}-\N{U+6A}]');
+
+        like("\x89", qr/[i-\x{91}]/, '"\x89" should match [i-\x{91}]');
+        like("\x8A", qr/[i-\x{91}]/, '"\x8A" should match [i-\x{91}]');
+        like("\x90", qr/[i-\x{91}]/, '"\x90" should match [i-\x{91}]');
+        like("\x91", qr/[i-\x{91}]/, '"\x91" should match [i-\x{91}]');
+
+        # Need to use eval, because tries to compile on ASCII platforms even
+        # though the tests are skipped, and fails because 0x89-j is an illegal
+        # range there.
+        like("\x89", eval "qr/[\x{89}-j]/", '"\x89" should match [\x{89}-j]');
+        like("\x8A", eval "qr/[\x{89}-j]/", '"\x8A" should match [\x{89}-j]');
+        like("\x90", eval "qr/[\x{89}-j]/", '"\x90" should match [\x{89}-j]');
+        like("\x91", eval "qr/[\x{89}-j]/", '"\x91" should match [\x{89}-j]');
     }
 
     # These are based on looking at the code in regcomp.c
@@ -1522,7 +1533,7 @@ EOP
                             qr/\d?c/d
                             qr/\w?c/l
                             qr/\s?c/a
-                            qr/[[:alpha:]]?c/u
+                            qr/[[:lower:]]?c/u
     )) {
       SKIP: {
         skip "no re-debug under miniperl" if is_miniperl;
@@ -1534,7 +1545,145 @@ EOP
       }
     }
 
+    {
+        like "\x{AA}", qr/a?[\W_]/d, "\\W with /d synthetic start class works";
+    }
+
+    {
+        # Verify that the very last Latin-1 U+00FF
+        # (LATIN SMALL LETTER Y WITH DIAERESIS)
+        # and its UPPER counterpart (U+0178 which is pure Unicode),
+        # and likewise for the very first pure Unicode
+        # (LATIN CAPITAL LETTER A WITH MACRON) fold-match properly,
+        # and there are no off-by-one logic errors in the transition zone.
+
+        ok("\xFF" =~ /\xFF/i, "Y WITH DIAERESIS l =~ l");
+        ok("\xFF" =~ /\x{178}/i, "Y WITH DIAERESIS l =~ u");
+        ok("\x{178}" =~ /\xFF/i, "Y WITH DIAERESIS u =~ l");
+        ok("\x{178}" =~ /\x{178}/i, "Y WITH DIAERESIS u =~ u");
+
+        # U+00FF with U+05D0 (non-casing Hebrew letter).
+        ok("\xFF\x{5D0}" =~ /\xFF\x{5D0}/i, "Y WITH DIAERESIS l =~ l");
+        ok("\xFF\x{5D0}" =~ /\x{178}\x{5D0}/i, "Y WITH DIAERESIS l =~ u");
+        ok("\x{178}\x{5D0}" =~ /\xFF\x{5D0}/i, "Y WITH DIAERESIS u =~ l");
+        ok("\x{178}\x{5D0}" =~ /\x{178}\x{5D0}/i, "Y WITH DIAERESIS u =~ u");
+
+        # U+0100.
+        ok("\x{100}" =~ /\x{100}/i, "A WITH MACRON u =~ u");
+        ok("\x{100}" =~ /\x{101}/i, "A WITH MACRON u =~ l");
+        ok("\x{101}" =~ /\x{100}/i, "A WITH MACRON l =~ u");
+        ok("\x{101}" =~ /\x{101}/i, "A WITH MACRON l =~ l");
+    }
+
+    {
+        use utf8;
+        ok("abc" =~ /a\85b\85c/x, "NEL is white-space under /x");
+    }
+
+    {
+        ok('a(b)c' =~ qr(a\(b\)c), "'\\(' is a literal in qr(...)");
+        ok('a[b]c' =~ qr[a\[b\]c], "'\\[' is a literal in qr[...]");
+        ok('a{3}c' =~ qr{a\{3\}c},  # Only failed when { could be a meta
+              "'\\{' is a literal in qr{...}, where it could be a quantifier");
 
+        # This one is for completeness
+        ok('a<b>c' =~ qr<a\<b\>c>, "'\\<' is a literal in qr<...>)");
+    }
+
+    {   # Was getting optimized into EXACT (non-folding node)
+        my $x = qr/[x]/i;
+        utf8::upgrade($x);
+        like("X", qr/$x/, "UTF-8 of /[x]/i matches upper case");
+    }
+
+    {   # make sure we get an error when \p{} cannot load Unicode tables
+        fresh_perl_like(<<'        prog that cannot load uni tables',
+            BEGIN {
+                @INC = '../lib';
+                require utf8; require 'utf8_heavy.pl';
+                @INC = ();
+            }
+            $name = 'A B';
+            if ($name =~ /(\p{IsUpper}) (\p{IsUpper})/){
+                print "It's good! >$1< >$2<\n";
+            } else {
+                print "It's not good...\n";
+            }
+        prog that cannot load uni tables
+                  qr/^Can't locate unicore\/Heavy\.pl(?x:
+                   )|^Can't find Unicode property definition/,
+                  undef,
+                 '\p{} should not fail silently when uni tables evanesce');
+    }
+
+    {   # Special handling of literal-ended ranges in [...] was breaking this
+        use utf8;
+        like("ÿ", qr/[ÿ-ÿ]/, "\"ÿ\" should match [ÿ-ÿ]");
+    }
+
+    {  # [perl #123539]
+        like("TffffffffffffTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT5TTTTTTTTTTTTTTTTTTTTTTTTT3TTgTTTTTTTTTTTTTTTTTTTTT2TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHiHHHHHHHfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff&ffff", qr/TffffffffffffTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT5TTTTTTTTTTTTTTTTTTTTTTTTT3TTgTTTTTTTTTTTTTTTTTTTTT2TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHiHHHHHHHfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff&ffff/il, "");
+        like("TffffffffffffT\x{100}TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT5TTTTTTTTTTTTTTTTTTTTTTTTT3TTgTTTTTTTTTTTTTTTTTTTTT2TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHiHHHHHHHfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff&ffff", qr/TffffffffffffT\x{100}TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT5TTTTTTTTTTTTTTTTTTTTTTTTT3TTgTTTTTTTTTTTTTTTTTTTTT2TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHiHHHHHHHfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff&ffff/il, "");
+    }
+
+       {       # [perl #123604]
+               my($s, $x, @x) = ('abc', 'a', 'd');
+               my $long = 'b' x 2000;
+               my $eval = q{$s =~ m{$x[bbb]c} ? 1 : 0};
+               $eval =~ s{bbb}{$long};
+               my $match = eval $eval;
+               ok(1, "did not crash");
+               ok($match, "[bbb...] resolved as character class, not subscript");
+       }
+
+       {       # [perl #123755]
+               for my $pat ('(??', '(?P', '(?i-') {
+                       eval qq{ qr/$pat/ };
+                       ok(1, "qr/$pat/ did not crash");
+                       eval qq{ qr/${pat}\x{123}/ };
+                       my $e = $@;
+                       like($e, qr{\x{123}},
+                               "qr/${pat}x/ shows x in error even if it's a wide character");
+               }
+       }
+
+       {
+               # Expect one of these sizes to cause overflow and wrap to negative
+               for my $bits (32, 64) {
+                       my $wrapneg = 2 ** ($bits - 2) * 3;
+                       for my $sign ('', '-') {
+                               my $pat = sprintf "qr/(?%s%u)/", $sign, $wrapneg;
+                               eval $pat;
+                               ok(1, "big backref $pat did not crash");
+                       }
+               }
+       }
+        {
+            # Test that we handle qr/\8888888/ and variants without an infinite loop,
+            # we use a test within a test so we can todo it, and make sure we don't
+            # infinite loop our tests.
+            # NOTE - Do not put quotes in the code!
+            # NOTE - We have to triple escape the backref in the pattern below.
+            my $code='
+                BEGIN{require q(test.pl);}
+                watchdog(3);
+                for my $len (1 .. 20) {
+                    my $eights= q(8) x $len;
+                    eval qq{ qr/\\\\$eights/ };
+                }
+                print q(No infinite loop here!);
+            ';
+            fresh_perl_is($code, "No infinite loop here!", {},
+                "test that we handle things like m/\\888888888/ without infinite loops" );
+        }
+
+       {
+               # [perl #123843] hits SEGV trying to compile this pattern
+               my $match;
+               eval q{ ($match) = ("xxyxxyxy" =~ m{(x+(y(?1))*)}) };
+               ok(1, "compiled GOSUB in CURLYM ok");
+               is($match, 'xxyxxyx', "matched GOSUB in CURLYM");
+       }
 } # End of sub run_tests
 
 1;