X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/42e9b60980bb8e29e76629e14c6aa945194c0647..81d11450691ee281f37c6c4e8055735b972733bd:/t/re/pat.t diff --git a/t/re/pat.t b/t/re/pat.t index 16bfc8e..413fbee 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -6,6 +6,7 @@ use strict; use warnings; +no warnings 'experimental::vlb'; use 5.010; sub run_tests; @@ -20,10 +21,10 @@ BEGIN { require './loc_tools.pl'; set_up_inc('../lib', '.', '../ext/re'); } - skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader; - skip_all_without_unicode_tables(); -plan tests => 837; # Update this when adding/deleting tests. +skip_all_without_unicode_tables(); + +plan tests => 1012; # Update this when adding/deleting tests. run_tests() unless caller; @@ -31,6 +32,7 @@ run_tests() unless caller; # Tests start here. # sub run_tests { + my $sharp_s = uni_to_native("\xdf"); { my $x = "abc\ndef\n"; @@ -138,6 +140,21 @@ sub run_tests { $null = ""; $xyz =~ /$null/; is($&, $xyz, $message); + + # each entry: regexp, match string, $&, //o match success + my @tests = + ( + [ "", "xy", "x", 1 ], + [ "y", "yz", "y", !1 ], + ); + for my $test (@tests) { + my ($re, $str, $matched, $omatch) = @$test; + $xyz =~ /x/o; + ok($str =~ /$re/, "$str matches /$re/"); + is($&, $matched, "on $matched"); + $xyz =~ /x/o; + is($str =~ /$re/o, $omatch, "$str matches /$re/o (or not)"); + } } { @@ -302,7 +319,7 @@ sub run_tests { # Defaults assumed if this fails eval { require Config; }; - $::reg_infty = $Config::Config{reg_infty} // 32767; + $::reg_infty = $Config::Config{reg_infty} // 65535; $::reg_infty_m = $::reg_infty - 1; $::reg_infty_p = $::reg_infty + 1; $::reg_infty_m = $::reg_infty_m; # Suppress warning. @@ -322,6 +339,11 @@ sub run_tests { like($@, qr/^\QQuantifier in {,} bigger than/, $message); eval "'aaa' =~ /a{1,$::reg_infty_p}/"; like($@, qr/^\QQuantifier in {,} bigger than/, $message); + + # It should be 'a' x 2147483647, but that exhausts memory on + # reasonably sized modern machines + like('a' x $::reg_infty_p, qr/a{1,}/, + "{1,} matches more times than REG_INFTY"); } { @@ -331,9 +353,14 @@ sub run_tests { ok $@ =~ /^\QLookbehind longer than 255 not/, "Lookbehind limit"; } - { - # Long Monsters - for my $l (125, 140, 250, 270, 300000, 30) { # Ordered to free memory + SKIP: + { # Long Monsters + + my @trials = (125, 140, 250, 270, 300000, 30); + + skip('limited memory', @trials * 4) if $ENV{'PERL_SKIP_BIG_MEM_TESTS'}; + + for my $l (@trials) { # Ordered to free memory my $a = 'a' x $l; my $message = "Long monster, length = $l"; like("ba$a=", qr/a$a=/, $message); @@ -344,10 +371,9 @@ sub run_tests { } } - { - # 20000 nodes, each taking 3 words per string, and 1 per branch - my $long_constant_len = join '|', 12120 .. 32645; - my $long_var_len = join '|', 8120 .. 28645; + SKIP: + { # 20000 nodes, each taking 3 words per string, and 1 per branch + my %ans = ( 'ax13876y25677lbc' => 1, 'ax13876y25677mcb' => 0, # not b. 'ax13876y35677nbc' => 0, # Num too big @@ -358,6 +384,11 @@ sub run_tests { 'ax13876y25677y21378y21378y21378kbc' => 0, # 5 runs ); + skip('limited memory', 2 * scalar keys %ans) if $ENV{'PERL_SKIP_BIG_MEM_TESTS'}; + + my $long_constant_len = join '|', 12120 .. 32645; + my $long_var_len = join '|', 8120 .. 28645; + for (keys %ans) { my $message = "20000 nodes, const-len '$_'"; ok !($ans{$_} xor /a(?=([yx]($long_constant_len)){2,4}[k-o]).*b./o), $message; @@ -372,7 +403,7 @@ sub run_tests { $_ = " a (bla()) and x(y b((l)u((e))) and b(l(e)e)e"; my $expect = "(bla()) ((l)u((e))) (l(e)e)"; - use vars '$c'; + our $c; sub matchit { m/ ( @@ -719,7 +750,7 @@ sub run_tests { is($#+, 2, $message); is($#-, 1, $message); - # Check that values don’t stick + # Check that values don't stick " "=~/()()()(.)(..)/; my($m,$p,$q) = (\$-[5], \$+[5], \${^CAPTURE}[4]); () = "$$_" for $m, $p, $q; # FETCH (or eqv.) @@ -1330,6 +1361,7 @@ EOP unlike "\x{100}", qr/(?i:\w)/, "(?i: shouldn't lose the passed in /a"; use re '/aa'; unlike 'k', qr/(?i:\N{KELVIN SIGN})/, "(?i: shouldn't lose the passed in /aa"; + unlike 'k', qr'(?i:\N{KELVIN SIGN})', "(?i: shouldn't lose the passed in /aa"; } { @@ -1394,12 +1426,96 @@ EOP { # Various flags weren't being set when a [] is optimized into an # EXACTish node - ; - ; - my $sharp_s = uni_to_native("\xdf"); ok("\x{017F}\x{017F}" =~ qr/^[$sharp_s]?$/i, "[] to EXACTish optimization"); } + { # Test that it avoids spllitting a multi-char fold across nodes. + # These all fold to things that are like 'ss', which, if split across + # nodes could fail to match a single character that folds to the + # combination. 1F0 byte expands when folded; + my $utf8_locale = find_utf8_ctype_locale(); + for my $char('F', $sharp_s, "\x{1F0}", "\x{FB00}") { + my $length = 260; # Long enough to overflow an EXACTFish regnode + my $p = $char x $length; + my $s = ($char eq $sharp_s) ? 'ss' + : $char eq "\x{1F0}" + ? "j\x{30c}" + : 'ff'; + $s = $s x $length; + for my $charset (qw(u d l aa)) { + for my $utf8 (0..1) { + for my $locale ('C', $utf8_locale) { + SKIP: + { + skip "test skipped for non-C locales", 2 + if $charset ne 'l' + && (! defined $locale || $locale ne 'C'); + if ($charset eq 'l') { + if (! defined $locale) { + skip "No UTF-8 locale", 2; + } + skip "Can't test in miniperl",2 + if is_miniperl(); + + require POSIX; + POSIX::setlocale(&LC_CTYPE, $locale); + } + + my $pat = $p; + utf8::upgrade($pat) if $utf8; + my $should_pass = + ( $charset eq 'u' + || ($charset eq 'd' && $utf8) + || ($charset eq 'd' && ( $char =~ /[[:ascii:]]/ + || ord $char > 255)) + || ($charset eq 'aa' && $char =~ /[[:ascii:]]/) + || ($charset eq 'l' && $locale ne 'C') + || ($charset eq 'l' && $char =~ /[[:ascii:]]/) + ); + my $name = "(?i$charset), utf8=$utf8, locale=$locale," + . " char=" . sprintf "%x", ord $char; + no warnings 'locale'; + is (eval " '$s' =~ qr/(?i$charset)$pat/;", + $should_pass, $name); + fail "$name: $@" if $@; + is (eval " 'a$s' =~ qr/(?i$charset)a$pat/;", + $should_pass, "extra a, $name"); + fail "$name: $@" if $@; + } + } + } + } + } + } + + SKIP: + { + skip "no re debug", 5 if is_miniperl; + my $s = ("0123456789" x 26214) x 2; # Should fill 2 LEXACTS, plus + # small change + my $pattern_prefix = "use utf8; use re qw(Debug COMPILE)"; + my $pattern = "$pattern_prefix; qr/$s/;"; + my $result = fresh_perl($pattern); + if ($? != 0) { # Re-run so as to display STDERR. + fail($pattern); + fresh_perl($pattern, { stderr => 0, verbose => 1 }); + } + like($result, qr/Final program[^X]*\bLEXACT\b[^X]*\bLEXACT\b[^X]*\bEXACT\b[^X]*\bEND\b/s, + "Check that LEXACT nodes are generated"); + like($s, qr/$s/, "Check that LEXACT nodes match"); + like("a$s", qr/a$s/, "Previous test preceded by an 'a'"); + substr($s, 260000, 1) = "\x{100}"; + $pattern = "$pattern_prefix; qr/$s/;"; + $result = fresh_perl($pattern, { 'wide_chars' => 1 } ); + if ($? != 0) { # Re-run so as to display STDERR. + fail($pattern); + fresh_perl($pattern, { stderr => 0, verbose => 1 }); + } + like($result, qr/Final program[^X]*\bLEXACT_REQ8\b[^X]*\bLEXACT\b[^X]*\bEXACT\b[^X]*\bEND\b/s, + "Check that an LEXACT_ONLY node is generated with a \\x{100}"); + like($s, qr/$s/, "Check that LEXACT_REQ8 nodes match"); + } + { for my $char (":", uni_to_native("\x{f7}"), "\x{2010}") { my $utf8_char = $char; @@ -1448,7 +1564,17 @@ EOP # test that this is true for 1..100 # Note that this test causes the engine to recurse at runtime, and # hence use a lot of C stack. + + # Compiling for all 100 nested captures blows the stack under + # clang and ASan; reduce. + my $max_captures = $Config{ccflags} =~ /sanitize/ ? 20 : 100; + for my $i (1..100) { + if ($i > $max_captures) { + pass("skipping $i buffers under ASan aa"); + pass("skipping $i buffers under ASan aba"); + next; + } my $capture= "a"; $capture= "($capture)" for 1 .. $i; for my $mid ("","b") { @@ -1630,26 +1756,6 @@ EOP like("X", qr/$x/, "UTF-8 of /[x]/i matches upper case"); } - { # make sure we get an error when \p{} cannot load Unicode tables - fresh_perl_like(<<' prog that cannot load uni tables', - BEGIN { - @INC = '../lib'; - require utf8; require 'utf8_heavy.pl'; - @INC = (); - } - $name = 'A B'; - if ($name =~ /(\p{IsUpper}) (\p{IsUpper})/){ - print "It's good! >$1< >$2<\n"; - } else { - print "It's not good...\n"; - } - prog that cannot load uni tables - qr/^Can't locate unicore\/Heavy\.pl(?x: - )|^Can't find Unicode property definition/, - undef, - '\p{} should not fail silently when uni tables evanesce'); - } - { # Special handling of literal-ended ranges in [...] was breaking this use utf8; like("ÿ", qr/[ÿ-ÿ]/, "\"ÿ\" should match [ÿ-ÿ]"); @@ -1711,9 +1817,10 @@ EOP "test that we handle things like m/\\888888888/ without infinite loops" ); } + SKIP: { # Test that we handle some malformed UTF-8 without looping [perl # #123562] - + skip "no Encode", 1 if is_miniperl; my $code=' BEGIN{require q(./test.pl);} use Encode qw(_utf8_on); @@ -1814,11 +1921,6 @@ EOP ok($AE =~ $re, '/[\xE6\s]/i matches \xC6 when in UTF-8'); } - { # [perl #126606 crashed the interpreter - no warnings 'deprecated'; - like("sS", qr/\N{}Ss|/i, "\N{} with empty branch alternation works"); - } - { is(0+("\n" =~ m'\n'), 1, q|m'\n' should interpolate escapes|); } @@ -1860,6 +1962,26 @@ EOF_CODE like($got[5],qr/Error: Infinite recursion via empty pattern/, "empty pattern in regex codeblock: produced the right exception message" ); } + + # This test is based on the one directly above, which happened to + # leak. Repeat the test, but stripped down to the bare essentials + # of the leak, which is to die while executing a regex which is + # already the current regex, thus causing the saved outer set of + # capture offsets to leak. The test itself doesn't do anything + # except sit around hoping not to be triggered by ASan + { + eval { + my $s = "abcd"; + $s =~ m{([abcd]) (?{ die if $1 eq 'd'; })}gx; + $s =~ //g; + $s =~ //g; + $s =~ //g; + }; + pass("call to current regex doesn't leak"); + } + + + { # [perl #130495] /x comment skipping stopped a byte short, leading # to assertion failure or 'malformed utf-8 character" warning @@ -1903,6 +2025,10 @@ EOP } { # buffer overflow + + # This test also used to leak - fixed by the commit which added + # this line. + fresh_perl_is("BEGIN{\$^H=0x200000}\ns/[(?{//xx", "Unmatched [ in regex; marked by <-- HERE in m/[ <-- HERE (?{/ at (eval 1) line 1.\n", {}, "buffer overflow for regexp component"); @@ -1911,6 +2037,203 @@ EOP # [perl #129281] buffer write overflow, detected by ASAN, valgrind fresh_perl_is('/0(?0)|^*0(?0)|^*(^*())0|/', '', {}, "don't bump whilem_c too much"); } + { + # RT #131893 - fails with ASAN -fsanitize=undefined + fresh_perl_is('qr/0(0?(0||00*))|/', '', {}, "integer overflow during compilation"); + } + + { + # RT #131575 intuit skipping back from the end to find the highest + # possible start point, was potentially hopping back beyond pos() + # and crashing by calling fbm_instr with a negative length + + my $text = "=t=\x{5000}"; + pos($text) = 3; + ok(scalar($text !~ m{(~*=[a-z]=)}g), "RT #131575"); + } + { + fresh_perl_is('"AA" =~ m/AA{1,0}/','',{},"handle OPFAIL insert properly"); + } + { + fresh_perl_is('$_="0\x{1000000}";/^000?\0000/','',{},"dont throw assert errors trying to fbm past end of string"); + } + { # [perl $132227] + fresh_perl_is("('0ba' . ('ss' x 300)) =~ m/0B\\N{U+41}" . $sharp_s x 150 . '/i and print "1\n"', 1,{},"Use of sharp s under /di that changes to /ui"); + + # A variation, but as far as khw knows not part of 132227 + fresh_perl_is("'0bssa' =~ m/0B" . $sharp_s . "\\N{U+41}" . '/i and print "1\n"', 1,{},"Use of sharp s under /di that changes to /ui"); + } + { # [perl $132164] + fresh_perl_is('m m0*0+\Rm', "",{},"Undefined behavior in address sanitizer"); + } + { # [perl #133642] + fresh_perl_is('no warnings "experimental::vlb"; + m/((?<=(0?)))/', "",{},"Was getting 'Double free'"); + } + { # [perl #133782] + # this would panic on DEBUGGING builds + fresh_perl_is(<<'CODE', "ok\nok\n",{}, 'Bad length magic was left on $^R'); +while( "\N{U+100}bc" =~ /(..?)(?{$^N})/g ) { + print "ok\n" if length($^R)==length("$^R"); +} +CODE + } + { # [perl #133871], ASAN/valgrind out-of-bounds access + fresh_perl_like('qr/(?|(())|())|//', qr/syntax error/, {}, "[perl #133871]"); + } + { # [perl #133871], ASAN/valgrind out-of-bounds access + fresh_perl_like('qr/\p{nv:NAnq}/', qr/Can't find Unicode property definition/, {}, "GH #17367"); + } + + SKIP: + { # [perl #133921], segfault + skip "Not valid for EBCDIC", 5 if $::IS_EBCDIC; + + fresh_perl_is('qr0||ß+p00000F00000ù\Q00000ÿ00000x00000x0c0e0\Qx0\Qx0\x{0c!}\;\;î0\xÿÿÿþù\Q`\Qx`{0c!}e;ù\ò`\Qm`\x{0c!}\;\;îçÿ ç!F/;îçÿù\Qxÿÿÿÿù`x{0c!}e;ù\Q`\Qx`\x{c!}\;\;îç!}\;îçÿù\Q‡ \xÿÿÿÿ>=\Qx`\Qx`ù\ò`\Qx`\x{0c!};\;îçÿ Fn0t0c €d;t ù ç€!00000000000000000000000m/0000000000000000000000000000000m/\x{){} )|i', "", {}, "[perl #133921]"); + fresh_perl_is('|ß+W0ü0r0\Qx0\Qx0x0c0G00000000000000000O000000000x0x0x0c!}\;îçÿù\Q0 \xÿÿÿÿù\Q`\Qx`{0d ;ù\ò`\Qm`\x{0c!}\;\;îçÿ ç!F/;îçÿù\Qxÿÿÿÿù`x{0c!};ù\Q`\Qq`\x{c!}\;\;îç!}\;îçÿù\Q‡ \xÿÿÿÿ>=\Qx`\Qx`ù\ò`\Qx`\x{0c!};\;îçÿ 0000000Fm0t0c €d;t ù ç€!00000000000000000000000m/0000000000000000000000000000000m/\x{){} )|i', "", {}, "[perl #133921]"); + +fresh_perl_is('s|ß+W0ü0f0\Qx0\Qx0x0c0G0xgive0000000000000O0h000x0 \xòÿÿÿù\Q`\Q + + + + + ç + + + + + + + + + + + + + + + +x{0c!}\;\;çÿ q0/i0/!F/;îçÿù\Qxÿÿÿÿù`x{0c!}e;ù\Q`\Qx`\x{0c!}\;ÿÿÿÿ!}\;îçÿù\Q‡\xÿÿÿÿ>=\Qx`\Qx`ù\ò`ÿ>=\Qx`\Qx`ù\ò`\Qx`\x{0c!};\;îçÿ u00000F000t0p €d? ù ç€!00000000000000000000000m/0000000000000000000000000000000m/0\} )|i', "", {}, "[perl #133921]"); + + fresh_perl_is('a aúúv sWtrt\ó||ß+Wüefù\Qx`\Qx`\x{1c!gGnuc given1111111111111O1111each111\jx` \xòÿÿÿù\Qx`\Q + + + + + + ç + + + + + + + + + + + + + + + +x{1c!}\;\;îçÿp qr/elsif/!eF/;îçÿù\QxÿÿÿÿùHQx`Lx{1c!}e;ù\Qx`\Qx`\x{1c!}\;ÿÿÿÿc!}\;îçÿù\Qx‡\xÿÿÿÿ>=\Qx`\Qx`ù\òx`ÿ>=\Qx`\Qx`ù\òx`\Qx`\x{1c!}8;\;îçÿp unshifteFnormat0cmp €d?not ùp ç€!0000000000000000000000000m/00000000000000000000000000000000m/0R\} )|\aï||K??p€¿ÿÿfúd{\{gri{\x{1x/} ð¹NuntiÀh', "", {}, "[perl #133921]"); + + fresh_perl_is('s|ß+W0ü0f0\Qx0\Qx0x0c0g0c 000n0000000000000O0h000x0 \xòÿÿÿù\Q`\Q + + + + + + ç + + + + + + + + + + + + + + + +x{0c!}\;\;îçÿ /0f/!F/;îçÿù\Qxÿÿÿÿù`x{0c!};ù\Q`\Qx`\x{0c!}\;ÿÿÿÿ!}\;îçÿù\Q‡\xÿÿÿÿ>=\Qx`\Qx`ù\ò`ÿ>=\Qx`\Qx`ù\ò`\Qx`\x{0c!};\;îçÿ 000t0F000t0p €d?n ù ç€!00000000000000000000000m/0000000000000000000000000000000m/\} )|i', "", {}, "[perl #133933]"); + } + + { # perl #133998] + fresh_perl_is('print "\x{110000}" =~ qr/(?l)|[^\S\pC\s]/', 1, {}, + '/[\S\s]/l works'); + } + + { # perl #133995] + use utf8; + fresh_perl_is('"έδωσαν ελληνικήვე" =~ m/[^0](?=0)0?/', "", + {wide_chars => 1}, + '[^0] doesnt crash on UTF-8 target string'); + } + + { # [perl #133992] This is a tokenizer bug of parsing a pattern + fresh_perl_is(q:$z = do { + use utf8; + "q!тест! =~ m'" + }; + $z .= 'è(?#„'; + $z .= "'"; + eval $z;:, "", {}, 'foo'); + } + + { # [perl #134325] + my $quote="\\Q"; + my $back="\\\\"; + my $ff="\xff"; + my $s = sprintf "/\\1|(|%s)%s%s /i", + $quote x 8 . $back x 69, + $quote x 5 . $back x 4, + $ff x 48; + like(runperl(prog => "$s", stderr => 1), qr/Unmatched \(/); + } + + { # GitHub #17196, caused assertion failure + fresh_perl_like('("0" x 258) =~ /(?l)0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000/', + qr/^$/, + {}, + "Assertion failure with /l exact string longer than a single node"); + } + +SKIP: + { # [perl #134334], Assertion failure + my $utf8_locale = find_utf8_ctype_locale(); + skip "no UTF-8 locale available" unless $utf8_locale; + fresh_perl_like("use POSIX; POSIX::setlocale(&LC_CTYPE, '$utf8_locale'); 'ssss' =~ /\xDF+?sX/il;", + qr/^$/, + {}, + "Assertion failure matching /il on single char folding to multi"); + } + + { # Test ANYOFHs + my $pat = qr/[\x{4000001}\x{4000003}\x{4000005}]+/; + unlike("\x{4000000}", $pat, "4000000 isn't in pattern"); + like("\x{4000001}", $pat, "4000001 is in pattern"); + unlike("\x{4000002}", $pat, "4000002 isn't in pattern"); + like("\x{4000003}", $pat, "4000003 is in pattern"); + unlike("\x{4000004}", $pat, "4000004 isn't in pattern"); + like("\x{4000005}", $pat, "4000005 is in pattern"); + unlike("\x{4000006}", $pat, "4000006 isn't in pattern"); + + # gh #17319 + $pat = qr/[\N{U+200D}\N{U+2000}]()/; + unlike("\x{1FFF}", $pat, "1FFF isn't in pattern"); + like("\x{2000}", $pat, "2000 is in pattern"); + unlike("\x{2001}", $pat, "2001 isn't in pattern"); + unlike("\x{200C}", $pat, "200C isn't in pattern"); + like("\x{200D}", $pat, "200 is in pattern"); + unlike("\x{200E}", $pat, "200E isn't in pattern"); + } + } # End of sub run_tests 1;