t/re/pat_rt_report.t

   1 #!./perl
   2 #
   3 # This is a home for regular expression tests that don't fit into
   4 # the format supported by re/regexp.t.  If you want to add a test
   5 # that does fit that format, add it to re/re_tests, not here.
   6
   7 use strict;
   8 use warnings;
   9 use 5.010;
  10 use Config;
  11
  12 sub run_tests;
  13
  14 $| = 1;
  15
  16
  17 BEGIN {
  18     chdir 't' if -d 't';
  19     @INC = ('../lib','.');
  20     require './test.pl';
  21     skip_all_if_miniperl("miniperl can't load Tie::Hash::NamedCapture, need for %+ and %-");
  22 }
  23
  24
  25 plan tests => 2527;  # Update this when adding/deleting tests.
  26
  27 run_tests() unless caller;
  28
  29 #
  30 # Tests start here.
  31 #
  32 sub run_tests {
  33
  34     like("A \x{263a} B z C", qr/A . B (??{ "z" }) C/,
  35          "Match UTF-8 char in presence of (??{ }); Bug 20000731.001");
  36
  37     {
  38         no warnings 'uninitialized';
  39         ok(undef =~ /^([^\/]*)(.*)$/, "Used to cause a SEGV; Bug 20001021.005");
  40     }
  41
  42     {
  43         my $message = 'bug id 20001008.001';
  44
  45         my @x = ("stra\337e 138", "stra\337e 138");
  46         for (@x) {
  47             ok(s/(\d+)\s*([\w\-]+)/$1 . uc $2/e, $message);
  48             ok(my ($latin) = /^(.+)(?:\s+\d)/, $message);
  49             is($latin, "stra\337e", $message);
  50             ok($latin =~ s/stra\337e/straße/, $message);
  51             #
  52             # Previous code follows, but outcommented - there were no tests.
  53             #
  54             # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
  55             # use utf8; # needed for the raw UTF-8
  56             # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a
  57         }
  58     }
  59
  60     {
  61         # Fist half of the bug.
  62         my $message = 'HEBREW ACCENT QADMA matched by .*; Bug 20001028.003';
  63         my $X = chr (1448);
  64         ok(my ($Y) = $X =~ /(.*)/, $message);
  65         is($Y, v1448, $message);
  66         is(length $Y, 1, $message);
  67
  68         # Second half of the bug.
  69         $message = 'HEBREW ACCENT QADMA in replacement; Bug 20001028.003';
  70         $X = '';
  71         $X =~ s/^/chr(1488)/e;
  72         is(length $X, 1, $message);
  73         is(ord $X, 1488, $message);
  74     }
  75
  76     {
  77         my $message = 'Repeated s///; Bug 20001108.001';
  78         my $X = "Szab\x{f3},Bal\x{e1}zs";
  79         my $Y = $X;
  80         $Y =~ s/(B)/$1/ for 0 .. 3;
  81         is($Y, $X, $message);
  82         is($X, "Szab\x{f3},Bal\x{e1}zs", $message);
  83     }
  84
  85     {
  86         my $message = 's/// on UTF-8 string; Bug 20000517.001';
  87         my $x = "\x{100}A";
  88         $x =~ s/A/B/;
  89         is($x, "\x{100}B", $message);
  90         is(length $x, 2, $message);
  91     }
  92
  93     {
  94         my $message = '\C and É; Bug 20001230.002';
  95         ok("École" =~ /^\C\C(.)/ && $1 eq 'c', $message);
  96         like("École", qr/^\C\C(c)/, $message);
  97     }
  98
  99     {
 100         # The original bug report had 'no utf8' here but that was irrelevant.
 101
 102         my $message = "Don't dump core; Bug 20010306.008";
 103         my $a = "a\x{1234}";
 104         like($a, qr/\w/, $message);  # used to core dump.
 105     }
 106
 107     {
 108         my $message = '/g in scalar context; Bug 20010410.006';
 109         for my $rx ('/(.*?)\{(.*?)\}/csg',
 110                     '/(.*?)\{(.*?)\}/cg',
 111                     '/(.*?)\{(.*?)\}/sg',
 112                     '/(.*?)\{(.*?)\}/g',
 113                     '/(.+?)\{(.+?)\}/csg',) {
 114             my $i = 0;
 115             my $input = "a{b}c{d}";
 116             eval <<"            --";
 117                 while (eval \$input =~ $rx) {
 118                     \$i ++;
 119                 }
 120             --
 121             is($i, 2, $message);
 122         }
 123     }
 124
 125     {
 126         # Amazingly vertical tabulator is the same in ASCII and EBCDIC.
 127         for ("\n", "\t", "\014", "\r") {
 128             unlike($_, qr/[[:print:]]/, sprintf "\\%03o not in [[:print:]]; Bug 20010619.003", ord $_);
 129         }
 130         for (" ") {
 131             like($_, qr/[[:print:]]/, "'$_' in [[:print:]]; Bug 20010619.003");
 132         }
 133     }
 134
 135     {
 136         # [ID 20010814.004] pos() doesn't work when using =~m// in list context
 137
 138         $_ = "ababacadaea";
 139         my $a = join ":", /b./gc;
 140         my $b = join ":", /a./gc;
 141         my $c = pos;
 142         is("$a $b $c", 'ba:ba ad:ae 10', "pos() works with () = m//; Bug 20010814.004");
 143     }
 144
 145     {
 146         # [ID 20010407.006] matching utf8 return values from
 147         # functions does not work
 148
 149         my $message = 'UTF-8 return values from functions; Bug 20010407.006';
 150         package ID_20010407_006;
 151         sub x {"a\x{1234}"}
 152         my $x = x;
 153         my $y;
 154       ::ok($x =~ /(..)/, $message);
 155         $y = $1;
 156       ::ok(length ($y) == 2 && $y eq $x, $message);
 157       ::ok(x =~ /(..)/, $message);
 158         $y = $1;
 159       ::ok(length ($y) == 2 && $y eq $x, $message);
 160     }
 161
 162     {
 163         # High bit bug -- japhy
 164         my $x = "ab\200d";
 165         ok $x =~ /.*?\200/, "High bit fine";
 166     }
 167
 168     {
 169         my $message = 'UTF-8 hash keys and /$/';
 170         # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters
 171         #                                         /2002-01/msg01327.html
 172
 173         my $u = "a\x{100}";
 174         my $v = substr ($u, 0, 1);
 175         my $w = substr ($u, 1, 1);
 176         my %u = ($u => $u, $v => $v, $w => $w);
 177         for (keys %u) {
 178             my $m1 =            /^\w*$/ ? 1 : 0;
 179             my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0;
 180             is($m1, $m2, $message);
 181         }
 182     }
 183
 184     {
 185         my $message = "s///eg [change 13f46d054db22cf4]; Bug 20020124.005";
 186
 187         for my $char ("a", "\x{df}", "\x{100}") {
 188             my $x = "$char b $char";
 189             $x =~ s{($char)}{
 190                   "c" =~ /c/;
 191                   "x";
 192             }ge;
 193             is(substr ($x, 0, 1), substr ($x, -1, 1), $message);
 194         }
 195     }
 196
 197     {
 198         my $message = "Correct pmop flags checked when empty pattern; Bug 20020412.005";
 199
 200         # Requires reuse of last successful pattern.
 201         my $num = 123;
 202         $num =~ /\d/;
 203         for (0 .. 1) {
 204             my $match = m?? + 0;
 205             ok($match != $_, $message)
 206                 or diag(sprintf "'match one' %s on %s iteration" =>
 207                         $match ? 'succeeded' : 'failed',
 208                         $_     ? 'second'    : 'first');
 209         }
 210         $num =~ /(\d)/;
 211         my $result = join "" => $num =~ //g;
 212         is($result, $num, $message);
 213     }
 214
 215     {
 216         my $message = 'UTF-8 regex matches above 32k; Bug 20020630.002';
 217         for (['byte', "\x{ff}"], ['utf8', "\x{1ff}"]) {
 218             my ($type, $char) = @$_;
 219             for my $len (32000, 32768, 33000) {
 220                 my  $s = $char . "f" x $len;
 221                 my  $r = $s =~ /$char([f]*)/gc;
 222                 ok($r, $message) or diag("<$type x $len>");
 223                 ok(!$r || pos ($s) == $len + 1, $message)
 224                     or diag("<$type x $len>; pos = @{[pos $s]}");
 225             }
 226         }
 227     }
 228
 229     {
 230         my $s = "\x{100}" x 5;
 231         my $ok = $s =~ /(\x{100}{4})/;
 232         my ($ord, $len) = (ord $1, length $1);
 233         ok $ok && $ord == 0x100 && $len == 4, "No panic: end_shift [change 0e933229fa758625]";
 234     }
 235
 236     {
 237         our $a = "x\x{100}";
 238         chop $a;    # Leaves the UTF-8 flag
 239         $a .= "y";  # 1 byte before 'y'.
 240
 241         like($a, qr/^\C/,        'match one \C on 1-byte UTF-8; Bug 15763');
 242         like($a, qr/^\C{1}/,     'match \C{1}; Bug 15763');
 243
 244         like($a, qr/^\Cy/,       'match \Cy; Bug 15763');
 245         like($a, qr/^\C{1}y/,    'match \C{1}y; Bug 15763');
 246
 247         unlike($a, qr/^\C\Cy/,     q {don't match two \Cy; Bug 15763});
 248         unlike($a, qr/^\C{2}y/,    q {don't match \C{2}y; Bug 15763});
 249
 250         $a = "\x{100}y"; # 2 bytes before "y"
 251
 252         like($a, qr/^\C/,        'match one \C on 2-byte UTF-8; Bug 15763');
 253         like($a, qr/^\C{1}/,     'match \C{1}; Bug 15763');
 254         like($a, qr/^\C\C/,      'match two \C; Bug 15763');
 255         like($a, qr/^\C{2}/,     'match \C{2}; Bug 15763');
 256
 257         like($a, qr/^\C\C\C/,    'match three \C on 2-byte UTF-8 and a byte; Bug 15763');
 258         like($a, qr/^\C{3}/,     'match \C{3}; Bug 15763');
 259
 260         like($a, qr/^\C\Cy/,     'match two \C; Bug 15763');
 261         like($a, qr/^\C{2}y/,    'match \C{2}; Bug 15763');
 262
 263         unlike($a, qr/^\C\C\Cy/,   q {don't match three \Cy; Bug 15763});
 264         unlike($a, qr/^\C{2}\Cy/,  q {don't match \C{2}\Cy; Bug 15763});
 265         unlike($a, qr/^\C{3}y/,    q {don't match \C{3}y; Bug 15763});
 266
 267         $a = "\x{1000}y"; # 3 bytes before "y"
 268
 269         like($a, qr/^\C/,        'match one \C on three-byte UTF-8; Bug 15763');
 270         like($a, qr/^\C{1}/,     'match \C{1}; Bug 15763');
 271         like($a, qr/^\C\C/,      'match two \C; Bug 15763');
 272         like($a, qr/^\C{2}/,     'match \C{2}; Bug 15763');
 273         like($a, qr/^\C\C\C/,    'match three \C; Bug 15763');
 274         like($a, qr/^\C{3}/,     'match \C{3}; Bug 15763');
 275
 276         like($a, qr/^\C\C\C\C/,  'match four \C on three-byte UTF-8 and a byte; Bug 15763');
 277         like($a, qr/^\C{4}/,     'match \C{4}; Bug 15763');
 278
 279         like($a, qr/^\C\C\Cy/,   'match three \Cy; Bug 15763');
 280         like($a, qr/^\C{3}y/,    'match \C{3}y; Bug 15763');
 281
 282         unlike($a, qr/^\C\C\C\Cy/, q {don't match four \Cy; Bug 15763});
 283         unlike($a, qr/^\C{4}y/,    q {don't match \C{4}y; Bug 15763});
 284     }
 285
 286
 287     {
 288         my $message = 'UTF-8 matching; Bug 15397';
 289         like("\x{100}", qr/\x{100}/, $message);
 290         like("\x{100}", qr/(\x{100})/, $message);
 291         like("\x{100}", qr/(\x{100}){1}/, $message);
 292         like("\x{100}\x{100}", qr/(\x{100}){2}/, $message);
 293         like("\x{100}\x{100}", qr/(\x{100})(\x{100})/, $message);
 294     }
 295
 296     {
 297         my $message = 'Neither ()* nor ()*? sets $1 when matched 0 times; Bug 7471';
 298         local $_       = 'CD';
 299         ok(/(AB)*?CD/ && !defined $1, $message);
 300         ok(/(AB)*CD/  && !defined $1, $message);
 301     }
 302
 303     {
 304         my $message = "Caching shouldn't prevent match; Bug 3547";
 305         my $pattern = "^(b+?|a){1,2}c";
 306         ok("bac"    =~ /$pattern/ && $1 eq 'a', $message);
 307         ok("bbac"   =~ /$pattern/ && $1 eq 'a', $message);
 308         ok("bbbac"  =~ /$pattern/ && $1 eq 'a', $message);
 309         ok("bbbbac" =~ /$pattern/ && $1 eq 'a', $message);
 310     }
 311
 312     {
 313         ok("\x{100}" =~ /(.)/, '$1 should keep UTF-8 ness; Bug 18232');
 314         is($1, "\x{100}",  '$1 is UTF-8; Bug 18232');
 315         { 'a' =~ /./; }
 316         is($1, "\x{100}",  '$1 is still UTF-8; Bug 18232');
 317         isnt($1, "\xC4\x80", '$1 is not non-UTF-8; Bug 18232');
 318     }
 319
 320     {
 321         my $message = "Optimizer doesn't prematurely reject match; Bug 19767";
 322         use utf8;
 323
 324         my $attr = 'Name-1';
 325         my $NormalChar      = qr /[\p{IsDigit}\p{IsLower}\p{IsUpper}]/;
 326         my $NormalWord      = qr /${NormalChar}+?/;
 327         my $PredNameHyphen  = qr /^${NormalWord}(\-${NormalWord})*?$/;
 328
 329         $attr =~ /^$/;
 330         like($attr, $PredNameHyphen, $message);  # Original test.
 331
 332         "a" =~ m/[b]/;
 333         like("0", qr/\p{N}+\z/, $message);         # Variant.
 334     }
 335
 336     {
 337         my $message = "(??{ }) doesn't return stale values; Bug 20683";
 338         our $p = 1;
 339         foreach (1, 2, 3, 4) {
 340             $p ++ if /(??{ $p })/
 341         }
 342         is($p, 5, $message);
 343
 344         {
 345             package P;
 346             $a = 1;
 347             sub TIESCALAR {bless []}
 348             sub FETCH     {$a ++}
 349         }
 350         tie $p, "P";
 351         foreach (1, 2, 3, 4) {
 352             /(??{ $p })/
 353         }
 354         is($p, 5, $message);
 355     }
 356
 357     {
 358         # Subject: Odd regexp behavior
 359         # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>
 360         # Date: Wed, 26 Feb 2003 16:53:12 +0000
 361         # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk>
 362         # To: perl-unicode@perl.org
 363
 364         my $message = 'Markus Kuhn 2003-02-26';
 365
 366         my $x = "\x{2019}\nk";
 367         ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message);
 368         is($x, "\x{2019} k", $message);
 369
 370         $x = "b\nk";
 371         ok($x =~ s/(\S)\n(\S)/$1 $2/sg, $message);
 372         is($x, "b k", $message);
 373
 374         like("\x{2019}", qr/\S/, $message);
 375     }
 376
 377     {
 378         my $message = "(??{ .. }) in split doesn't corrupt its stack; Bug 21411";
 379         our $i;
 380         is('-1-3-5-', join('', split /((??{$i++}))/, '-1-3-5-'), $message);
 381         no warnings 'syntax';
 382         @_ = split /(?{'WOW'})/, 'abc';
 383         local $" = "|";
 384         is("@_", "a|b|c", $message);
 385     }
 386
 387     is(join('-', split /(?{ split "" })/, "abc"), 'a-b-c', 'nested split');
 388
 389     {
 390         $_ = "code:   'x' { '...' }\n"; study;
 391         my @x; push @x, $& while m/'[^\']*'/gx;
 392         local $" = ":";
 393         is("@x", "'x':'...'", "Parse::RecDescent triggered infinite loop; Bug 17757");
 394     }
 395
 396     {
 397         sub func ($) {
 398             ok("a\nb" !~ /^b/,  "Propagated modifier; $_[0]; Bug 22354");
 399             ok("a\nb" =~ /^b/m, "Propagated modifier; $_[0] - with /m; Bug 22354");
 400         }
 401         func "standalone";
 402         $_ = "x"; s/x/func "in subst"/e;
 403         $_ = "x"; s/x/func "in multiline subst"/em;
 404         $_ = "x"; /x(?{func "in regexp"})/;
 405         $_ = "x"; /x(?{func "in multiline regexp"})/m;
 406     }
 407
 408     {
 409         $_    = "abcdef\n";
 410         my @x = m/./g;
 411         is("abcde", $`, 'Global match sets $`; Bug 19049');
 412     }
 413
 414     {
 415         # [perl #23769] Unicode regex broken on simple example
 416         # regrepeat() didn't handle UTF-8 EXACT case right.
 417
 418         my $Mess       = 'regrepeat() handles UTF-8 EXACT case right';
 419         my $message = "$Mess; Bug 23769";
 420
 421         my $s = "\x{a0}\x{a0}\x{a0}\x{100}"; chop $s;
 422
 423         like($s, qr/\x{a0}/, $message);
 424         like($s, qr/\x{a0}+/, $message);
 425         like($s, qr/\x{a0}\x{a0}/, $message);
 426
 427         $message = "$Mess (easy variant); Bug 23769";
 428         ok("aaa\x{100}" =~ /(a+)/, $message);
 429         is($1, "aaa", $message);
 430
 431         $message = "$Mess (easy invariant); Bug 23769";
 432         ok("aaa\x{100}     " =~ /(a+?)/, $message);
 433         is($1, "a", $message);
 434
 435         $message = "$Mess (regrepeat variant); Bug 23769";
 436         ok("\xa0\xa0\xa0\x{100}    " =~ /(\xa0+?)/, $message);
 437         is($1, "\xa0", $message);
 438
 439         $message = "$Mess (regrepeat invariant); Bug 23769";
 440         ok("\xa0\xa0\xa0\x{100}" =~ /(\xa0+)/, $message);
 441         is($1, "\xa0\xa0\xa0", $message);
 442
 443         $message = "$Mess (hard variant); Bug 23769";
 444         ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+?)/, $message);
 445         is($1, "\xa0\xa1", $message);
 446
 447         $message = "$Mess (hard invariant); Bug 23769";
 448         ok("ababab\x{100}  " =~ /((?:ab)+)/, $message);
 449         is($1, 'ababab', $message);
 450
 451         ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+)/, $message);
 452         is($1, "\xa0\xa1\xa0\xa1\xa0\xa1", $message);
 453
 454         ok("ababab\x{100}  " =~ /((?:ab)+?)/, $message);
 455         is($1, "ab", $message);
 456
 457         $message = "Don't match first byte of UTF-8 representation; Bug 23769";
 458         unlike("\xc4\xc4\xc4", qr/(\x{100}+)/, $message);
 459         unlike("\xc4\xc4\xc4", qr/(\x{100}+?)/, $message);
 460         unlike("\xc4\xc4\xc4", qr/(\x{100}++)/, $message);
 461     }
 462
 463     {
 464         # perl panic: pp_match start/end pointers
 465
 466         is(eval {my ($x, $y) = "bca" =~ /^(?=.*(a)).*(bc)/; "$x-$y"}, "a-bc",
 467            'Captures can move backwards in string; Bug 25269');
 468     }
 469
 470     {
 471         # \cA not recognized in character classes
 472         like("a\cAb", qr/\cA/, '\cA in pattern; Bug 27940');
 473         like("a\cAb", qr/[\cA]/, '\cA in character class; Bug 27940');
 474         like("a\cAb", qr/[\cA-\cB]/, '\cA in character class range; Bug 27940');
 475         like("abc", qr/[^\cA-\cB]/, '\cA in negated character class range; Bug 27940');
 476         like("a\cBb", qr/[\cA-\cC]/, '\cB in character class range; Bug 27940');
 477         like("a\cCbc", qr/[^\cA-\cB]/, '\cC in negated character class range; Bug 27940');
 478         like("a\cAb", qr/(??{"\cA"})/, '\cA in ??{} pattern; Bug 27940');
 479         unlike("ab", qr/a\cIb/x, '\cI in pattern; Bug 27940');
 480     }
 481
 482     {
 483         # perl #28532: optional zero-width match at end of string is ignored
 484
 485         ok("abc" =~ /^abc(\z)?/ && defined($1),
 486            'Optional zero-width match at end of string; Bug 28532');
 487         ok("abc" =~ /^abc(\z)??/ && !defined($1),
 488            'Optional zero-width match at end of string; Bug 28532');
 489     }
 490
 491     {
 492         my $utf8 = "\xe9\x{100}"; chop $utf8;
 493         my $latin1 = "\xe9";
 494
 495         like($utf8, qr/\xe9/i, "utf8/latin; Bug 36207");
 496         like($utf8, qr/$latin1/i, "utf8/latin runtime; Bug 36207");
 497         like($utf8, qr/(abc|\xe9)/i, "utf8/latin trie; Bug 36207");
 498         like($utf8, qr/(abc|$latin1)/i, "utf8/latin trie runtime; Bug 36207");
 499
 500         like("\xe9", qr/$utf8/i, "latin/utf8; Bug 36207");
 501         like("\xe9", qr/(abc|$utf8)/i, "latin/utf8 trie; Bug 36207");
 502         like($latin1, qr/$utf8/i, "latin/utf8 runtime; Bug 36207");
 503         like($latin1, qr/(abc|$utf8)/i, "latin/utf8 trie runtime; Bug 36207");
 504     }
 505
 506     {
 507         my $s = "abcd";
 508         $s =~ /(..)(..)/g;
 509         $s = $1;
 510         $s = $2;
 511         is($2, 'cd',
 512            "Assigning to original string does not corrupt match vars; Bug 37038");
 513     }
 514
 515     {
 516         {
 517             package wooosh;
 518             sub gloople {"!"}
 519         }
 520         my $aeek = bless {} => 'wooosh';
 521         is(do {$aeek -> gloople () =~ /(.)/g}, 1,
 522            "//g match against return value of sub [change e26a497577f3ce7b]");
 523
 524         sub gloople {"!"}
 525         is(do{gloople () =~ /(.)/g}, 1,
 526            "change e26a497577f3ce7b didn't affect sub calls for some reason");
 527     }
 528
 529     {
 530         # [perl #78680]
 531         # See changes 26925-26928, which reverted change 26410
 532         {
 533             package lv;
 534             our $var = "abc";
 535             sub variable : lvalue {$var}
 536         }
 537         my $o = bless [] => 'lv';
 538         my $f = "";
 539         my $r = eval {
 540             for (1 .. 2) {
 541                 $f .= $1 if $o -> variable =~ /(.)/g;
 542             }
 543             1;
 544         };
 545         if ($r) {
 546             is($f, "ab", "pos() retained between calls");
 547         }
 548         else {
 549             ok 0, "Code failed: $@";
 550         }
 551
 552         our $var = "abc";
 553         sub variable : lvalue {$var}
 554         my $g = "";
 555         my $s = eval {
 556             for (1 .. 2) {
 557                 $g .= $1 if variable =~ /(.)/g;
 558             }
 559             1;
 560         };
 561         if ($s) {
 562             is($g, "ab", "pos() retained between calls");
 563         }
 564         else {
 565             ok 0, "Code failed: $@";
 566         }
 567     }
 568
 569   SKIP:
 570     {
 571         skip "In EBCDIC" if $::IS_EBCDIC;
 572         no warnings 'utf8';
 573         $_ = pack 'U0C2', 0xa2, 0xf8;  # Ill-formed UTF-8
 574         my $ret = 0;
 575         is(do {!($ret = s/[\0]+//g)}, 1,
 576            "Ill-formed UTF-8 doesn't match NUL in class; Bug 37836");
 577     }
 578
 579     {
 580         # chr(65535) should be allowed in regexes
 581
 582         no warnings 'utf8'; # To allow non-characters
 583         my ($c, $r, $s);
 584
 585         $c = chr 0xffff;
 586         $c =~ s/$c//g;
 587         is($c, "", "U+FFFF, parsed as atom; Bug 38293");
 588
 589         $c = chr 0xffff;
 590         $r = "\\$c";
 591         $c =~ s/$r//g;
 592         is($c, "", "U+FFFF backslashed, parsed as atom; Bug 38293");
 593
 594         $c = chr 0xffff;
 595         $c =~ s/[$c]//g;
 596         is($c, "", "U+FFFF, parsed in class; Bug 38293");
 597
 598         $c = chr 0xffff;
 599         $r = "[\\$c]";
 600         $c =~ s/$r//g;
 601         is($c, "", "U+FFFF backslashed, parsed in class; Bug 38293");
 602
 603         $s = "A\x{ffff}B";
 604         $s =~ s/\x{ffff}//i;
 605         is($s, "AB", "U+FFFF, EXACTF; Bug 38293");
 606
 607         $s = "\x{ffff}A";
 608         $s =~ s/\bA//;
 609         is($s, "\x{ffff}", "U+FFFF, BOUND; Bug 38293");
 610
 611         $s = "\x{ffff}!";
 612         $s =~ s/\B!//;
 613         is($s, "\x{ffff}", "U+FFFF, NBOUND; Bug 38293");
 614     }
 615
 616     {
 617
 618         # The printing characters
 619         my @chars = ("A" .. "Z");
 620         my $delim = ",";
 621         my $size = 32771 - 4;
 622         my $str = '';
 623
 624         # Create some random junk. Inefficient, but it works.
 625         for (my $i = 0; $i < $size; $ i++) {
 626             $str .= $chars [rand @chars];
 627         }
 628
 629         $str .= ($delim x 4);
 630         my $res;
 631         my $matched;
 632         ok($str =~ s/^(.*?)${delim}{4}//s, "Pattern matches; Bug 39583");
 633         is($str, "", "Empty string; Bug 39583");
 634         ok(defined $1 && length ($1) == $size, '$1 is correct size; Bug 39583');
 635     }
 636
 637     {
 638         like("\0-A", qr/\c@-A/, '@- should not be interpolated in a pattern; Bug 27940');
 639         like("\0\0A", qr/\c@+A/, '@+ should not be interpolated in a pattern; Bug 27940');
 640         like("X\@-A", qr/X@-A/, '@- should not be interpolated in a pattern; Bug 27940');
 641         like("X\@\@A", qr/X@+A/, '@+ should not be interpolated in a pattern; Bug 27940');
 642
 643         like("X\0A", qr/X\c@?A/,  '\c@?; Bug 27940');
 644         like("X\0A", qr/X\c@*A/,  '\c@*; Bug 27940');
 645         like("X\0A", qr/X\c@(A)/, '\c@(; Bug 27940');
 646         like("X\0A", qr/X(\c@)A/, '\c@); Bug 27940');
 647         like("X\0A", qr/X\c@|ZA/, '\c@|; Bug 27940');
 648
 649         like("X\@A", qr/X@?A/,  '@?; Bug 27940');
 650         like("X\@A", qr/X@*A/,  '@*; Bug 27940');
 651         like("X\@A", qr/X@(A)/, '@(; Bug 27940');
 652         like("X\@A", qr/X(@)A/, '@); Bug 27940');
 653         like("X\@A", qr/X@|ZA/, '@|; Bug 27940');
 654
 655         local $" = ','; # non-whitespace and non-RE-specific
 656         like('abc', qr/(.)(.)(.)/, 'The last successful match is bogus; Bug 27940');
 657         like("A@+B", qr/A@{+}B/,  'Interpolation of @+ in /@{+}/; Bug 27940');
 658         like("A@-B", qr/A@{-}B/,  'Interpolation of @- in /@{-}/; Bug 27940');
 659         like("A@+B", qr/A@{+}B/x, 'Interpolation of @+ in /@{+}/x; Bug 27940');
 660         like("A@-B", qr/A@{-}B/x, 'Interpolation of @- in /@{-}/x; Bug 27940');
 661     }
 662
 663     {
 664         my $s = 'foo bar baz';
 665         my (@k, @v, @fetch, $res);
 666         my $count = 0;
 667         my @names = qw ($+{A} $+{B} $+{C});
 668         if ($s =~ /(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz)/) {
 669             while (my ($k, $v) = each (%+)) {
 670                 $count++;
 671             }
 672             @k = sort keys   (%+);
 673             @v = sort values (%+);
 674             $res = 1;
 675             push @fetch,
 676                 ["$+{A}", "$1"],
 677                 ["$+{B}", "$2"],
 678                 ["$+{C}", "$3"],
 679             ;
 680         }
 681         foreach (0 .. 2) {
 682             if ($fetch [$_]) {
 683                 is($fetch[$_][0], $fetch[$_][1], "$names[$_]; Bug 50496");
 684             } else {
 685                 ok 0, $names[$_];
 686             }
 687         }
 688         is($res, 1, "'$s' =~ /(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz)/; Bug 50496");
 689         is($count, 3, "Got 3 keys in %+ via each; Bug 50496");
 690         is(0 + @k, 3, "Got 3 keys in %+ via keys; Bug 50496");
 691         is("@k", "A B C", "Got expected keys; Bug 50496");
 692         is("@v", "bar baz foo", "Got expected values; Bug 50496");
 693         eval '
 694             no warnings "uninitialized";
 695             print for $+ {this_key_doesnt_exist};
 696         ';
 697         is($@, '', 'lvalue $+ {...} should not throw an exception; Bug 50496');
 698     }
 699
 700     {
 701         #
 702         # Almost the same as the block above, except that the capture is nested.
 703         #
 704
 705         my $s = 'foo bar baz';
 706         my (@k, @v, @fetch, $res);
 707         my $count = 0;
 708         my @names = qw ($+{A} $+{B} $+{C} $+{D});
 709         if ($s =~ /(?<D>(?<A>foo)\s+(?<B>bar)?\s+(?<C>baz))/) {
 710             while (my ($k,$v) = each(%+)) {
 711                 $count++;
 712             }
 713             @k = sort keys   (%+);
 714             @v = sort values (%+);
 715             $res = 1;
 716             push @fetch,
 717                 ["$+{A}", "$2"],
 718                 ["$+{B}", "$3"],
 719                 ["$+{C}", "$4"],
 720                 ["$+{D}", "$1"],
 721             ;
 722         }
 723         foreach (0 .. 3) {
 724             if ($fetch [$_]) {
 725                 is($fetch[$_][0], $fetch[$_][1], "$names[$_]; Bug 50496");
 726             } else {
 727                 ok 0, $names [$_];
 728             }
 729         }
 730         is($res, 1, "'$s' =~ /(?<D>(?<A>foo)\\s+(?<B>bar)?\\s+(?<C>baz))/; Bug 50496");
 731         is($count, 4, "Got 4 keys in %+ via each; Bug 50496");
 732         is(@k, 4, "Got 4 keys in %+ via keys; Bug 50496");
 733         is("@k", "A B C D", "Got expected keys; Bug 50496");
 734         is("@v", "bar baz foo foo bar baz", "Got expected values; Bug 50496");
 735         eval '
 736             no warnings "uninitialized";
 737             print for $+ {this_key_doesnt_exist};
 738         ';
 739         is($@, '', 'lvalue $+ {...} should not throw an exception; Bug 50496');
 740     }
 741
 742     {
 743         my $str = 'abc';
 744         my $count = 0;
 745         my $mval = 0;
 746         my $pval = 0;
 747         while ($str =~ /b/g) {$mval = $#-; $pval = $#+; $count ++}
 748         is($mval,  0, '@- should be empty; Bug 36046');
 749         is($pval,  0, '@+ should be empty; Bug 36046');
 750         is($count, 1, 'Should have matched once only; Bug 36046');
 751     }
 752
 753     {
 754         my $message = '/m in precompiled regexp; Bug 40684';
 755         my $s = "abc\ndef";
 756         my $rex = qr'^abc$'m;
 757         ok($s =~ m/$rex/, $message);
 758         ok($s =~ m/^abc$/m, $message);
 759     }
 760
 761     {
 762         my $message = '(?: ... )? should not lose $^R; Bug 36909';
 763         $^R = 'Nothing';
 764         {
 765             local $^R = "Bad";
 766             ok('x foofoo y' =~ m {
 767                       (foo) # $^R correctly set
 768                       (?{ "last regexp code result" })
 769             }x, $message);
 770             is($^R, 'last regexp code result', $message);
 771         }
 772         is($^R, 'Nothing', $message);
 773
 774         {
 775             local $^R = "Bad";
 776
 777             ok('x foofoo y' =~ m {
 778                       (?:foo|bar)+ # $^R correctly set
 779                       (?{ "last regexp code result" })
 780             }x, $message);
 781             is($^R, 'last regexp code result', $message);
 782         }
 783         is($^R, 'Nothing', $message);
 784
 785         {
 786             local $^R = "Bad";
 787             ok('x foofoo y' =~ m {
 788                       (foo|bar)\1+ # $^R undefined
 789                       (?{ "last regexp code result" })
 790             }x, $message);
 791             is($^R, 'last regexp code result', $message);
 792         }
 793         is($^R, 'Nothing', $message);
 794
 795         {
 796             local $^R = "Bad";
 797             ok('x foofoo y' =~ m {
 798                       (foo|bar)\1 # This time without the +
 799                       (?{"last regexp code result"})
 800             }x, $message);
 801             is($^R, 'last regexp code result', $message);
 802         }
 803         is($^R, 'Nothing', $message);
 804     }
 805
 806     {
 807         my $message = 'Match is linear, not quadratic; Bug 22395';
 808         our $count;
 809         for my $l (10, 100, 1000) {
 810             $count = 0;
 811             ('a' x $l) =~ /(.*)(?{$count++})[bc]/;
 812             local $::TODO = "Should be L+1 not L*(L+3)/2 (L=$l)";
 813             is($count, $l + 1, $message);
 814         }
 815     }
 816
 817     {
 818         my $message = '@-/@+ should not have undefined values; Bug 22614';
 819         local $_ = 'ab';
 820         our @len = ();
 821         /(.){1,}(?{push @len,0+@-})(.){1,}(?{})^/;
 822         is("@len", "2 2 2", $message);
 823     }
 824
 825     {
 826         my $message = '$& set on s///; Bug 18209';
 827         my $text = ' word1 word2 word3 word4 word5 word6 ';
 828
 829         my @words = ('word1', 'word3', 'word5');
 830         my $count;
 831         foreach my $word (@words) {
 832             $text =~ s/$word\s//gi; # Leave a space to separate words
 833                                     # in the resultant str.
 834             # The following block is not working.
 835             if ($&) {
 836                 $count ++;
 837             }
 838             # End bad block
 839         }
 840         is($count, 3, $message);
 841         is($text, ' word2 word4 word6 ', $message);
 842     }
 843
 844     {
 845         # RT#6893
 846
 847         local $_ = qq (A\nB\nC\n);
 848         my @res;
 849         while (m#(\G|\n)([^\n]*)\n#gsx) {
 850             push @res, "$2";
 851             last if @res > 3;
 852         }
 853         is("@res", "A B C", "/g pattern shouldn't infinite loop; Bug 6893");
 854     }
 855
 856     {
 857         # No optimizer bug
 858         my @tails  = ('', '(?(1))', '(|)', '()?');
 859         my @quants = ('*','+');
 860         my $doit = sub {
 861             my $pats = shift;
 862             for (@_) {
 863                 for my $pat (@$pats) {
 864                     for my $quant (@quants) {
 865                         for my $tail (@tails) {
 866                             my $re = "($pat$quant\$)$tail";
 867                             ok(/$re/  && $1 eq $_, "'$_' =~ /$re/; Bug 41010");
 868                             ok(/$re/m && $1 eq $_, "'$_' =~ /$re/m; Bug 41010");
 869                         }
 870                     }
 871                 }
 872             }
 873         };
 874
 875         my @dpats = ('\d',
 876                      '[1234567890]',
 877                      '(1|[23]|4|[56]|[78]|[90])',
 878                      '(?:1|[23]|4|[56]|[78]|[90])',
 879                      '(1|2|3|4|5|6|7|8|9|0)',
 880                      '(?:1|2|3|4|5|6|7|8|9|0)');
 881         my @spats = ('[ ]', ' ', '( |\t)', '(?: |\t)', '[ \t]', '\s');
 882         my @sstrs = ('  ');
 883         my @dstrs = ('12345');
 884         $doit -> (\@spats, @sstrs);
 885         $doit -> (\@dpats, @dstrs);
 886     }
 887
 888     {
 889         # [perl #45605] Regexp failure with utf8-flagged and byte-flagged string
 890
 891         my $utf_8 = "\xd6schel";
 892         utf8::upgrade ($utf_8);
 893         $utf_8 =~ m {(\xd6|&Ouml;)schel};
 894         is($1, "\xd6", "Upgrade error; Bug 45605");
 895     }
 896
 897     {
 898         # Regardless of utf8ness any character matches itself when
 899         # doing a case insensitive match. See also [perl #36207]
 900
 901         for my $o (0 .. 255) {
 902             my @ch = (chr ($o), chr ($o));
 903             utf8::upgrade ($ch [1]);
 904             for my $u_str (0, 1) {
 905                 for my $u_pat (0, 1) {
 906                     like($ch[$u_str], qr/\Q$ch[$u_pat]\E/i,
 907                          "\$c =~ /\$c/i : chr ($o) : u_str = $u_str u_pat = $u_pat; Bug 36207");
 908                     like($ch[$u_str], qr/\Q$ch[$u_pat]\E|xyz/i,
 909                          "\$c=~/\$c|xyz/i : chr($o) : u_str = $u_str u_pat = $u_pat; Bug 36207");
 910                 }
 911             }
 912         }
 913     }
 914
 915     {
 916          my $message = '$REGMARK in replacement; Bug 49190';
 917          our $REGMARK;
 918          my $_ = "A";
 919          ok(s/(*:B)A/$REGMARK/, $message);
 920          is($_, "B", $message);
 921          $_ = "CCCCBAA";
 922          ok(s/(*:X)A+|(*:Y)B+|(*:Z)C+/$REGMARK/g, $message);
 923          is($_, "ZYX", $message);
 924          # Use a longer name to force reallocation of $REGMARK.
 925          $_ = "CCCCBAA";
 926          ok(s/(*:X)A+|(*:YYYYYYYYYYYYYYYY)B+|(*:Z)C+/$REGMARK/g, $message);
 927          is($_, "ZYYYYYYYYYYYYYYYYX", $message);
 928     }
 929
 930     {
 931         my $message = 'Substitution evaluation in list context; Bug 52658';
 932         my $reg = '../xxx/';
 933         my @te  = ($reg =~ m{^(/?(?:\.\./)*)},
 934                    $reg =~ s/(x)/'b'/eg > 1 ? '##' : '++');
 935         is($reg, '../bbb/', $message);
 936         is($te [0], '../', $message);
 937     }
 938
 939     {
 940         my $a = "xyzt" x 8192;
 941         like($a, qr/\A(?>[a-z])*\z/,
 942              '(?>) does not cause wrongness on long string; Bug 60034');
 943         my $b = $a . chr 256;
 944         chop $b;
 945         is($a, $b, 'Bug 60034');
 946         like($b, qr/\A(?>[a-z])*\z/,
 947              '(?>) does not cause wrongness on long string with UTF-8; Bug 60034');
 948     }
 949
 950     #
 951     # Keep the following tests last -- they may crash perl
 952     #
 953     print "# Tests that follow may crash perl\n";
 954     {
 955
 956         my $message = 'Pattern in a loop, failure should not ' .
 957                          'affect previous success; Bug 19049/38869';
 958         my @list = (
 959             'ab cdef',             # Matches regex
 960             ('e' x 40000 ) .'ab c' # Matches not, but 'ab c' matches part of it
 961         );
 962         my $y;
 963         my $x;
 964         foreach (@list) {
 965             m/ab(.+)cd/i; # The ignore-case seems to be important
 966             $y = $1;      # Use $1, which might not be from the last match!
 967             $x = substr ($list [0], $- [0], $+ [0] - $- [0]);
 968         }
 969         is($y, ' ', $message);
 970         is($x, 'ab cd', $message);
 971     }
 972
 973     {
 974         ok (("a" x (2 ** 15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker; Bug 24274");
 975         ok ((q(a)x 100) =~ /^(??{'(.)'x 100})/,
 976             "Regexp /^(??{'(.)'x 100})/ crashes older perls; Bug 24274");
 977     }
 978
 979     {
 980         # [perl #45337] utf8 + "[a]a{2}" + /$.../ = panic: sv_len_utf8 cache
 981
 982         local ${^UTF8CACHE} = -1;
 983         my $message = "Shouldn't panic; Bug 45337";
 984         my $s = "[a]a{2}";
 985         utf8::upgrade $s;
 986         like("aaa", qr/$s/, $message);
 987     }
 988     {
 989         my $message = "Check if tree logic breaks \$^R; Bug 57042";
 990         my $cond_re = qr/\s*
 991             \s* (?:
 992                    \( \s* A  (?{1})
 993                  | \( \s* B  (?{2})
 994                )
 995            /x;
 996         my @res;
 997         for my $line ("(A)","(B)") {
 998            if ($line =~ m/$cond_re/) {
 999                push @res, $^R ? "#$^R" : "UNDEF";
1000            }
1001         }
1002         is("@res","#1 #2", $message);
1003     }
1004     {
1005         no warnings 'closure';
1006         my $re = qr/A(??{"1"})/;
1007         ok "A1B" =~ m/^((??{ $re }))((??{"B"}))$/;
1008         ok $1 eq "A1";
1009         ok $2 eq "B";
1010     }
1011
1012     # This only works under -DEBUGGING because it relies on an assert().
1013     {
1014         # Check capture offset re-entrancy of utf8 code.
1015
1016         sub fswash { $_[0] =~ s/([>X])//g; }
1017
1018         my $k1 = "." x 4 . ">>";
1019         fswash($k1);
1020
1021         my $k2 = "\x{f1}\x{2022}";
1022         $k2 =~ s/([\360-\362])/>/g;
1023         fswash($k2);
1024
1025         is($k2, "\x{2022}", "utf8::SWASHNEW doesn't cause capture leaks; Bug 60508");
1026     }
1027
1028     {
1029         # minimal CURLYM limited to 32767 matches
1030         my @pat = (
1031             qr{a(x|y)*b},       # CURLYM
1032             qr{a(x|y)*?b},      # .. with minmod
1033             qr{a([wx]|[yz])*b}, # .. and without tries
1034             qr{a([wx]|[yz])*?b},
1035         );
1036         my $len = 32768;
1037         my $s = join '', 'a', 'x' x $len, 'b';
1038         for my $pat (@pat) {
1039             like($s, $pat, "$pat; Bug 65372");
1040         }
1041     }
1042
1043     {
1044         local $::TODO = "[perl #38133]";
1045
1046         "A" =~ /(((?:A))?)+/;
1047         my $first = $2;
1048
1049         "A" =~ /(((A))?)+/;
1050         my $second = $2;
1051
1052         is($first, $second);
1053     }
1054
1055     {
1056        my $message
1057         = 'utf8 =~ /trie/ where trie matches a continuation octet; Bug 70998';
1058
1059        # Catch warnings:
1060        my $w;
1061        local $SIG{__WARN__} = sub { $w .= shift };
1062
1063        # This bug can be reduced to
1064        qq{\x{30ab}} =~ /\xab|\xa9/;
1065        # but it's nice to have a more 'real-world' test. The original test
1066        # case from the RT ticket follows:
1067
1068        my %conv = (
1069                    "\xab"     => "&lt;",
1070                    "\xa9"     => "(c)",
1071                   );
1072        my $conv_rx = '(' . join('|', map { quotemeta } keys %conv) . ')';
1073        $conv_rx = qr{$conv_rx};
1074
1075        my $x
1076         = qq{\x{3042}\x{304b}\x{3055}\x{305f}\x{306a}\x{306f}\x{307e}}
1077         . qq{\x{3084}\x{3089}\x{308f}\x{3093}\x{3042}\x{304b}\x{3055}}
1078         . qq{\x{305f}\x{306a}\x{306f}\x{307e}\x{3084}\x{3089}\x{308f}}
1079         . qq{\x{3093}\x{30a2}\x{30ab}\x{30b5}\x{30bf}\x{30ca}\x{30cf}}
1080         . qq{\x{30de}\x{30e4}\x{30e9}\x{30ef}\x{30f3}\x{30a2}\x{30ab}}
1081         . qq{\x{30b5}\x{30bf}\x{30ca}\x{30cf}\x{30de}\x{30e4}\x{30e9}}
1082         . qq{\x{30ef}\x{30f3}\x{30a2}\x{30ab}\x{30b5}\x{30bf}\x{30ca}}
1083         . qq{\x{30cf}\x{30de}\x{30e4}\x{30e9}\x{30ef}\x{30f3}};
1084
1085        $x =~ s{$conv_rx}{$conv{$1}}eg;
1086
1087        is($w, undef, $message);
1088     }
1089
1090     {
1091         # minimal CURLYM limited to 32767 matches
1092
1093         is(join("-", "   abc   def  " =~ /(?=(\S+))/g), "abc-bc-c-def-ef-f",
1094            'stclass optimisation does not break + inside (?=); Bug 68564');
1095     }
1096
1097     {
1098         use charnames ":full";
1099         # Delayed interpolation of \N'
1100         my $r1 = qr/\N{THAI CHARACTER SARA I}/;
1101         my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}";
1102
1103         # Bug #56444
1104         ok $s1 =~ /$r1+/, 'my $r1 = qr/\N{THAI CHARACTER SARA I}/; my $s1 = "\x{E34}\x{E34}\x{E34}\x{E34}; $s1 =~ /$r1+/';
1105
1106         # Bug #62056
1107         ok "${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/, '"${s1}A" =~ m/$s1\N{LATIN CAPITAL LETTER A}/';
1108
1109         ok "abbbbc" =~ m/\N{1}/ && $& eq "a", '"abbbbc" =~ m/\N{1}/ && $& eq "a"';
1110         ok "abbbbc" =~ m/\N{3,4}/ && $& eq "abbb", '"abbbbc" =~ m/\N{3,4}/ && $& eq "abbb"';
1111     }
1112
1113     {
1114         use charnames ":full";
1115         my $message = '[perl #74982] Period coming after \N{}';
1116         ok("\x{ff08}." =~ m/\N{FULLWIDTH LEFT PARENTHESIS}./ && $& eq "\x{ff08}.", $message);
1117         ok("\x{ff08}." =~ m/[\N{FULLWIDTH LEFT PARENTHESIS}]./ && $& eq "\x{ff08}.", $message);
1118     }
1119
1120 SKIP: {
1121     ######## "Segfault using HTML::Entities", Richard Jolly <richardjolly@mac.com>, <A3C7D27E-C9F4-11D8-B294-003065AE00B6@mac.com> in perl-unicode@perl.org
1122
1123     skip('Perl configured without Encode module', 1)
1124         unless $Config{extensions} =~ / Encode /;
1125
1126     # Test case cut down by jhi
1127     fresh_perl_like(<<'EOP', qr!Malformed UTF-8 character \(unexpected end of string\) in substitution \(s///\) at!, 'Segfault using HTML::Entities');
1128 use Encode;
1129 my $t = ord('A') == 193 ? "\xEA" : "\xE9";
1130 Encode::_utf8_on($t);
1131 $t =~ s/([^a])//ge;
1132 EOP
1133     }
1134
1135     {
1136         # pattern must be compiled late or we can break the test file
1137         my $message = '[perl #115050] repeated nothings in a trie can cause panic';
1138         my $pattern;
1139         $pattern = '[xyz]|||';
1140         ok("blah blah" =~ /$pattern/, $message);
1141         ok("blah blah" =~ /(?:$pattern)h/, $message);
1142         $pattern = '|||[xyz]';
1143         ok("blah blah" =~ /$pattern/, $message);
1144         ok("blah blah" =~ /(?:$pattern)h/, $message);
1145     }
1146 } # End of sub run_tests
1147
1148 1;