Revert patch #25993.

[perl5.git] / t / op / pat.t
diff --git a/t/op/pat.t b/t/op/pat.t

index 54f7d14..10ecaf8 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
  
  $| = 1;
  
-print "1..995\n";
+print "1..1187\n";
  
  BEGIN {
      chdir 't' if -d 't';
@@ -20,9 +20,8 @@ $x = "abc\ndef\n";
  if ($x =~ /^abc/) {print "ok 1\n";} else {print "not ok 1\n";}
  if ($x !~ /^def/) {print "ok 2\n";} else {print "not ok 2\n";}
  
-$* = 1;
-if ($x =~ /^def/) {print "ok 3\n";} else {print "not ok 3\n";}
-$* = 0;
+# used to be a test for $*
+if ($x =~ /^def/m) {print "ok 3\n";} else {print "not ok 3\n";}
  
  $_ = '123';
  if (/^([0-9][0-9]*)/) {print "ok 4\n";} else {print "not ok 4\n";}
@@ -69,9 +68,8 @@ if (m|bc/*d|) {print "ok 22\n";} else {print "not ok 22\n";}
  
  if (/^$_$/) {print "ok 23\n";} else {print "not ok 23\n";}
  
-$* = 1;                # test 3 only tested the optimized version--this one is for real
-if ("ab\ncd\n" =~ /^cd/) {print "ok 24\n";} else {print "not ok 24\n";}
-$* = 0;
+# used to be a test for $*
+if ("ab\ncd\n" =~ /^cd/m) {print "ok 24\n";} else {print "not ok 24\n";}
  
  $XXX{123} = 123;
  $XXX{234} = 234;
@@ -1367,10 +1365,10 @@ print "ok 247\n";
      print "ok 263\n";
  }
  
-{
+SKIP: {
      my $test = 264; # till 575
  
-    use charnames ':full';
+    use charnames ":full";
  
      # This is far from complete testing, there are dozens of character
      # classes in Unicode.  The mixing of literals and \N{...} is
@@ -1691,10 +1689,11 @@ EOT
      print "not " if     $x =~ /[\x{100}]/;
      print "ok 604\n";
  
-    print "not " unless $x =~ /\p{InLatin1Supplement}/;
+    # the next two tests must be ignored on EBCDIC
+    print "not " unless $x =~ /\p{InLatin1Supplement}/ or ord("A") == 193;
      print "ok 605\n";
  
-    print "not " if     $x =~ /\P{InLatin1Supplement}/;
+    print "not " if     $x =~ /\P{InLatin1Supplement}/ and ord("A") != 193;
      print "ok 606\n";
  
      print "not " if     $x =~ /\p{InLatinExtendedA}/;
@@ -1909,8 +1908,10 @@ print "ok 663\n";
  print "not " unless chr(0xfb4f) =~ /\p{IsHebrew}/; # outside InHebrew
  print "ok 664\n";
  
-print "not " unless chr(0xb5) =~ /\p{IsGreek}/; # singleton (not in a range)
-print "ok 665\n";
+# # singleton (not in a range, this test must be ignored on EBCDIC)
+# print "not " unless chr(0xb5) =~ /\p{IsGreek}/ or ord("A") == 193;
+# print "ok 665\n";
+print "ok 665 # 0xb5 moved from Greek to Common with Unicode 4.0.1\n";
  
  print "not " unless chr(0x37a) =~ /\p{IsGreek}/; # singleton
  print "ok 666\n";
@@ -2235,10 +2236,11 @@ print "# some Unicode properties\n";
  }
  
  {
-    print "not " unless "a" =~ /\p{L&}/;
+    # L& and LC are the same
+    print "not " unless "a" =~ /\p{LC}/ and "a" =~ /\p{L&}/;
      print "ok 743\n";
  
-    print "not " if     "1" =~ /\p{L&}/;
+    print "not " if     "1" =~ /\p{LC}/ or "1" =~ /\p{L&}/;
      print "ok 744\n";
  }
  
@@ -3142,11 +3144,254 @@ ok("bbbbac" =~ /$pattern/ && $1 eq 'a', "[perl #3547]");
  {
      my $i;
      ok('-1-3-5-' eq join('', split /((??{$i++}))/, '-1-3-5-'),
-       "[perl #21411] (??{ .. }) corrupts split's stack")
+       "[perl #21411] (??{ .. }) corrupts split's stack");
+    split /(?{'WOW'})/, 'abc';
+    ok('a|b|c' eq join ('|', @_),
+       "[perl #21411] (?{ .. }) version of the above");
+}
+
+{
+    split /(?{ split "" })/, "abc";
+    ok(1,'cache_re & "(?{": it dumps core in 5.6.1 & 5.8.0');
  }
  
  {
      ok("\x{100}\n" =~ /\x{100}\n$/, "UTF8 length cache and fbm_compile");  
  }
  
-# last test 995
+{
+    package Str;
+    use overload q/""/ => sub { ${$_[0]}; };
+    sub new { my ($c, $v) = @_; bless \$v, $c; }
+
+    package main;
+    $_ = Str->new("a\x{100}/\x{100}b");
+    ok(join(":", /\b(.)\x{100}/g) eq "a:/", "re_intuit_start and PL_bostr");
+}
+
+{
+    $_ = "code:   'x' { '...' }\n"; study;
+    my @x; push @x, $& while m/'[^\']*'/gx;
+    ok(join(":", @x) eq "'x':'...'",
+       "[perl #17757] Parse::RecDescent triggers infinite loop");
+}
+
+{
+    my $re = qq/^([^X]*)X/;
+    utf8::upgrade($re);
+    ok("\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED");
+}
+
+# bug #22354
+sub func ($) {
+    ok( "a\nb" !~ /^b/, $_[0] );
+    ok( "a\nb" =~ /^b/m, "$_[0] - with /m" );
+}
+func "standalone";
+$_ = "x"; s/x/func "in subst"/e;
+$_ = "x"; s/x/func "in multiline subst"/em;
+#$_ = "x"; /x(?{func "in regexp"})/;
+#$_ = "x"; /x(?{func "in multiline regexp"})/m;
+
+# bug #19049
+$_="abcdef\n";
+@x = m/./g;
+ok("abcde" eq "$`", '# TODO #19049 - global match not setting $`');
+
+ok("123\x{100}" =~ /^.*1.*23\x{100}$/, 'uft8 + multiple floating substr');
+
+# LATIN SMALL/CAPITAL LETTER A WITH MACRON
+ok("  \x{101}" =~ qr/\x{100}/i,
+   "<20030808193656.5109.1@llama.ni-s.u-net.com>");
+
+# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW
+ok("  \x{1E01}" =~ qr/\x{1E00}/i,
+   "<20030808193656.5109.1@llama.ni-s.u-net.com>");
+
+# DESERET SMALL/CAPITAL LETTER LONG I
+ok("  \x{10428}" =~ qr/\x{10400}/i,
+   "<20030808193656.5109.1@llama.ni-s.u-net.com>");
+
+# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'
+ok("  \x{1E01}x" =~ qr/\x{1E00}X/i,
+   "<20030808193656.5109.1@llama.ni-s.u-net.com>");
+
+{
+    # [perl #23769] Unicode regex broken on simple example
+    # regrepeat() didn't handle UTF-8 EXACT case right.
+
+    my $s = "\x{a0}\x{a0}\x{a0}\x{100}"; chop $s;
+
+    ok($s =~ /\x{a0}/,       "[perl #23769]");
+    ok($s =~ /\x{a0}+/,      "[perl #23769]");
+    ok($s =~ /\x{a0}\x{a0}/, "[perl #23769]");
+
+    ok("aaa\x{100}" =~ /(a+)/, "[perl #23769] easy invariant");
+    ok($1 eq "aaa", "[perl #23769]");
+
+    ok("\xa0\xa0\xa0\x{100}" =~ /(\xa0+)/, "[perl #23769] regrepeat invariant");
+    ok($1 eq "\xa0\xa0\xa0", "[perl #23769]");
+
+    ok("ababab\x{100}  " =~ /((?:ab)+)/, "[perl #23769] hard invariant");
+    ok($1 eq "ababab", "[perl #23769]");
+
+    ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+)/, "[perl #23769] hard variant");
+    ok($1 eq "\xa0\xa1\xa0\xa1\xa0\xa1", "[perl #23769]");
+
+    ok("aaa\x{100}     " =~ /(a+?)/, "[perl #23769] easy invariant");
+    ok($1 eq "a", "[perl #23769]");
+
+    ok("\xa0\xa0\xa0\x{100}    " =~ /(\xa0+?)/, "[perl #23769] regrepeat variant");
+    ok($1 eq "\xa0", "[perl #23769]");
+
+    ok("ababab\x{100}  " =~ /((?:ab)+?)/, "[perl #23769] hard invariant");
+    ok($1 eq "ab", "[perl #23769]");
+
+    ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+?)/, "[perl #23769] hard variant");
+    ok($1 eq "\xa0\xa1", "[perl #23769]");
+
+    ok("\xc4\xc4\xc4" !~ /(\x{100}+)/, "[perl #23769] don't match first byte of utf8 representation");
+    ok("\xc4\xc4\xc4" !~ /(\x{100}+?)/, "[perl #23769] don't match first byte of utf8 representation");
+}
+
+for (120 .. 130) {
+    my $head = 'x' x $_;
+    for my $tail ('\x{0061}', '\x{1234}') {
+       ok(
+           eval qq{ "$head$tail" =~ /$head$tail/ },
+           '\x{...} misparsed in regexp near 127 char EXACT limit'
+       );
+    }
+}
+
+# perl #25269: panic: pp_match start/end pointers
+ok("a-bc" eq eval {
+       my($x, $y) = "bca" =~ /^(?=.*(a)).*(bc)/;
+       "$x-$y";
+}, 'captures can move backwards in string');
+
+# perl #27940: \cA not recognized in character classes
+ok("a\cAb" =~ /\cA/, '\cA in pattern');
+ok("a\cAb" =~ /[\cA]/, '\cA in character class');
+ok("a\cAb" =~ /[\cA-\cB]/, '\cA in character class range');
+ok("abc" =~ /[^\cA-\cB]/, '\cA in negated character class range');
+ok("a\cBb" =~ /[\cA-\cC]/, '\cB in character class range');
+ok("a\cCbc" =~ /[^\cA-\cB]/, '\cC in negated character class range');
+ok("a\cAb" =~ /(??{"\cA"})/, '\cA in ??{} pattern');
+
+# perl #28532: optional zero-width match at end of string is ignored
+ok(("abc" =~ /^abc(\z)?/) && defined($1),
+    'optional zero-width match at end of string');
+ok(("abc" =~ /^abc(\z)??/) && !defined($1),
+    'optional zero-width match at end of string');
+
+
+
+{ # TRIE related
+    my @got=();
+    "words"=~/(word|word|word)(?{push @got,$1})s$/;
+    ok(@got==1,"TRIE optimation is working") or warn "# @got";
+    @got=();
+    "words"=~/(word|word|word)(?{push @got,$1})s$/i;
+    ok(@got==1,"TRIEF optimisation is working") or warn "# @got";
+
+    my @nums=map {int rand 1000} 1..100;
+    my $re="(".(join "|",@nums).")";
+    $re=qr/\b$re\b/;
+
+    foreach (@nums) {
+        ok($_=~/$re/,"Trie nums");
+    }
+    $_=join " ", @nums;
+    @got=();
+    push @got,$1 while /$re/g;
+
+    my %count;
+    $count{$_}++ for @got;
+    my $ok=1;
+    for (@nums) {
+        $ok=0 if --$count{$_}<0;
+    }
+    ok($ok,"Trie min count matches");
+}
+
+
+# TRIE related
+# LATIN SMALL/CAPITAL LETTER A WITH MACRON
+ok(("foba  \x{101}foo" =~ qr/(foo|\x{100}foo|bar)/i) && $1 eq "\x{101}foo",
+   "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH MACRON");
+
+# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW
+ok(("foba  \x{1E01}foo" =~ qr/(foo|\x{1E00}foo|bar)/i) && $1 eq "\x{1E01}foo",
+   "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW");
+
+# DESERET SMALL/CAPITAL LETTER LONG I
+ok(("foba  \x{10428}foo" =~ qr/(foo|\x{10400}foo|bar)/i) &&  $1 eq "\x{10428}foo",
+   "TRIEF + DESERET SMALL/CAPITAL LETTER LONG I");
+
+# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'
+ok(("foba  \x{1E01}xfoo" =~ qr/(foo|\x{1E00}Xfoo|bar)/i) &&  $1 eq "\x{1E01}xfoo",
+   "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'");
+
+{# TRIE related
+
+use charnames ':full';
+
+$s="\N{LATIN SMALL LETTER SHARP S}";
+ok(("foba  ba$s" =~ qr/(foo|Ba$s|bar)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ ss");
+ok(("foba  ba$s" =~ qr/(Ba$s|foo|bar)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ ss");
+ok(("foba  ba$s" =~ qr/(foo|bar|Ba$s)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ ss");
+
+ok(("foba  ba$s" =~ qr/(foo|Bass|bar)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ ss");
+
+ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ SS");
+}
+
+
+
+{
+    my @normal=qw(these are some normal words);
+    my $psycho=join "|",@normal,map chr $_,255..20000;
+    ok(('these'=~/($psycho)/) && $1 eq 'these','Pyscho');
+}
+
+# [perl #36207] mixed utf8 / latin-1 and case folding
+
+{
+    my $utf8 = "\xe9\x{100}"; chop $utf8;
+    my $latin1 = "\xe9";
+
+    ok($utf8 =~ /\xe9/i, "utf8/latin");
+    ok($utf8 =~ /$latin1/i, "utf8/latin runtime");
+    ok($utf8 =~ /(abc|\xe9)/i, "utf8/latin trie");
+    ok($utf8 =~ /(abc|$latin1)/i, "utf8/latin trie runtime");
+
+    ok("\xe9" =~ /$utf8/i, "# TODO latin/utf8");
+    ok("\xe9" =~ /(abc|$utf8)/i, "# latin/utf8 trie");
+    ok($latin1 =~ /$utf8/i, "# TODO latin/utf8 runtime");
+    ok($latin1 =~ /(abc|$utf8)/i, "# latin/utf8 trie runtime");
+}
+
+# [perl #37038] Global regular matches generate invalid pointers
+
+{
+    my $s = "abcd";
+    $s =~ /(..)(..)/g;
+    $s = $1;
+    $s = $2;
+    ok($s eq 'cd',
+       "# TODO assigning to original string should not corrupt match vars");
+}
+
+# last test 1187
+