Re: \N{...} in regular expression [PATCH]

[perl5.git] / t / op / pat.t
diff --git a/t/op/pat.t b/t/op/pat.t

index a6007a6..97bad61 100755 (executable)
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
  
  $| = 1;
  
-print "1..942\n";
+# Test counter output is generated by a BEGIN block at bottom of file
  
  BEGIN {
      chdir 't' if -d 't';
@@ -20,9 +20,8 @@ $x = "abc\ndef\n";
  if ($x =~ /^abc/) {print "ok 1\n";} else {print "not ok 1\n";}
  if ($x !~ /^def/) {print "ok 2\n";} else {print "not ok 2\n";}
  
-$* = 1;
-if ($x =~ /^def/) {print "ok 3\n";} else {print "not ok 3\n";}
-$* = 0;
+# used to be a test for $*
+if ($x =~ /^def/m) {print "ok 3\n";} else {print "not ok 3\n";}
  
  $_ = '123';
  if (/^([0-9][0-9]*)/) {print "ok 4\n";} else {print "not ok 4\n";}
@@ -69,9 +68,8 @@ if (m|bc/*d|) {print "ok 22\n";} else {print "not ok 22\n";}
  
  if (/^$_$/) {print "ok 23\n";} else {print "not ok 23\n";}
  
-$* = 1;                # test 3 only tested the optimized version--this one is for real
-if ("ab\ncd\n" =~ /^cd/) {print "ok 24\n";} else {print "not ok 24\n";}
-$* = 0;
+# used to be a test for $*
+if ("ab\ncd\n" =~ /^cd/m) {print "ok 24\n";} else {print "not ok 24\n";}
  
  $XXX{123} = 123;
  $XXX{234} = 234;
@@ -81,12 +79,21 @@ $XXX{345} = 345;
  while ($_ = shift(@XXX)) {
      ?(.*)? && (print $1,"\n");
      /not/ && reset;
-    /not ok 26/ && reset 'X';
+    if (/not ok 26/) {
+      if ($^O eq 'VMS') {
+       $_ = shift(@XXX);
+      }
+      else {
+       reset 'X';
+      }
+   }
  }
  
-while (($key,$val) = each(%XXX)) {
+if ($^O ne 'VMS') {
+  while (($key,$val) = each(%XXX)) {
      print "not ok 27\n";
      exit;
+  }
  }
  
  print "ok 27\n";
@@ -1278,7 +1285,7 @@ print "ok 247\n";
  {
      # bug id 20001008.001
  
-    my $test = 248;
+    $test = 248;
      my @x = ("stra\337e 138","stra\337e 138");
      for (@x) {
         s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
@@ -1367,10 +1374,10 @@ print "ok 247\n";
      print "ok 263\n";
  }
  
-{
-    my $test = 264; # till 575
+SKIP: {
+    $test = 264; # till 575
  
-    use charnames ':full';
+    use charnames ":full";
  
      # This is far from complete testing, there are dozens of character
      # classes in Unicode.  The mixing of literals and \N{...} is
@@ -1691,10 +1698,11 @@ EOT
      print "not " if     $x =~ /[\x{100}]/;
      print "ok 604\n";
  
-    print "not " unless $x =~ /\p{InLatin1Supplement}/;
+    # the next two tests must be ignored on EBCDIC
+    print "not " unless $x =~ /\p{InLatin1Supplement}/ or ord("A") == 193;
      print "ok 605\n";
  
-    print "not " if     $x =~ /\P{InLatin1Supplement}/;
+    print "not " if     $x =~ /\P{InLatin1Supplement}/ and ord("A") != 193;
      print "ok 606\n";
  
      print "not " if     $x =~ /\p{InLatinExtendedA}/;
@@ -1909,8 +1917,10 @@ print "ok 663\n";
  print "not " unless chr(0xfb4f) =~ /\p{IsHebrew}/; # outside InHebrew
  print "ok 664\n";
  
-print "not " unless chr(0xb5) =~ /\p{IsGreek}/; # singleton (not in a range)
-print "ok 665\n";
+# # singleton (not in a range, this test must be ignored on EBCDIC)
+# print "not " unless chr(0xb5) =~ /\p{IsGreek}/ or ord("A") == 193;
+# print "ok 665\n";
+print "ok 665 # 0xb5 moved from Greek to Common with Unicode 4.0.1\n";
  
  print "not " unless chr(0x37a) =~ /\p{IsGreek}/; # singleton
  print "ok 666\n";
@@ -2021,13 +2031,13 @@ print "ok 683\n" if @a == 9 && "@a" eq "f o o \n $a $b b a r";
  }
  
  
-my $test = 687;
+$test = 687;
  
  # Force scalar context on the patern match
-sub ok ($$) {
+sub ok ($;$) {
      my($ok, $name) = @_;
  
-    printf "%sok %d - %s\n", ($ok ? "" : "not "), $test, $name;
+    printf "%sok %d - %s\n", ($ok ? "" : "not "), $test, $name||'unnamed';
  
      printf "# Failed test at line %d\n", (caller)[2] unless $ok;
  
@@ -2235,10 +2245,11 @@ print "# some Unicode properties\n";
  }
  
  {
-    print "not " unless "a" =~ /\p{L&}/;
+    # L& and LC are the same
+    print "not " unless "a" =~ /\p{LC}/ and "a" =~ /\p{L&}/;
      print "ok 743\n";
  
-    print "not " if     "1" =~ /\p{L&}/;
+    print "not " if     "1" =~ /\p{LC}/ or "1" =~ /\p{L&}/;
      print "ok 744\n";
  }
  
@@ -2592,42 +2603,32 @@ print "# some Unicode properties\n";
  
      use charnames ':full';
  
-    print "\N{LATIN SMALL LETTER SHARP S}" =~
-       /\N{LATIN SMALL LETTER SHARP S}/    ? "ok 835\n" : "not ok 835\n";
-
-    print "\N{LATIN SMALL LETTER SHARP S}" =~
-       /\N{LATIN SMALL LETTER SHARP S}/i   ? "ok 836\n" : "not ok 836\n";
-
-    print "\N{LATIN SMALL LETTER SHARP S}" =~
-       /[\N{LATIN SMALL LETTER SHARP S}]/  ? "ok 837\n" : "not ok 837\n";
-
-    print "\N{LATIN SMALL LETTER SHARP S}" =~
-       /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 838\n" : "not ok 838\n";
-
-    print "ss" =~
-       /\N{LATIN SMALL LETTER SHARP S}/i   ? "ok 839\n" : "not ok 839\n";
-
-    print "SS" =~
-       /\N{LATIN SMALL LETTER SHARP S}/i   ? "ok 840\n" : "not ok 840\n";
+    $test= 835;
  
-    print "ss" =~
-       /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n";
+    ok("\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER SHARP S}/);
+    ok("\N{LATIN SMALL LETTER SHARP S}" =~ /\N{LATIN SMALL LETTER SHARP S}/i);
  
-    print "SS" =~
-       /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 842\n" : "not ok 842\n";
+    ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}]/);
+    ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i);
  
-    print "\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i ?
-       "ok 843\n" : "not ok 843\n";
+    ok("ss" =~ /\N{LATIN SMALL LETTER SHARP S}/i);
+    ok("SS" =~ /\N{LATIN SMALL LETTER SHARP S}/i);
+    ok("ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i);
+    ok("SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i);
  
-    print "\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i ?
-       "ok 844\n" : "not ok 844\n";
+    ok("\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i);
+    ok("\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i);
  }
  
  {
      print "# more whitespace: U+0085, U+2028, U+2029\n";
  
      # U+0085 needs to be forced to be Unicode, the \x{100} does that.
-    print "<\x{100}\x{0085}>" =~ /<\x{100}\s>/ ? "ok 845\n" : "not ok 845\n";
+    if ($ordA == 193) {
+       print "<\x{100}\x{0085}>" =~ /<\x{100}e>/ ? "ok 845\n" : "not ok 845\n";
+    } else {
+       print "<\x{100}\x{0085}>" =~ /<\x{100}\s>/ ? "ok 845\n" : "not ok 845\n";
+    }
      print "<\x{2028}>" =~ /<\s>/ ? "ok 846\n" : "not ok 846\n";
      print "<\x{2029}>" =~ /<\s>/ ? "ok 847\n" : "not ok 847\n";
  }
@@ -2735,7 +2736,7 @@ print "# some Unicode properties\n";
      # check utf8/non-utf8 mixtures
      # try to force all float/anchored check combinations
      my $c = "\x{100}";
-    my $test = 865;
+    $test = 865;
      my $subst;
      for my $re (
         "xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", "xx.*(?=$c)", "(?=$c).*xx",
@@ -2774,7 +2775,7 @@ print "# some Unicode properties\n";
  
  {
      print "# qr/.../x\n";
-    my $test = 893;
+    $test = 893;
  
      my $R = qr/ A B C # D E/x;
  
@@ -2790,7 +2791,7 @@ print "# some Unicode properties\n";
  
  {
      print "# illegal Unicode properties\n";
-    my $test = 896;
+    $test = 896;
  
      print eval { "a" =~ /\pq / }      ? "not ok $test\n" : "ok $test\n";
      $test++;
@@ -2802,7 +2803,7 @@ print "# some Unicode properties\n";
  {
      print "# [ID 20020412.005] wrong pmop flags checked when empty pattern\n";
      # requires reuse of last successful pattern
-    my $test = 898;
+    $test = 898;
      $test =~ /\d/;
      for (0 .. 1) {
         my $match = ?? + 0;
@@ -2982,8 +2983,8 @@ sub IsSyriac1 {
  END
  }
  
-print "\x{0712}" =~ /\p{IsSyriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
-print "\x{072F}" =~ /\P{IsSyriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+ok("\x{0712}" =~ /\p{IsSyriac1}/, '\x{0712}, \p{IsSyriac1}');
+ok("\x{072F}" =~ /\P{IsSyriac1}/, '\x{072F}, \P{IsSyriac1}');
  
  sub Syriac1 {
      return <<'END';
@@ -2992,17 +2993,645 @@ sub Syriac1 {
  END
  }
  
-print "\x{0712}" =~ /\p{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
-print "\x{072F}" =~ /\P{Syriac1}/ ? "ok $test\n" : "not ok $test\n"; $test++;
+ok("\x{0712}" =~ /\p{Syriac1}/, '\x{0712}, \p{Syriac1}');
+ok("\x{072F}" =~ /\P{Syriac1}/, '\x{072F}, \p{Syriac1}');
+
+print "# user-defined character properties may lack \\n at the end\n";
+sub InGreekSmall   { return "03B1\t03C9" }
+sub InGreekCapital { return "0391\t03A9\n-03A2" }
+
+ok("\x{03C0}" =~ /\p{InGreekSmall}/,   "Small pi");
+ok("\x{03C2}" =~ /\p{InGreekSmall}/,   "Final sigma");
+ok("\x{03A0}" =~ /\p{InGreekCapital}/, "Capital PI");
+ok("\x{03A2}" =~ /\P{InGreekCapital}/, "Reserved");
+
+sub AsciiHexAndDash {
+    return <<'END';
++utf8::ASCII_Hex_Digit
++utf8::Dash
+END
+}
+
+ok("-" =~ /\p{Dash}/,            "'-' is Dash");
+ok("A" =~ /\p{ASCII_Hex_Digit}/, "'A' is ASCII_Hex_Digit");
+ok("-" =~ /\p{AsciiHexAndDash}/, "'-' is AsciiHexAndDash");
+ok("A" =~ /\p{AsciiHexAndDash}/, "'A' is AsciiHexAndDash");
  
  {
-    # Change #18179: previously failed with "panic: end_shift"
+    print "# Change #18179\n";
+    # previously failed with "panic: end_shift
      my $s = "\x{100}" x 5;
      my $ok = $s =~ /(\x{100}{4})/;
      my($ord, $len) = (ord $1, length $1);
      print +($ok && $ord == 0x100 && $len == 4)
-           ? "ok $test\n" : "not ok $test\t# $ok/$ord/$len\n";
+           ? "ok $test\n" : "not ok $test\t# [#18179] $ok/$ord/$len\n";
      ++$test;
  }
  
-# last test 942
+{
+    print "# [perl #15763]\n";
+
+    $a = "x\x{100}";
+    chop $a; # but leaves the UTF-8 flag
+    $a .= "y"; # 1 byte before "y"
+
+    ok($a =~ /^\C/,      'match one \C on 1-byte UTF-8');
+    ok($a =~ /^\C{1}/,   'match \C{1}');
+
+    ok($a =~ /^\Cy/,      'match \Cy');
+    ok($a =~ /^\C{1}y/,   'match \C{1}y');
+
+    $a = "\x{100}y"; # 2 bytes before "y"
+
+    ok($a =~ /^\C/,       'match one \C on 2-byte UTF-8');
+    ok($a =~ /^\C{1}/,    'match \C{1}');
+    ok($a =~ /^\C\C/,     'match two \C');
+    ok($a =~ /^\C{2}/,    'match \C{2}');
+
+    ok($a =~ /^\C\C\C/,    'match three \C on 2-byte UTF-8 and a byte');
+    ok($a =~ /^\C{3}/,     'match \C{3}');
+
+    ok($a =~ /^\C\Cy/,     'match two \C');
+    ok($a =~ /^\C{2}y/,    'match \C{2}');
+
+    ok($a !~ /^\C\C\Cy/,    q{don't match three \Cy});
+    ok($a !~ /^\C{2}\Cy/,   q{don't match \C{3}y});
+
+    $a = "\x{1000}y"; # 3 bytes before "y"
+
+    ok($a =~ /^\C/,         'match one \C on three-byte UTF-8');
+    ok($a =~ /^\C{1}/,      'match \C{1}');
+    ok($a =~ /^\C\C/,       'match two \C');
+    ok($a =~ /^\C{2}/,      'match \C{2}');
+    ok($a =~ /^\C\C\C/,     'match three \C');
+    ok($a =~ /^\C{3}/,      'match \C{3}');
+
+    ok($a =~ /^\C\C\C\C/,   'match four \C on three-byte UTF-8 and a byte');
+    ok($a =~ /^\C{4}/,      'match \C{4}');
+
+    ok($a =~ /^\C\C\Cy/,    'match three \Cy');
+    ok($a =~ /^\C{3}y/,     'match \C{3}y');
+
+    ok($a !~ /^\C\C\C\C\y/, q{don't match four \Cy});
+    ok($a !~ /^\C{4}y/,     q{don't match \C{4}y});
+}
+
+$_ = 'aaaaaaaaaa';
+utf8::upgrade($_); chop $_; $\="\n";
+ok(/[^\s]+/, "m/[^\s]/ utf8");
+ok(/[^\d]+/, "m/[^\d]/ utf8");
+ok(($a = $_, $_ =~ s/[^\s]+/./g), "s/[^\s]/ utf8");
+ok(($a = $_, $a =~ s/[^\d]+/./g), "s/[^\s]/ utf8");
+
+ok("\x{100}" =~ /\x{100}/, "[perl #15397]");
+ok("\x{100}" =~ /(\x{100})/, "[perl #15397]");
+ok("\x{100}" =~ /(\x{100}){1}/, "[perl #15397]");
+ok("\x{100}\x{100}" =~ /(\x{100}){2}/, "[perl #15397]");
+ok("\x{100}\x{100}" =~ /(\x{100})(\x{100})/, "[perl #15397]");
+
+$x = "CD";
+$x =~ /(AB)*?CD/;
+ok(!defined $1, "[perl #7471]");
+
+$x = "CD";
+$x =~ /(AB)*CD/;
+ok(!defined $1, "[perl #7471]");
+
+$pattern = "^(b+?|a){1,2}c";
+ok("bac"    =~ /$pattern/ && $1 eq 'a', "[perl #3547]");
+ok("bbac"   =~ /$pattern/ && $1 eq 'a', "[perl #3547]");
+ok("bbbac"  =~ /$pattern/ && $1 eq 'a', "[perl #3547]");
+ok("bbbbac" =~ /$pattern/ && $1 eq 'a', "[perl #3547]");
+
+{
+    # [perl #18232]
+    "\x{100}" =~ /(.)/;
+    ok( $1 eq "\x{100}", '$1 is utf-8 [perl #18232]' );
+    { 'a' =~ /./; }
+    ok( $1 eq "\x{100}", '$1 is still utf-8' );
+    ok( $1 ne "\xC4\x80", '$1 is not non-utf-8' );
+}
+
+{
+    use utf8;
+    my $attr = 'Name-1' ;
+
+    my $NormalChar          = qr/[\p{IsDigit}\p{IsLower}\p{IsUpper}]/;
+    my $NormalWord          = qr/${NormalChar}+?/;
+    my $PredNameHyphen      = qr/^${NormalWord}(\-${NormalWord})*?$/;
+
+    $attr =~ /^$/;
+    ok( $attr =~ $PredNameHyphen, "[perl #19767] original test" );
+}
+
+{
+    use utf8;
+    "a" =~ m/[b]/;
+    ok ( "0" =~ /\p{N}+\z/, "[perl #19767] variant test" );
+}
+
+{
+
+    $p = 1;
+    foreach (1,2,3,4) {
+           $p++ if /(??{ $p })/
+    }
+    ok ($p == 5, "[perl #20683] (??{ }) returns stale values");
+    { package P; $a=1; sub TIESCALAR { bless[] } sub FETCH { $a++ } }
+    tie $p, P;
+    foreach (1,2,3,4) {
+           /(??{ $p })/
+    }
+    ok ( $p == 5, "(??{ }) returns stale values");
+}
+
+{
+  # Subject: Odd regexp behavior
+  # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>
+  # Date: Wed, 26 Feb 2003 16:53:12 +0000
+  # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk>
+  # To: perl-unicode@perl.org
+    
+  $x = "\x{2019}\nk"; $x =~ s/(\S)\n(\S)/$1 $2/sg;
+  ok($x eq "\x{2019} k", "Markus Kuhn 2003-02-26");
+
+  $x = "b\nk"; $x =~ s/(\S)\n(\S)/$1 $2/sg;
+  ok($x eq "b k", "Markus Kuhn 2003-02-26");
+
+  ok("\x{2019}" =~ /\S/, "Markus Kuhn 2003-02-26");
+}
+
+{
+    my $i;
+    ok('-1-3-5-' eq join('', split /((??{$i++}))/, '-1-3-5-'),
+       "[perl #21411] (??{ .. }) corrupts split's stack");
+    split /(?{'WOW'})/, 'abc';
+    ok('a|b|c' eq join ('|', @_),
+       "[perl #21411] (?{ .. }) version of the above");
+}
+
+{
+    # XXX DAPM 13-Apr-06. Recursive split is still broken. It's only luck it
+    # hasn't been crashing. Disable this test until it is fixed properly.
+    # XXX also check what it returns rather than just doing ok(1,...)
+    # split /(?{ split "" })/, "abc";
+    ok(1,'cache_re & "(?{": it dumps core in 5.6.1 & 5.8.0');
+}
+
+{
+    ok("\x{100}\n" =~ /\x{100}\n$/, "UTF8 length cache and fbm_compile");  
+}
+
+{
+    package Str;
+    use overload q/""/ => sub { ${$_[0]}; };
+    sub new { my ($c, $v) = @_; bless \$v, $c; }
+
+    package main;
+    $_ = Str->new("a\x{100}/\x{100}b");
+    ok(join(":", /\b(.)\x{100}/g) eq "a:/", "re_intuit_start and PL_bostr");
+}
+
+{
+    $_ = "code:   'x' { '...' }\n"; study;
+    my @x; push @x, $& while m/'[^\']*'/gx;
+    ok(join(":", @x) eq "'x':'...'",
+       "[perl #17757] Parse::RecDescent triggers infinite loop");
+}
+
+{
+    my $re = qq/^([^X]*)X/;
+    utf8::upgrade($re);
+    ok("\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED");
+}
+
+# bug #22354
+sub func ($) {
+    ok( "a\nb" !~ /^b/, $_[0] );
+    ok( "a\nb" =~ /^b/m, "$_[0] - with /m" );
+}
+func "standalone";
+$_ = "x"; s/x/func "in subst"/e;
+$_ = "x"; s/x/func "in multiline subst"/em;
+#$_ = "x"; /x(?{func "in regexp"})/;
+#$_ = "x"; /x(?{func "in multiline regexp"})/m;
+
+# bug #19049
+$_="abcdef\n";
+@x = m/./g;
+ok("abcde" eq "$`", '# TODO #19049 - global match not setting $`');
+
+ok("123\x{100}" =~ /^.*1.*23\x{100}$/, 'uft8 + multiple floating substr');
+
+# LATIN SMALL/CAPITAL LETTER A WITH MACRON
+ok("  \x{101}" =~ qr/\x{100}/i,
+   "<20030808193656.5109.1@llama.ni-s.u-net.com>");
+
+# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW
+ok("  \x{1E01}" =~ qr/\x{1E00}/i,
+   "<20030808193656.5109.1@llama.ni-s.u-net.com>");
+
+# DESERET SMALL/CAPITAL LETTER LONG I
+ok("  \x{10428}" =~ qr/\x{10400}/i,
+   "<20030808193656.5109.1@llama.ni-s.u-net.com>");
+
+# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'
+ok("  \x{1E01}x" =~ qr/\x{1E00}X/i,
+   "<20030808193656.5109.1@llama.ni-s.u-net.com>");
+
+{
+    # [perl #23769] Unicode regex broken on simple example
+    # regrepeat() didn't handle UTF-8 EXACT case right.
+
+    my $s = "\x{a0}\x{a0}\x{a0}\x{100}"; chop $s;
+
+    ok($s =~ /\x{a0}/,       "[perl #23769]");
+    ok($s =~ /\x{a0}+/,      "[perl #23769]");
+    ok($s =~ /\x{a0}\x{a0}/, "[perl #23769]");
+
+    ok("aaa\x{100}" =~ /(a+)/, "[perl #23769] easy invariant");
+    ok($1 eq "aaa", "[perl #23769]");
+
+    ok("\xa0\xa0\xa0\x{100}" =~ /(\xa0+)/, "[perl #23769] regrepeat invariant");
+    ok($1 eq "\xa0\xa0\xa0", "[perl #23769]");
+
+    ok("ababab\x{100}  " =~ /((?:ab)+)/, "[perl #23769] hard invariant");
+    ok($1 eq "ababab", "[perl #23769]");
+
+    ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+)/, "[perl #23769] hard variant");
+    ok($1 eq "\xa0\xa1\xa0\xa1\xa0\xa1", "[perl #23769]");
+
+    ok("aaa\x{100}     " =~ /(a+?)/, "[perl #23769] easy invariant");
+    ok($1 eq "a", "[perl #23769]");
+
+    ok("\xa0\xa0\xa0\x{100}    " =~ /(\xa0+?)/, "[perl #23769] regrepeat variant");
+    ok($1 eq "\xa0", "[perl #23769]");
+
+    ok("ababab\x{100}  " =~ /((?:ab)+?)/, "[perl #23769] hard invariant");
+    ok($1 eq "ab", "[perl #23769]");
+
+    ok("\xa0\xa1\xa0\xa1\xa0\xa1\x{100}" =~ /((?:\xa0\xa1)+?)/, "[perl #23769] hard variant");
+    ok($1 eq "\xa0\xa1", "[perl #23769]");
+
+    ok("\xc4\xc4\xc4" !~ /(\x{100}+)/, "[perl #23769] don't match first byte of utf8 representation");
+    ok("\xc4\xc4\xc4" !~ /(\x{100}+?)/, "[perl #23769] don't match first byte of utf8 representation");
+}
+
+for (120 .. 130) {
+    my $head = 'x' x $_;
+    for my $tail ('\x{0061}', '\x{1234}') {
+       ok(
+           eval qq{ "$head$tail" =~ /$head$tail/ },
+           '\x{...} misparsed in regexp near 127 char EXACT limit'
+       );
+    }
+}
+
+# perl #25269: panic: pp_match start/end pointers
+ok("a-bc" eq eval {
+       my($x, $y) = "bca" =~ /^(?=.*(a)).*(bc)/;
+       "$x-$y";
+}, 'captures can move backwards in string');
+
+# perl #27940: \cA not recognized in character classes
+ok("a\cAb" =~ /\cA/, '\cA in pattern');
+ok("a\cAb" =~ /[\cA]/, '\cA in character class');
+ok("a\cAb" =~ /[\cA-\cB]/, '\cA in character class range');
+ok("abc" =~ /[^\cA-\cB]/, '\cA in negated character class range');
+ok("a\cBb" =~ /[\cA-\cC]/, '\cB in character class range');
+ok("a\cCbc" =~ /[^\cA-\cB]/, '\cC in negated character class range');
+ok("a\cAb" =~ /(??{"\cA"})/, '\cA in ??{} pattern');
+ok("ab" !~ /a\cIb/x, '\cI in pattern');
+
+# perl #28532: optional zero-width match at end of string is ignored
+ok(("abc" =~ /^abc(\z)?/) && defined($1),
+    'optional zero-width match at end of string');
+ok(("abc" =~ /^abc(\z)??/) && !defined($1),
+    'optional zero-width match at end of string');
+
+
+
+{ # TRIE related
+    my @got=();
+    "words"=~/(word|word|word)(?{push @got,$1})s$/;
+    ok(@got==1,"TRIE optimation is working") or warn "# @got";
+    @got=();
+    "words"=~/(word|word|word)(?{push @got,$1})s$/i;
+    ok(@got==1,"TRIEF optimisation is working") or warn "# @got";
+
+    my @nums=map {int rand 1000} 1..100;
+    my $re="(".(join "|",@nums).")";
+    $re=qr/\b$re\b/;
+
+    foreach (@nums) {
+        ok($_=~/$re/,"Trie nums");
+    }
+    $_=join " ", @nums;
+    @got=();
+    push @got,$1 while /$re/g;
+
+    my %count;
+    $count{$_}++ for @got;
+    my $ok=1;
+    for (@nums) {
+        $ok=0 if --$count{$_}<0;
+    }
+    ok($ok,"Trie min count matches");
+}
+
+
+# TRIE related
+# LATIN SMALL/CAPITAL LETTER A WITH MACRON
+ok(("foba  \x{101}foo" =~ qr/(foo|\x{100}foo|bar)/i) && $1 eq "\x{101}foo",
+   "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH MACRON");
+
+# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW
+ok(("foba  \x{1E01}foo" =~ qr/(foo|\x{1E00}foo|bar)/i) && $1 eq "\x{1E01}foo",
+   "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW");
+
+# DESERET SMALL/CAPITAL LETTER LONG I
+ok(("foba  \x{10428}foo" =~ qr/(foo|\x{10400}foo|bar)/i) &&  $1 eq "\x{10428}foo",
+   "TRIEF + DESERET SMALL/CAPITAL LETTER LONG I");
+
+# LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'
+ok(("foba  \x{1E01}xfoo" =~ qr/(foo|\x{1E00}Xfoo|bar)/i) &&  $1 eq "\x{1E01}xfoo",
+   "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'");
+
+{# TRIE related
+
+use charnames ':full';
+
+$s="\N{LATIN SMALL LETTER SHARP S}";
+ok(("foba  ba$s" =~ qr/(foo|Ba$s|bar)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ ss");
+ok(("foba  ba$s" =~ qr/(Ba$s|foo|bar)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ ss");
+ok(("foba  ba$s" =~ qr/(foo|bar|Ba$s)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ ss");
+
+ok(("foba  ba$s" =~ qr/(foo|Bass|bar)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ ss");
+
+ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
+    &&  $1 eq "ba$s",
+   "TRIEF + LATIN SMALL LETTER SHARP S =~ SS");
+
+ok(("foba  ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i)
+    &&  $1 eq "ba${s}pxySS$s$s",
+   "COMMON PREFIX TRIEF + LATIN SMALL LETTER SHARP S");
+
+   
+}
+
+
+
+if (!$ENV{PERL_SKIP_PSYCHO_TEST}){
+    my @normal=qw(these are some normal words);
+    my $psycho=join "|",@normal,map chr $_,255..20000;
+    ok(('these'=~/($psycho)/) && $1 eq 'these','Pyscho');
+} else {
+    ok(1,'Skipped Psycho');
+}
+
+# [perl #36207] mixed utf8 / latin-1 and case folding
+
+{
+    my $utf8 = "\xe9\x{100}"; chop $utf8;
+    my $latin1 = "\xe9";
+
+    ok($utf8 =~ /\xe9/i, "utf8/latin");
+    ok($utf8 =~ /$latin1/i, "utf8/latin runtime");
+    ok($utf8 =~ /(abc|\xe9)/i, "utf8/latin trie");
+    ok($utf8 =~ /(abc|$latin1)/i, "utf8/latin trie runtime");
+
+    ok("\xe9" =~ /$utf8/i, "# TODO latin/utf8");
+    ok("\xe9" =~ /(abc|$utf8)/i, "# latin/utf8 trie");
+    ok($latin1 =~ /$utf8/i, "# TODO latin/utf8 runtime");
+    ok($latin1 =~ /(abc|$utf8)/i, "# latin/utf8 trie runtime");
+}
+
+# [perl #37038] Global regular matches generate invalid pointers
+
+{
+    my $s = "abcd";
+    $s =~ /(..)(..)/g;
+    $s = $1;
+    $s = $2;
+    ok($s eq 'cd',
+       "# assigning to original string should not corrupt match vars");
+}
+
+{
+    package wooosh;
+    sub gloople {
+      "!";
+    }
+    package main;
+    
+    my $aeek = bless {}, 'wooosh';
+    eval {$aeek->gloople() =~ /(.)/g;};
+    ok($@ eq "", "//g match against return value of sub") or print "# $@\n";
+}
+
+{
+    sub gloople {
+      "!";
+    }
+    eval {gloople() =~ /(.)/g;};
+    ok($@ eq "", "# 26410 didn't affect sub calls for some reason")
+       or print "# $@\n";
+}
+
+{
+    package lv;
+    $var = "abc";
+    sub variable : lvalue { $var }
+
+    package main;
+    my $o = bless [], "lv";
+    my $f = "";
+    eval { for (1..2) { $f .= $1 if $o->variable =~ /(.)/g } };
+    ok($f eq "ab", "pos retained between calls # TODO") or print "# $@\n";
+}
+
+{
+    $var = "abc";
+    sub variable : lvalue { $var }
+
+    my $f = "";
+    eval { for (1..2) { $f .= $1 if variable() =~ /(.)/g } };
+    ok($f eq "ab", "pos retained between calls # TODO") or print "# $@\n";
+}
+
+# [perl #37836] Simple Regex causes SEGV when run on specific data
+if ($ordA == 193) {
+    print "ok $test # Skip: in EBCDIC\n"; $test++;
+} else {
+    no warnings 'utf8';
+    $_ = pack('U0C2', 0xa2, 0xf8); # ill-formed UTF-8
+    my $ret = 0;
+    eval { $ret = s/[\0]+//g };
+    ok($ret == 0, "ill-formed UTF-8 doesn't match NUL in class");
+}
+
+{ # [perl #38293] chr(65535) should be allowed in regexes
+    no warnings 'utf8'; # to allow non-characters
+    my($c, $r, $s);
+
+    $c = chr 0xffff;
+    $c =~ s/$c//g;
+    ok($c eq "", "U+FFFF, parsed as atom");
+
+    $c = chr 0xffff;
+    $r = "\\$c";
+    $c =~ s/$r//g;
+    ok($c eq "", "U+FFFF backslashed, parsed as atom");
+
+    $c = chr 0xffff;
+    $c =~ s/[$c]//g;
+    ok($c eq "", "U+FFFF, parsed in class");
+
+    $c = chr 0xffff;
+    $r = "[\\$c]";
+    $c =~ s/$r//g;
+    ok($c eq "", "U+FFFF backslashed, parsed in class");
+
+    $s = "A\x{ffff}B";
+    $s =~ s/\x{ffff}//i;
+    ok($s eq "AB", "U+FFFF, EXACTF");
+
+    $s = "\x{ffff}A";
+    $s =~ s/\bA//;
+    ok($s eq "\x{ffff}", "U+FFFF, BOUND");
+
+    $s = "\x{ffff}!";
+    $s =~ s/\B!//;
+    ok($s eq "\x{ffff}", "U+FFFF, NBOUND");
+} # non-characters end
+
+{
+    # https://rt.perl.org/rt3/Ticket/Display.html?id=39583
+    
+    # The printing characters
+    my @chars = ("A".."Z");
+    my $delim = ",";
+    my $size = 32771 - 4;
+    my $str = '';
+
+    # create some random junk. Inefficient, but it works.
+    for ($i = 0 ; $i < $size ; $i++) {
+        $str .= $chars[int(rand(@chars))];
+    }
+
+    $str .= ($delim x 4);
+    my $res;
+    my $matched;
+    if ($str =~ s/^(.*?)${delim}{4}//s) {
+        $res = $1;
+        $matched=1;
+    } 
+    ok($matched,'pattern matches');
+    ok(length($str)==0,"Empty string");
+    ok(defined($res) && length($res)==$size,"\$1 is correct size");
+}
+
+{ # related to [perl #27940]
+    ok("\0-A"  =~ /\c@-A/, '@- should not be interpolated in a pattern');
+    ok("\0\0A" =~ /\c@+A/, '@+ should not be interpolated in a pattern');
+    ok("X\@-A"  =~ /X@-A/, '@- should not be interpolated in a pattern');
+    ok("X\@\@A" =~ /X@+A/, '@+ should not be interpolated in a pattern');
+
+    ok("X\0A" =~ /X\c@?A/,  '\c@?');
+    ok("X\0A" =~ /X\c@*A/,  '\c@*');
+    ok("X\0A" =~ /X\c@(A)/, '\c@(');
+    ok("X\0A" =~ /X(\c@)A/, '\c@)');
+    ok("X\0A" =~ /X\c@|ZA/, '\c@|');
+
+    ok("X\@A" =~ /X@?A/,  '@?');
+    ok("X\@A" =~ /X@*A/,  '@*');
+    ok("X\@A" =~ /X@(A)/, '@(');
+    ok("X\@A" =~ /X(@)A/, '@)');
+    ok("X\@A" =~ /X@|ZA/, '@|');
+
+    local $" = ','; # non-whitespace and non-RE-specific
+    ok('abc' =~ /(.)(.)(.)/, 'the last successful match is bogus');
+    ok("A@+B"  =~ /A@{+}B/,  'interpolation of @+ in /@{+}/');
+    ok("A@-B"  =~ /A@{-}B/,  'interpolation of @- in /@{-}/');
+    ok("A@+B"  =~ /A@{+}B/x, 'interpolation of @+ in /@{+}/x');
+    ok("A@-B"  =~ /A@{-}B/x, 'interpolation of @- in /@{-}/x');
+}
+
+{
+    use lib 'lib';
+    use Cname;
+    
+    ok('fooB'=~/\N{foo}[\N{B}\N{b}]/,"Passthrough charname");
+    $test=1233; my $handle=make_must_warn('Ignoring excess chars from');
+    $handle->('q(xxWxx) =~ /[\N{WARN}]/');
+    {
+        my $code;
+        my $w="";
+        local $SIG{__WARN__} = sub { $w.=shift };
+        eval($code=<<'EOFTEST') or die "$@\n$code\n";
+        {
+            use warnings;
+            
+            #1234
+            ok("\0" !~ /[\N{EMPTY-STR}XY]/,
+                "Zerolength charname in charclass doesnt match \0");
+            1;
+        }
+EOFTEST
+        ok($w=~/Ignoring zero length/,
+            "Got expected zero length warning");
+        warn $code;                    
+        
+    }
+    $handle= make_must_warn('Ignoring zero length');
+    $handle->('qq(\\0) =~ /[\N{EMPTY-STR}XY]/');
+    ok('AB'=~/(\N{EVIL})/ && $1 eq 'A',"Charname caching $1");
+    ok('ABC'=~/(\N{EVIL})/,"Charname caching $1");    
+    ok('xy'=~/x\N{EMPTY-STR}y/, 'Empty string charname produces NOTHING node');
+    ok(''=~/\N{EMPTY-STR}/, 'Empty string charname produces NOTHING node 2');
+        
+}
+{
+    print "# MORE LATIN SMALL LETTER SHARP S\n";
+
+    use charnames ':full';
+
+    #see also test #835
+    ok("ss" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i,
+        "unoptimized named sequence in class 1");
+    ok("SS" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i,
+        "unoptimized named sequence in class 2");        
+    ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/,
+        "unoptimized named sequence in class 3");
+    ok("\N{LATIN SMALL LETTER SHARP S}" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i,
+        "unoptimized named sequence in class 4");        
+    
+    ok('aabc' !~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against aabc');
+    ok('a+bc' =~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against a+bc');
+    ok('a+bc' =~ /a\N{PLUS SIGN}b/,'/a\N{PLUS SIGN}b/ against a+bc');
+
+    ok(' A B'=~/\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/,
+        'Intermixed named and unicode escapes 1');
+    ok("\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}"=~
+       /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/,
+        'Intermixed named and unicode escapes 2');
+    ok("\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042} 3"=~
+       /[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/,
+        'Intermixed named and unicode escapes');     
+}
+# Keep the following test last -- it may crash perl
+
+ok(("a" x (2**15 - 10)) =~ /^()(a|bb)*$/, "Recursive stack cracker: #24274")
+    or print "# Unexpected outcome: should pass or crash perl\n";
+
+# Don't forget to update this!
+BEGIN{print "1..1251\n"};
+