[perl5.git] / t / re / regexp.t

#!./perl

# The tests are in a separate file 't/re/re_tests'.
# Each line in that file is a separate test.
# There are five columns, separated by tabs, plus an optional sixth.
#
# Column 1 contains the pattern, optionally enclosed in C<''>.
# Modifiers can be put after the closing C<'>.
#
# Column 2 contains the string to be matched.
#
# Column 3 contains the expected result:
# 	y	expect a match
# 	n	expect no match
# 	c	expect an error
#	T	the test is a TODO (can be combined with y/n/c)
#	M	skip test on miniperl (combine with y/n/c/T)
#	B	test exposes a known bug in Perl, should be skipped
#	b	test exposes a known bug in Perl, should be skipped if noamp
#	t	test exposes a bug with threading, TODO if qr_embed_thr
#       s       test should only be run for regex_sets_compat.t
#       S       test should not be run for regex_sets_compat.t
#
# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
#
# Column 4 contains a string, usually C<$&>.
#
# Column 5 contains the expected result of double-quote
# interpolating that string after the match, or start of error message.
#
# Column 6, if present, contains a reason why the test is skipped.
# This is printed with "skipped", for harness to pick up.
#
# \n in the tests are interpolated, as are variables of the form ${\w+}.
#
# Blank lines and comment lines are treated as PASSING tests to keep the
# line numbers linked to the test number.
#
# If you want to add a regular expression test that can't be expressed
# in this format, don't add it here: put it in re/pat.t instead.
#
# Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
# This means this file cannot be used for testing anything that the lexer
# handles; in 5.12 this means just \N{NAME} and \N{U+...}.
#
# Note that columns 2, 4 and 5 are all enclosed in double quotes and then
# evalled; so something like a\"\x{100}$1 has length 3+length($1).

# $iters: number of times each match is repeated (first command-line
#         argument, defaulting to 1).
# $file:  optional path of the test-definition file (second command-line
#         argument); left undefined when not given.
my ($file, $iters);
BEGIN {
    $iters = shift || 1;	# Poor man performance suite, 10000 is OK.

    # Do this open before any chdir, so that a relative path is resolved
    # against the directory the script was started from.
    $file = shift;
    if (defined $file) {
        # Three-argument open: the name is used literally, so leading or
        # trailing mode characters ('>', '|', ...) in it are never
        # interpreted.  $! is reported so that a failure is diagnosable.
        open TESTS, '<', $file or die "Can't open $file: $!";
    }

    chdir 't' if -d 't';
    @INC = '../lib';

}

# Convert the given strings into TAP comment lines.  Every argument is broken
# up at newlines; each resulting line is returned newline-terminated, and is
# prefixed with "# " unless it already begins with '#'.
sub _comment {
    my @commented;
    for my $chunk (@_) {
        for my $line (split /\n/, $chunk) {
            my $prefix = $line =~ /^#/ ? "" : "# ";
            push @commented, $prefix . $line . "\n";
        }
    }
    return @commented;
}

use strict;
use warnings FATAL=>"all";
use vars qw($bang $ffff $nulnul); # used by the tests
use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers


# No test file was named on the command line, so read the standard one.
# (When a file was named, the BEGIN block has already opened TESTS.)
if (!defined $file) {
    open TESTS, '<', 're/re_tests' or die "Can't open re/re_tests: $!";
}

# Read every test definition up front: the plan line below needs the count.
my @tests = <TESTS>;

close TESTS;

# Values that the test definitions can refer to as ${bang}, ${ffff} and
# ${nulnul}.
$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
$ffff  = chr(0xff) x 2;
$nulnul = "\0" x 2;

# Match operator spliced into the generated code for the plain case.
my $OP = $qr ? 'qr' : 'm';

$| = 1;

# $iters originates from the command line, so it is passed as an argument
# rather than interpolated into the format, where a '%' would be parsed as a
# conversion.
printf "1..%d\n# %s iterations\n", scalar(@tests), $iters;

# Main driver: emits exactly one TAP result per line of the test file.
# $test is the 1-based number of the line being processed and doubles as the
# TAP test number.
my $test;
TEST:
foreach (@tests) {
    $test++;

    # Blank lines, comment lines and the __END__ marker each count as a
    # passing test, so that test numbers stay equal to line numbers.
    if (!/\S/ || /^\s*#/ || /^__END__$/) {
        print "ok $test # (Blank line or comment)\n";
        if (/#/) { print $_ };
        next;
    }
    chomp;
    s/\\n/\n/g unless $regex_sets;

    # Columns: pattern, subject, result flags, string to interpolate,
    # expected value, and an optional skip reason.
    my ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6);
    if (!defined $subject) {
        die "Bad test definition on line $test: $_\n";
    }
    $reason = '' unless defined $reason;

    # Unmodified description of the test, used in failure messages.
    my $input = join(':',$pat,$subject,$result,$repl,$expect);
    # the double '' below keeps simple syntax highlighters from going crazy
    $pat = "'$pat'" unless $pat =~ /^[:''\/]/; 

    # Expand ${name} references in the pattern.
    $pat =~ s/(\$\{\w+\})/$1/eeg;
    $pat =~ s/\\n/\n/g unless $regex_sets;

    # Subject and expected value are double-quote interpolated.
    $subject = eval qq("$subject"); die $@ if $@;
    $expect  = eval qq("$expect"); die $@ if $@;
    $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;

    # Decode the flag letters of column 3, removing each one that is handled
    # so that only y, n or c is left in $result.
    #   t      - reported as TODO, but only looked at under $qr_embed_thr
    #   B      - skip (b as well when $skip_amp is set)
    #   M      - skip when DynaLoader is unavailable (miniperl)
    #   S / s  - skip when running / not running as the regex-sets variant
    #   T      - TODO
    my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
    my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
    ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
    if ($result =~ s/ ( [Ss] ) //x) {
        if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) {
            $skip++;
            $reason = "Test not valid for $0";
        }
    }
    $reason = 'skipping $&' if $reason eq  '' && $skip_amp;
    $result =~ s/B//i unless $skip;
    my $todo= $result =~ s/T// ? " # TODO" : "";
    if (! $skip && $regex_sets) {

        # If testing regex sets, change the [bracketed] classes into
        # (?[bracketed]).

        if ($pat !~ / \[ /x) {

            $skip++;
            $reason = "Pattern doesn't contain [brackets]";
        }
        else { # Use non-regex features of Perl to accomplish this.
            my $modified = "";
            my $in_brackets = 0;

            # Go through the pattern character-by-character.  We also add
            # blanks around each token to test the /x parts of (?[ ])
            my $pat_len = length($pat);
      CHAR: for (my $i = 0; $i < $pat_len; $i++) {
                my $curchar = substr($pat, $i, 1);
                if ($curchar eq '\\') {
                    $modified .= " " if $in_brackets;
                    $modified .= $curchar;
                    $i++;

                    # Get the character the backslash is escaping
                    $curchar = substr($pat, $i, 1);
                    $modified .= $curchar;

                    # If the character following that is a '{}', treat the
                    # entire amount as a single token
                    if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
                        my $j = index($pat, '}', $i+2);
                        if ($j < 0) {

                            # No closing brace.  Outside a bracketed class
                            # simply stop converting; the part of the
                            # pattern after this point is dropped.
                            last unless $in_brackets;
                            if ($result eq 'c') {
                                $skip++;
                                $reason = "Can't handle compilation errors with unmatched '{'";

                                # Stop scanning here.  Carrying on with $j
                                # equal to -1 would set $i to -1 below, and
                                # the loop increment would then restart the
                                # scan at offset 0 with $in_brackets still
                                # set, which can repeat without end.
                                last CHAR;
                            }
                            else {
                                print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n";
                                next TEST;
                            }
                        }

                        # Copy everything from the '{' through the '}'.
                        $modified .= substr($pat, $i+1, $j - $i);
                        $i = $j;
                    }
                    elsif ($curchar eq 'x') {

                        # \x without brackets is supposed to be followed by 2
                        # hex digits.  Take up to 2, and then add a blank
                        # after the last one.  This avoids getting errors from
                        # (?[ ]) for run-ons, like \xabc
                        #
                        # NOTE(review): because of the '$j++' before 'last',
                        # a non-hex character found within those two
                        # positions is itself copied (ahead of the blank) and
                        # is not looked at again by the scanner; confirm that
                        # this is intended.
                        my $j = $i + 1;
                        for (; $j < $i + 3 && $j < $pat_len; $j++) {
                            my $curord = ord(substr($pat, $j, 1));
                            if (!(($curord >= ord("A") && $curord <= ord("F"))
                                 || ($curord >= ord("a") && $curord <= ord("f"))
                                 || ($curord >= ord("0") && $curord <= ord("9"))))
                            {
                                $j++;
                                last;
                            }
                        }
                        $j--;
                        $modified .= substr($pat, $i + 1, $j - $i) . " ";
                        $i = $j;
                    }
                    elsif (ord($curchar) >= ord('0')
                           && (ord($curchar) <= ord('7')))
                    {
                        # Similarly, octal constants have up to 3 digits.
                        # (The digit just copied is the first; up to two
                        # more are taken here, with no blank appended.)
                        my $j = $i + 1;
                        for (; $j < $i + 3 && $j < $pat_len; $j++) {
                            my $curord = ord(substr($pat, $j, 1));
                            if (! ($curord >= ord("0") &&  $curord <= ord("7"))) {
                                $j++;
                                last;
                            }
                        }
                        $j--;
                        $modified .= substr($pat, $i + 1, $j - $i);
                        $i = $j;
                    }

                    next;
                } # End of processing a backslash sequence

                if (! $in_brackets  # Skip (?{ })
                    && $curchar eq '('
                    && $i < $pat_len - 2
                    && substr($pat, $i+1, 1) eq '?'
                    && substr($pat, $i+2, 1) eq '{')
                {
                    $skip++;
                    $reason = "Pattern contains '(?{'";
                    last;
                }

                # Closing ']'
                if ($curchar eq ']' && $in_brackets) {
                    $modified .= " ] ])";
                    $in_brackets = 0;
                    next;
                }

                # A regular character.
                if ($curchar ne '[') {
                    if (! $in_brackets) {
                        $modified .= $curchar;
                    }
                    else {
                        $modified .= " $curchar ";
                    }
                    next;
                }

                # Here is a '['; If not in a bracketed class, treat as the
                # beginning of one.
                if (! $in_brackets) {
                    $in_brackets = 1;
                    $modified .= "(?[ [ ";

                    # An immediately following ']' or '^]' is not the ending
                    # of the class, but is to be treated literally.
                    if ($i < $pat_len - 1
                        && substr($pat, $i+1, 1) eq ']')
                    {
                        $i ++;
                        $modified .= " ] ";
                    }
                    elsif ($i < $pat_len - 2
                            && substr($pat, $i+1, 1) eq '^'
                            && substr($pat, $i+2, 1) eq ']')
                    {
                        $i += 2;
                        $modified .= " ^ ] ";
                    }
                    next;
                }

                # Here is a plain '[' within [ ].  Could mean wants to
                # match a '[', or it could be a posix class that has a
                # corresponding ']'.  Absorb either

                $modified .= ' [';
                last if $i >= $pat_len - 1;

                $i++;
                $curchar = substr($pat, $i, 1);
                if ($curchar =~ /[:=.]/) {

                    # Look for a ']' that is directly preceded by the same
                    # punctuation character that followed the '['.
                    for (my $j = $i + 1; $j < $pat_len; $j++) {
                        next unless substr($pat, $j, 1) eq ']';
                        last if $j - $i < 2;
                        if (substr($pat, $j - 1, 1) eq $curchar) {
                            # Here, is a posix class
                            $modified .= substr($pat, $i, $j - $i + 1) . " ";
                            $i = $j;
                            next CHAR;
                        }
                    }
                }

                # Here wasn't a posix class, just process normally
                $modified .= " $curchar ";
            }

            # The scan finished (or was abandoned) inside a class without the
            # test having been marked as skipped: the conversion failed.
            if ($in_brackets && ! $skip) {
                if ($result eq 'c') {
                    $skip++;
                    $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
                }
                else {
                    print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n";
                    next TEST;
                }
            }

            # Use our modified pattern instead of the original
            $pat = $modified;
        }
    }

    # Run the test once per preparation of the subject: as is, studied,
    # upgraded to utf8, and upgraded then studied.
    for my $study ('', 'study $subject', 'utf8::upgrade($subject)',
		   'utf8::upgrade($subject); study $subject') {
	# Need to make a copy, else the utf8::upgrade of an already studied
	# scalar confuses things.
	my $subject = $subject;
	my $c = $iters;
	my ($code, $match, $got);

	# Build the source text of the test.  $study, $pat, $OP and $repl are
	# interpolated now; the backslashed variables are left for the eval.
        if ($repl eq 'pos') {
            $code= <<EOFCODE;
                $study;
                pos(\$subject)=0;
                \$match = ( \$subject =~ m${pat}g );
                \$got = pos(\$subject);
EOFCODE
        }
        elsif ($qr_embed) {
            $code= <<EOFCODE;
                my \$RE = qr$pat;
                $study;
                \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
                \$got = "$repl";
EOFCODE
        }
        elsif ($qr_embed_thr) {
            $code= <<EOFCODE;
		# Can't run the match in a subthread, but can do this and
	 	# clone the pattern the other way.
                my \$RE = threads->new(sub {qr$pat})->join();
                $study;
                \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
                \$got = "$repl";
EOFCODE
        }
        else {
            $code= <<EOFCODE;
                $study;
                \$match = (\$subject =~ $OP$pat) while \$c--;
                \$got = "$repl";
EOFCODE
        }
        $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
        #$code.=qq[\n\$expect="$expect";\n];
        #use Devel::Peek;
        #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
	{
	    # Probably we should annotate specific tests with which warnings
	    # categories they're known to trigger, and hence should be
	    # disabled just for that test
	    no warnings qw(uninitialized regexp);
	    eval $code;
	}
	chomp( my $err = $@ );

	# Note that the generated code has already been evaluated above by
	# the time a skip is reported.
	if ( $skip ) {
	    print "ok $test # skipped", length($reason) ? ".  $reason" : '', "\n";
	    next TEST;
	}
	elsif ($result eq 'c') {
	    # A compile error is expected: its message must start with $expect.
	    if ($err !~ m!^\Q$expect!) { print "not ok $test$todo (compile) $input => '$err'\n"; next TEST }
	    last;  # no need to study a syntax error
	}
	elsif ( $todo_qr ) {
	    print "not ok $test # TODO", length($reason) ? " - $reason" : '', "\n";
	    next TEST;
	}
	elsif ($@) {
	    # The generated code died although no error was expected.
	    print "not ok $test$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
	}
	elsif ($result =~ /^n/) {
	    if ($match) { print "not ok $test$todo ($study) $input => false positive\n"; next TEST }
	}
	else {
	    # A match is expected, and the interpolated string must equal
	    # $expect.
	    if (!$match || $got ne $expect) {
	        eval { require Data::Dumper };
                no warnings "utf8"; # But handle should be utf8
		if ($@ || !defined &DynaLoader::boot_DynaLoader) {
		    # Data::Dumper will load on miniperl, but fail when used in
		    # anger as it tries to load B. I'd prefer to keep the
		    # regular calls below outside of an eval so that real
		    # (unknown) failures get spotted, not ignored.
		    print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
		}
		else { # better diagnostics
		    my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
		    my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
		    print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n");
		}
		next TEST;
	    }
	}
    }

    # Every variant behaved as expected.
    print "ok $test$todo\n";
}

1;
Commit	Line	Data
378cc40b LW	1	#!./perl
378cc40b LW	2
ae34ee58	3	# The tests are in a separate file 't/re/re_tests'.
ad4f75a6 HM	4	# Each line in that file is a separate test.
	5	# There are five columns, separated by tabs.
	6	#
	7	# Column 1 contains the pattern, optionally enclosed in C<''>.
	8	# Modifiers can be put after the closing C<'>.
	9	#
	10	# Column 2 contains the string to be matched.
	11	#
	12	# Column 3 contains the expected result:
	13	# y expect a match
	14	# n expect no match
	15	# c expect an error
24d786f4	16	# T the test is a TODO (can be combined with y/n/c)
cb6fa888	17	# M skip test on miniperl (combine with y/n/c/T)
cf93c79d IZ	18	# B test exposes a known bug in Perl, should be skipped
cf93c79d IZ	19	# b test exposes a known bug in Perl, should be skipped if noamp
e3faa678	20	# t test exposes a bug with threading, TODO if qr_embed_thr
073b366a KW	21	# s test should only be run for regex_sets_compat.t
073b366a KW	22	# S test should not be run for regex_sets_compat.t
ad4f75a6	23	#
1b1626e4	24	# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
ad4f75a6 HM	25	#
	26	# Column 4 contains a string, usually C<$&>.
	27	#
	28	# Column 5 contains the expected result of double-quote
c277df42 IZ	29	# interpolating that string after the match, or start of error message.
c277df42 IZ	30	#
ee595aa6 LC	31	# Column 6, if present, contains a reason why the test is skipped.
	32	# This is printed with "skipped", for harness to pick up.
	33	#
9d116dd7	34	# \n in the tests are interpolated, as are variables of the form ${\w+}.
83e898de	35	#
b9b4dddf YO	36	# Blanks lines are treated as PASSING tests to keep the line numbers
	37	# linked to the test number.
	38	#
8d37f932	39	# If you want to add a regular expression test that can't be expressed
67a2b8c6	40	# in this format, don't add it here: put it in re/pat.t instead.
b2a156bd	41	#
ff3f963a KW	42	# Note that the inputs get passed on as "m're'", so the re bypasses the lexer.
	43	# This means this file cannot be used for testing anything that the lexer
	44	# handles; in 5.12 this means just \N{NAME} and \N{U+...}.
	45	#
b2a156bd DM	46	# Note that columns 2,3 and 5 are all enclosed in double quotes and then
b2a156bd DM	47	# evalled; so something like a\"\x{100}$1 has length 3+length($1).
c277df42	48
7e1dab6a	49	my ($file, $iters);
e4d48cc9	50	BEGIN {
1a610890 NC	51	$iters = shift \|\| 1; # Poor man performance suite, 10000 is OK.
	52
	53	# Do this open before any chdir
	54	$file = shift;
	55	if (defined $file) {
	56	open TESTS, $file or die "Can't open $file";
	57	}
	58
e4d48cc9	59	chdir 't' if -d 't';
20822f61	60	@INC = '../lib';
e3faa678	61
e4d48cc9	62	}
1a610890	63
1b7228c9 KW	64	sub _comment {
	65	return map { /^#/ ? "$_\n" : "# $_\n" }
	66	map { split /\n/ } @_;
	67	}
	68
1286eaeb	69	use strict;
66fb63c1	70	use warnings FATAL=>"all";
7e1dab6a	71	use vars qw($bang $ffff $nulnul); # used by the tests
073b366a KW	72	use vars qw($qr $skip_amp $qr_embed $qr_embed_thr $regex_sets); # set by our callers
073b366a KW	73
e4d48cc9	74
ad4f75a6	75
1a610890	76	if (!defined $file) {
7e1dab6a	77	open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
1a610890 NC	78	}
	79
	80	my @tests = <TESTS>;
cfa4f241	81
1a610890	82	close TESTS;
378cc40b	83
9d116dd7	84	$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable.
b8c5462f JH	85	$ffff = chr(0xff) x 2;
b8c5462f JH	86	$nulnul = "\0" x 2;
7e1dab6a	87	my $OP = $qr ? 'qr' : 'm';
9d116dd7	88
1462b684	89	$\| = 1;
1a610890	90	printf "1..%d\n# $iters iterations\n", scalar @tests;
e3faa678	91
1a610890	92	my $test;
cfa4f241	93	TEST:
1a610890 NC	94	foreach (@tests) {
1a610890 NC	95	$test++;
5a51db05	96	if (!/\S/ \|\| /^\s*#/ \|\| /^__END__$/) {
1a610890	97	print "ok $test # (Blank line or comment)\n";
5a51db05	98	if (/#/) { print $_ };
b9b4dddf YO	99	next;
b9b4dddf YO	100	}
b85d18e9	101	chomp;
073b366a	102	s/\\n/\n/g unless $regex_sets;
1286eaeb	103	my ($pat, $subject, $result, $repl, $expect, $reason) = split(/\t/,$_,6);
b8f6efdd YO	104	if (!defined $subject) {
	105	die "Bad test definition on line $test: $_\n";
	106	}
66fb63c1	107	$reason = '' unless defined $reason;
1286eaeb	108	my $input = join(':',$pat,$subject,$result,$repl,$expect);
24d786f4 YO	109	# the double '' below keeps simple syntax highlighters from going crazy
24d786f4 YO	110	$pat = "'$pat'" unless $pat =~ /^[:''\/]/;
9d116dd7	111	$pat =~ s/(\$\{\w+\})/$1/eeg;
073b366a	112	$pat =~ s/\\n/\n/g unless $regex_sets;
1a610890 NC	113	$subject = eval qq("$subject"); die $@ if $@;
1a610890 NC	114	$expect = eval qq("$expect"); die $@ if $@;
c277df42	115	$expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/;
24d786f4	116	my $todo_qr = $qr_embed_thr && ($result =~ s/t//);
1286eaeb	117	my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
cb6fa888	118	++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader;
073b366a KW	119	if ($result =~ s/ ( [Ss] ) //x) {
	120	if (($1 eq 'S' && $regex_sets) \|\| ($1 eq 's' && ! $regex_sets)) {
	121	$skip++;
	122	$reason = "Test not valid for $0";
	123	}
	124	}
906e884f	125	$reason = 'skipping $&' if $reason eq '' && $skip_amp;
cf93c79d	126	$result =~ s/B//i unless $skip;
24d786f4	127	my $todo= $result =~ s/T// ? " # TODO" : "";
073b366a KW	128	if (! $skip && $regex_sets) {
	129
	130	# If testing regex sets, change the [bracketed] classes into
	131	# (?[bracketed]).
	132
	133	if ($pat !~ / \[ /x) {
	134
	135	$skip++;
	136	$reason = "Pattern doesn't contain [brackets]";
	137	}
	138	else { # Use non-regex features of Perl to accomplish this.
	139	my $modified = "";
	140	my $in_brackets = 0;
	141
	142	# Go through the pattern character-by-character. We also add
	143	# blanks around each token to test the /x parts of (?[ ])
	144	my $pat_len = length($pat);
	145	CHAR: for (my $i = 0; $i < $pat_len; $i++) {
	146	my $curchar = substr($pat, $i, 1);
	147	if ($curchar eq '\\') {
	148	$modified .= " " if $in_brackets;
	149	$modified .= $curchar;
	150	$i++;
	151
	152	# Get the character the backslash is escaping
	153	$curchar = substr($pat, $i, 1);
	154	$modified .= $curchar;
	155
	156	# If the character following that is a '{}', treat the
	157	# entire amount as a single token
	158	if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') {
	159	my $j = index($pat, '}', $i+2);
	160	if ($j < 0) {
	161	last unless $in_brackets;
	162	if ($result eq 'c') {
	163	$skip++;
	164	$reason = "Can't handle compilation errors with unmatched '{'";
	165	}
	166	else {
	167	print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n";
	168	next TEST;
	169	}
	170	}
	171	$modified .= substr($pat, $i+1, $j - $i);
	172	$i = $j;
	173	}
	174	elsif ($curchar eq 'x') {
	175
	176	# \x without brackets is supposed to be followed by 2
	177	# hex digits. Take up to 2, and then add a blank
	178	# after the last one. This avoids getting errors from
	179	# (?[ ]) for run-ons, like \xabc
	180	my $j = $i + 1;
	181	for (; $j < $i + 3 && $j < $pat_len; $j++) {
	182	my $curord = ord(substr($pat, $j, 1));
	183	if (!(($curord >= ord("A") && $curord <= ord("F"))
	184	\|\| ($curord >= ord("a") && $curord <= ord("f"))
	185	\|\| ($curord >= ord("0") && $curord <= ord("9"))))
	186	{
	187	$j++;
	188	last;
	189	}
	190	}
	191	$j--;
192	$modified .= substr($pat, $i + 1, $j - $i) . " ";
193	$i = $j;
194	}
195	elsif (ord($curchar) >= ord('0')
196	&& (ord($curchar) <= ord('7')))
197	{
198	# Similarly, octal constants have up to 3 digits.
199	my $j = $i + 1;
200	for (; $j < $i + 3 && $j < $pat_len; $j++) {
201	my $curord = ord(substr($pat, $j, 1));
202	if (! ($curord >= ord("0") && $curord <= ord("7"))) {
203	$j++;
204	last;
205	}
206	}
207	$j--;
208	$modified .= substr($pat, $i + 1, $j - $i);
209	$i = $j;
210	}
211
212	next;
213	} # End of processing a backslash sequence
214
215	if (! $in_brackets # Skip (?{ })
216	&& $curchar eq '('
217	&& $i < $pat_len - 2
218	&& substr($pat, $i+1, 1) eq '?'
219	&& substr($pat, $i+2, 1) eq '{')
220	{
221	$skip++;
222	$reason = "Pattern contains '(?{'";
223	last;
224	}
225
226	# Closing ']'
227	if ($curchar eq ']' && $in_brackets) {
228	$modified .= " ] ])";
229	$in_brackets = 0;
230	next;
231	}
232
233	# A regular character.
234	if ($curchar ne '[') {
235	if (! $in_brackets) {
236	$modified .= $curchar;
237	}
238	else {
239	$modified .= " $curchar ";
240	}
241	next;
242	}
243
244	# Here is a '['; If not in a bracketed class, treat as the
245	# beginning of one.
246	if (! $in_brackets) {
247	$in_brackets = 1;
248	$modified .= "(?[ [ ";
249
250	# An immediately following ']' or '^]' is not the ending
251	# of the class, but is to be treated literally.
252	if ($i < $pat_len - 1
253	&& substr($pat, $i+1, 1) eq ']')
254	{
255	$i ++;
256	$modified .= " ] ";
257	}
258	elsif ($i < $pat_len - 2
259	&& substr($pat, $i+1, 1) eq '^'
260	&& substr($pat, $i+2, 1) eq ']')
261	{
262	$i += 2;
263	$modified .= " ^ ] ";
264	}
265	next;
266	}
267
268	# Here is a plain '[' within [ ]. Could mean wants to
269	# match a '[', or it could be a posix class that has a
270	# corresponding ']'. Absorb either
271
272	$modified .= ' [';
273	last if $i >= $pat_len - 1;
274
275	$i++;
276	$curchar = substr($pat, $i, 1);
277	if ($curchar =~ /[:=.]/) {
278	for (my $j = $i + 1; $j < $pat_len; $j++) {
279	next unless substr($pat, $j, 1) eq ']';
280	last if $j - $i < 2;
281	if (substr($pat, $j - 1, 1) eq $curchar) {
282	# Here, is a posix class
283	$modified .= substr($pat, $i, $j - $i + 1) . " ";
284	$i = $j;
285	next CHAR;
286	}
287	}
288	}
289
290	# Here wasn't a posix class, just process normally
291	$modified .= " $curchar ";
292	}
293
294	if ($in_brackets && ! $skip) {
295	if ($result eq 'c') {
296	$skip++;
297	$reason = "Can't figure out where to put the (?[ and ]) since is a compilation error";
298	}
299	else {
300	print "not ok $test # Problem in $0; original = '$pat'; mod = '$modified'\n";
301	next TEST;
302	}
303	}
304
305	# Use our modified pattern instead of the original
306	$pat = $modified;
307	}
308	}
1de06328	309
52e33015 NC	310	for my $study ('', 'study $subject', 'utf8::upgrade($subject)',
52e33015 NC	311	'utf8::upgrade($subject); study $subject') {
93f09d7b	312	# Need to make a copy, else the utf8::upgrade of an already studied
52e33015 NC	313	# scalar confuses things.
52e33015 NC	314	my $subject = $subject;
1286eaeb NC	315	my $c = $iters;
1286eaeb NC	316	my ($code, $match, $got);
1de06328 YO	317	if ($repl eq 'pos') {
	318	$code= <<EOFCODE;
	319	$study;
	320	pos(\$subject)=0;
	321	\$match = ( \$subject =~ m${pat}g );
	322	\$got = pos(\$subject);
	323	EOFCODE
	324	}
	325	elsif ($qr_embed) {
	326	$code= <<EOFCODE;
	327	my \$RE = qr$pat;
	328	$study;
	329	\$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
	330	\$got = "$repl";
	331	EOFCODE
	332	}
e3faa678 NC	333	elsif ($qr_embed_thr) {
	334	$code= <<EOFCODE;
	335	# Can't run the match in a subthread, but can do this and
	336	# clone the pattern the other way.
	337	my \$RE = threads->new(sub {qr$pat})->join();
	338	$study;
	339	\$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
	340	\$got = "$repl";
	341	EOFCODE
	342	}
1de06328 YO	343	else {
	344	$code= <<EOFCODE;
	345	$study;
1286eaeb	346	\$match = (\$subject =~ $OP$pat) while \$c--;
1de06328 YO	347	\$got = "$repl";
	348	EOFCODE
	349	}
073b366a	350	$code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
e1d1eefb YO	351	#$code.=qq[\n\$expect="$expect";\n];
	352	#use Devel::Peek;
	353	#die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
66fb63c1 NC	354	{
	355	# Probably we should annotate specific tests with which warnings
	356	# categories they're known to trigger, and hence should be
	357	# disabled just for that test
	358	no warnings qw(uninitialized regexp);
	359	eval $code;
	360	}
1286eaeb	361	chomp( my $err = $@ );
565b86e2	362	if ( $skip ) {
3c6cc85e	363	print "ok $test # skipped", length($reason) ? ". $reason" : '', "\n";
ee595aa6	364	next TEST;
cf93c79d	365	}
565b86e2 KW	366	elsif ($result eq 'c') {
	367	if ($err !~ m!^\Q$expect!) { print "not ok $test$todo (compile) $input => '$err'\n"; next TEST }
	368	last; # no need to study a syntax error
	369	}
24d786f4	370	elsif ( $todo_qr ) {
e0892690	371	print "not ok $test # TODO", length($reason) ? " - $reason" : '', "\n";
e3faa678 NC	372	next TEST;
e3faa678 NC	373	}
c277df42	374	elsif ($@) {
2fe1f0f5	375	print "not ok $test$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST;
c277df42	376	}
e3faa678	377	elsif ($result =~ /^n/) {
24d786f4	378	if ($match) { print "not ok $test$todo ($study) $input => false positive\n"; next TEST }
378cc40b LW	379	}
378cc40b LW	380	else {
cfa4f241	381	if (!$match \|\| $got ne $expect) {
cde0cee5	382	eval { require Data::Dumper };
969c44e7	383	no warnings "utf8"; # But handle should be utf8
65016092 NC	384	if ($@ \|\| !defined &DynaLoader::boot_DynaLoader) {
	385	# Data::Dumper will load on miniperl, but fail when used in
	386	# anger as it tries to load B. I'd prefer to keep the
	387	# regular calls below outside of an eval so that real
	388	# (unknown) failures get spotted, not ignored.
2fe1f0f5	389	print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$code\n");
cde0cee5 YO	390	}
	391	else { # better diagnostics
	392	my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump;
	393	my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump;
2fe1f0f5	394	print "not ok $test$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$g\n$code\n");
cde0cee5	395	}
cfa4f241 CS	396	next TEST;
cfa4f241 CS	397	}
378cc40b LW	398	}
378cc40b LW	399	}
24d786f4	400	print "ok $test$todo\n";
378cc40b	401	}
cfa4f241	402
1a610890	403	1;