[perl5.git] / t / re / charset.t

# Test the /a, /d, etc regex modifiers

BEGIN {
    chdir 't' if -d 't';
    require './test.pl';
    require './loc_tools.pl';
    set_up_inc('../lib', '../dist/if');
}

use strict;
use warnings;
no warnings 'locale';   # Some /l tests use above-latin1 chars to make sure
                        # they work, even though they warn.
use Config;

plan('no_plan');

# Each case is a valid element of its hash key.  Choose, where available, an
# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
my %testcases = (
    '\w' => [ ord("A"), utf8::unicode_to_native(0xE2), 0x16B ],   # Below expects these to all be alpha
    '\d' => [ ord("0"), 0x0662 ],
    '\s' => [ ord("\t"), utf8::unicode_to_native(0xA0), 0x1680 ],  # Below expects these to be [:blank:]
    '[:cntrl:]' => [ utf8::unicode_to_native(0x00), utf8::unicode_to_native(0x88) ],
    '[:graph:]' => [ ord("&"), utf8::unicode_to_native(0xF7), 0x02C7 ], # Below expects these to be
                                                                     # [:print:]
    '[:lower:]' => [ ord("g"), utf8::unicode_to_native(0xE3), 0x0127 ],
    '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), ord('>'), ord('|'), ord('-'), ord(','), ord(';'), ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), ord('"'), ord('('), ord(')'), ord('['), ord(']'), ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), ord("'"), utf8::unicode_to_native(0xBF), 0x055C ],
    '[:upper:]' => [ ord("G"), utf8::unicode_to_native(0xC3), 0x0126 ],
    '[:xdigit:]' => [ ord("4"), 0xFF15 ],
);

$testcases{'[:digit:]'} = $testcases{'\d'};
$testcases{'[:alnum:]'} = $testcases{'\w'};
$testcases{'[:alpha:]'} = $testcases{'\w'};
$testcases{'[:blank:]'} = $testcases{'\s'};
$testcases{'[:print:]'} = $testcases{'[:graph:]'};
$testcases{'[:space:]'} = $testcases{'\s'};
$testcases{'[:word:]'} = $testcases{'\w'};

my $utf8_locale;

my @charsets = qw(a d u aa);
my $locales_ok = eval { locales_enabled('LC_CTYPE'); 1 };
if (! is_miniperl() && $locales_ok) {
    require POSIX;
    my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
    if ($current_locale eq 'C') {

        # test for d_setlocale is repeated here because this one is compile
        # time, and the one above is run time
        use if $Config{d_setlocale}, 'locale';

        # Some implementations don't have the 128-255 range characters all
        # mean nothing under the C locale (an example being VMS).  This is
        # legal, but since we don't know what the right answers should be,
        # skip the locale tests in that situation.
        for my $i (128 .. 255) {
            goto skip_adding_C_locale
                              if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/;
        }
        push @charsets, 'l';

    skip_adding_C_locale:

        # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
        $utf8_locale = find_utf8_ctype_locale();
        push @charsets, 'L' if defined $utf8_locale;
    }
}

# For each possible character set...
foreach my $charset (@charsets) {
    my $locale;
    my $charset_mod = lc $charset;
    my $charset_display;
    if ($charset_mod eq 'l') {
        $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
                                            ? "C"
                                            : $utf8_locale
                           );
        die "Couldn't change locale" unless $locale;
        $charset_display = $charset_mod . " ($locale)";
    }
    else {
        $charset_display = $charset_mod;
    }

    # And in utf8 or not
    foreach my $upgrade ("", 'utf8::upgrade($a); ') {

        # reverse gets the, \w, \s, \d first.
        for my $class (reverse sort keys %testcases) {

            # The complement of \w is \W; of [:posix:] is [:^posix:]
            my $complement = $class;
            if ($complement !~ s/ ( \[: ) /$1^/x) {
                $complement = uc($class);
            }

            # For each test case
            foreach my $ord (@{$testcases{$class}}) {
                my $char = chr($ord);
                $char = ($char eq '$') ? '\$' : display($char);

                # > 255 already implies upgraded.  Skip the ones that don't
                # have an explicit upgrade.  This shows more clearly in the
                # output which tests are in utf8, or not.
                next if $ord > 255 && ! $upgrade;

                my $reason = "";    # Explanation output with each test
                my $neg_reason = "";
                my $match = 1;      # Calculated whether test regex should
                                    # match or not

                # Everything always matches in ASCII, or under /u, or under /l
                # with a UTF-8 locale
                if (utf8::native_to_unicode($ord) < 128
                    || $charset eq 'u'
                    || $charset eq 'L')
                {
                    $reason = "\"$char\" is a $class under /$charset_display";
                    $neg_reason = "\"$char\" is not a $complement under /$charset_display";
                }
                elsif ($charset eq "a" || $charset eq "aa") {
                    $match = 0;
                    $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display";
                    $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display";
                }
                elsif ($ord > 255) {
                    $reason = "\"$char\" is a $class under /$charset_display";
                    $neg_reason = "\"$char\" is not a $complement under /$charset_display";
                }
                elsif ($charset eq 'l') {

                    # We are using the C locale, which is essentially ASCII,
                    # but under utf8, the above-latin1 chars are treated as
                    # Unicode)
                    $reason = "\"$char\" is not a $class in the C locale under /$charset_mod";
                    $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod";
                    $match = 0;
                }
                elsif ($upgrade) {
                    $reason = "\"$char\" is a $class in utf8 under /$charset_display";
                    $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display";
                }
                else {
                    $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display";
                    $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)";
                    $match = 0;
                }
                $reason = "; $reason" if $reason;
                $neg_reason = "; $neg_reason" if $neg_reason;

                my $op;
                my $neg_op;
                if ($match) {
                    $op = '=~';
                    $neg_op = '!~';
                }
                else {
                    $op = '!~';
                    $neg_op = '=~';
                }

                # In [...] or not
                foreach my $bracketed (0, 1) {
                    my $lb = "";
                    my $rb = "";
                    if ($bracketed) {

                        # Adds an extra char to the character class to make sure
                        # that the class doesn't get optimized away.
                        $lb = ($bracketed) ? '[_' : "";
                        $rb = ($bracketed) ? ']' : "";
                    }
                    else {  # [:posix:] must be inside outer [ ]
                        next if $class =~ /\[/;
                    }

                    my $length = 10;    # For regexec.c regrepeat() cases by
                                        # matching more than one item
                    # Test both class and its complement, and with one or more
                    # than one item to match.
                    foreach my $eval (
                        qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
                        qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
                    ) {
                        ok (eval $eval, $eval . $reason);
                    }
                    foreach my $eval (
                        qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
                        qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
                    ) {
                        ok (eval $eval, $eval . $neg_reason);
                    }
                }

                next if $class ne '\w';

                # Test \b, \B at beginning and end of string
                foreach my $eval (
                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
                ) {
                    ok (eval $eval, $eval . $reason);
                }
                foreach my $eval (
                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
                ) {
                    ok (eval $eval, $eval . $neg_reason);
                }

                # Test \b, \B adjacent to a non-word char, both before it and
                # after.  We test with ASCII, Latin1 and Unicode non-word chars
                foreach my $space_ord (@{$testcases{'\s'}}) {

                    # Useless to try to test non-utf8 when the ord itself
                    # forces utf8
                    next if $space_ord > 255 && ! $upgrade;

                    my $space = display(chr $space_ord);

                    foreach my $eval (
                        qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
                        qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
                    ) {
                        ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
                    }
                    foreach my $eval (
                        qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
                        qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
                    ) {
                        ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
                    }
                }

                # Test \b, \B in the middle of two nominally word chars, but
                # one or both may be considered non-word depending on range
                # and charset.
                foreach my $other_ord (@{$testcases{'\w'}}) {
                    next if $other_ord > 255 && ! $upgrade;
                    my $other = display(chr $other_ord);

                    # Determine if the other char is a word char in current
                    # circumstances
                    my $other_is_word = 1;
                    my $other_reason = "\"$other\" is a $class under /$charset_display";
                    my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
                    if (utf8::native_to_unicode($other_ord) > 127
                        && $charset ne 'u' && $charset ne 'L'
                        && (($charset eq "a" || $charset eq "aa")
                            || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
                    {
                        $other_is_word = 0;
                        $other_reason = "\"$other\" is not a $class under /$charset_display";
                        $other_neg_reason = "\"$other\" is a $complement under /$charset_display";
                    }
                    my $both_reason = $reason;
                    $both_reason .= "; $other_reason" if $other_ord != $ord;
                    my $both_neg_reason = $neg_reason;
                    $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;

                    # If both are the same wordness, then \b will fail; \B
                    # succeed
                    if ($match == $other_is_word) {
                        $op = '!~';
                        $neg_op = '=~';
                    }
                    else {
                        $op = '=~';
                        $neg_op = '!~';
                    }

                    foreach my $eval (
                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
                    ) {
                        ok (eval $eval, $eval . $both_reason);
                    }
                    foreach my $eval (
                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
                    ) {
                        ok (eval $eval, $eval . $both_neg_reason);
                    }

                    next if $other_ord == $ord;

                    # These start with the \b or \B.  They are included, based
                    # on source code analysis, to force the testing of the FBC
                    # (find_by_class) portions of regexec.c.
                    foreach my $eval (
                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
                    ) {
                        ok (eval $eval, $eval . $both_reason);
                    }
                    foreach my $eval (
                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
                    ) {
                        ok (eval $eval, $eval . $both_neg_reason);
                    }
                }
            } # End of each test case in a class
        } # End of \w, \s, ...
    } # End of utf8 upgraded or not
}

plan(curr_test() - 1);
Commit	Line	Data
e8ed1101 KW	1	# Test the /a, /d, etc regex modifiers
	2
	3	BEGIN {
	4	chdir 't' if -d 't';
e8ed1101	5	require './test.pl';
31f05a37	6	require './loc_tools.pl';
624c42e2	7	set_up_inc('../lib', '../dist/if');
e8ed1101 KW	8	}
	9
	10	use strict;
	11	use warnings;
613abc6d KW	12	no warnings 'locale'; # Some /l tests use above-latin1 chars to make sure
613abc6d KW	13	# they work, even though they warn.
ed35198c	14	use Config;
e8ed1101	15
aab0501d KW	16	plan('no_plan');
aab0501d KW	17
e8ed1101 KW	18	# Each case is a valid element of its hash key. Choose, where available, an
	19	# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
	20	my %testcases = (
f99a3fe1	21	'\w' => [ ord("A"), utf8::unicode_to_native(0xE2), 0x16B ], # Below expects these to all be alpha
e8ed1101	22	'\d' => [ ord("0"), 0x0662 ],
f99a3fe1 KW	23	'\s' => [ ord("\t"), utf8::unicode_to_native(0xA0), 0x1680 ], # Below expects these to be [:blank:]
	24	'[:cntrl:]' => [ utf8::unicode_to_native(0x00), utf8::unicode_to_native(0x88) ],
	25	'[:graph:]' => [ ord("&"), utf8::unicode_to_native(0xF7), 0x02C7 ], # Below expects these to be
	26	# [:print:]
	27	'[:lower:]' => [ ord("g"), utf8::unicode_to_native(0xE3), 0x0127 ],
	28	'[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), ord('>'), ord('\|'), ord('-'), ord(','), ord(';'), ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), ord('"'), ord('('), ord(')'), ord('['), ord(']'), ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), ord("'"), utf8::unicode_to_native(0xBF), 0x055C ],
	29	'[:upper:]' => [ ord("G"), utf8::unicode_to_native(0xC3), 0x0126 ],
e8ed1101 KW	30	'[:xdigit:]' => [ ord("4"), 0xFF15 ],
	31	);
	32
	33	$testcases{'[:digit:]'} = $testcases{'\d'};
	34	$testcases{'[:alnum:]'} = $testcases{'\w'};
	35	$testcases{'[:alpha:]'} = $testcases{'\w'};
	36	$testcases{'[:blank:]'} = $testcases{'\s'};
	37	$testcases{'[:print:]'} = $testcases{'[:graph:]'};
	38	$testcases{'[:space:]'} = $testcases{'\s'};
	39	$testcases{'[:word:]'} = $testcases{'\w'};
	40
31f05a37 KW	41	my $utf8_locale;
31f05a37 KW	42
c973bd4f	43	my @charsets = qw(a d u aa);
624c42e2 N	44	my $locales_ok = eval { locales_enabled('LC_CTYPE'); 1 };
624c42e2 N	45	if (! is_miniperl() && $locales_ok) {
09fcee4e KW	46	require POSIX;
09fcee4e KW	47	my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
fff7535c	48	if ($current_locale eq 'C') {
ed35198c KW	49
	50	# test for d_setlocale is repeated here because this one is compile
	51	# time, and the one above is run time
	52	use if $Config{d_setlocale}, 'locale';
fff7535c	53
0e18027d KW	54	# Some implementations don't have the 128-255 range characters all
	55	# mean nothing under the C locale (an example being VMS). This is
	56	# legal, but since we don't know what the right answers should be,
	57	# skip the locale tests in that situation.
fff7535c	58	for my $i (128 .. 255) {
f99a3fe1 KW	59	goto skip_adding_C_locale
f99a3fe1 KW	60	if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/;
fff7535c KW	61	}
fff7535c KW	62	push @charsets, 'l';
31f05a37 KW	63
	64	skip_adding_C_locale:
	65
	66	# Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
9b0711ee	67	$utf8_locale = find_utf8_ctype_locale();
31f05a37	68	push @charsets, 'L' if defined $utf8_locale;
fff7535c	69	}
09fcee4e KW	70	}
09fcee4e KW	71
e8ed1101	72	# For each possible character set...
158ba892	73	foreach my $charset (@charsets) {
31f05a37 KW	74	my $locale;
	75	my $charset_mod = lc $charset;
	76	my $charset_display;
	77	if ($charset_mod eq 'l') {
	78	$locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
	79	? "C"
	80	: $utf8_locale
	81	);
	82	die "Couldn't change locale" unless $locale;
	83	$charset_display = $charset_mod . " ($locale)";
	84	}
	85	else {
	86	$charset_display = $charset_mod;
	87	}
e8ed1101 KW	88
	89	# And in utf8 or not
	90	foreach my $upgrade ("", 'utf8::upgrade($a); ') {
	91
	92	# reverse gets the, \w, \s, \d first.
	93	for my $class (reverse sort keys %testcases) {
	94
	95	# The complement of \w is \W; of [:posix:] is [:^posix:]
	96	my $complement = $class;
	97	if ($complement !~ s/ ( \[: ) /$1^/x) {
	98	$complement = uc($class);
	99	}
	100
	101	# For each test case
	102	foreach my $ord (@{$testcases{$class}}) {
f99a3fe1 KW	103	my $char = chr($ord);
f99a3fe1 KW	104	$char = ($char eq '$') ? '\$' : display($char);
e8ed1101 KW	105
e8ed1101 KW	106	# > 255 already implies upgraded. Skip the ones that don't
9d44f25f	107	# have an explicit upgrade. This shows more clearly in the
e8ed1101 KW	108	# output which tests are in utf8, or not.
	109	next if $ord > 255 && ! $upgrade;
	110
	111	my $reason = ""; # Explanation output with each test
4fc9fd85	112	my $neg_reason = "";
e8ed1101 KW	113	my $match = 1; # Calculated whether test regex should
	114	# match or not
	115
6d9a9b17 KW	116	# Everything always matches in ASCII, or under /u, or under /l
6d9a9b17 KW	117	# with a UTF-8 locale
f99a3fe1 KW	118	if (utf8::native_to_unicode($ord) < 128
	119	\|\| $charset eq 'u'
	120	\|\| $charset eq 'L')
	121	{
31f05a37 KW	122	$reason = "\"$char\" is a $class under /$charset_display";
31f05a37 KW	123	$neg_reason = "\"$char\" is not a $complement under /$charset_display";
e8ed1101	124	}
c973bd4f	125	elsif ($charset eq "a" \|\| $charset eq "aa") {
e8ed1101	126	$match = 0;
6d9a9b17 KW	127	$reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display";
6d9a9b17 KW	128	$neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display";
e8ed1101 KW	129	}
e8ed1101 KW	130	elsif ($ord > 255) {
31f05a37 KW	131	$reason = "\"$char\" is a $class under /$charset_display";
31f05a37 KW	132	$neg_reason = "\"$char\" is not a $complement under /$charset_display";
e8ed1101 KW	133	}
	134	elsif ($charset eq 'l') {
	135
	136	# We are using the C locale, which is essentially ASCII,
	137	# but under utf8, the above-latin1 chars are treated as
	138	# Unicode)
6d9a9b17 KW	139	$reason = "\"$char\" is not a $class in the C locale under /$charset_mod";
6d9a9b17 KW	140	$neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod";
e8ed1101 KW	141	$match = 0;
	142	}
	143	elsif ($upgrade) {
6d9a9b17 KW	144	$reason = "\"$char\" is a $class in utf8 under /$charset_display";
6d9a9b17 KW	145	$neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display";
e8ed1101 KW	146	}
e8ed1101 KW	147	else {
6d9a9b17 KW	148	$reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display";
6d9a9b17 KW	149	$neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)";
e8ed1101 KW	150	$match = 0;
	151	}
	152	$reason = "; $reason" if $reason;
4fc9fd85	153	$neg_reason = "; $neg_reason" if $neg_reason;
e8ed1101 KW	154
	155	my $op;
	156	my $neg_op;
	157	if ($match) {
	158	$op = '=~';
	159	$neg_op = '!~';
	160	}
	161	else {
	162	$op = '!~';
	163	$neg_op = '=~';
	164	}
	165
	166	# In [...] or not
	167	foreach my $bracketed (0, 1) {
	168	my $lb = "";
	169	my $rb = "";
	170	if ($bracketed) {
	171
	172	# Adds an extra char to the character class to make sure
	173	# that the class doesn't get optimized away.
	174	$lb = ($bracketed) ? '[_' : "";
	175	$rb = ($bracketed) ? ']' : "";
	176	}
	177	else { # [:posix:] must be inside outer [ ]
	178	next if $class =~ /\[/;
	179	}
	180
	181	my $length = 10; # For regexec.c regrepeat() cases by
	182	# matching more than one item
	183	# Test both class and its complement, and with one or more
	184	# than one item to match.
	185	foreach my $eval (
31f05a37 KW	186	qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
31f05a37 KW	187	qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
1e50d612 KW	188	) {
	189	ok (eval $eval, $eval . $reason);
	190	}
	191	foreach my $eval (
31f05a37 KW	192	qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
31f05a37 KW	193	qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
e8ed1101	194	) {
4fc9fd85	195	ok (eval $eval, $eval . $neg_reason);
e8ed1101 KW	196	}
	197	}
	198
	199	next if $class ne '\w';
	200
	201	# Test \b, \B at beginning and end of string
	202	foreach my $eval (
31f05a37 KW	203	qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
31f05a37 KW	204	qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
1e50d612 KW	205	) {
	206	ok (eval $eval, $eval . $reason);
	207	}
	208	foreach my $eval (
31f05a37 KW	209	qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
31f05a37 KW	210	qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
e8ed1101	211	) {
4fc9fd85	212	ok (eval $eval, $eval . $neg_reason);
e8ed1101 KW	213	}
	214
	215	# Test \b, \B adjacent to a non-word char, both before it and
	216	# after. We test with ASCII, Latin1 and Unicode non-word chars
	217	foreach my $space_ord (@{$testcases{'\s'}}) {
	218
	219	# Useless to try to test non-utf8 when the ord itself
	220	# forces utf8
	221	next if $space_ord > 255 && ! $upgrade;
	222
	223	my $space = display(chr $space_ord);
	224
	225	foreach my $eval (
31f05a37 KW	226	qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
31f05a37 KW	227	qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
1e50d612 KW	228	) {
	229	ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
	230	}
	231	foreach my $eval (
31f05a37 KW	232	qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
31f05a37 KW	233	qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
e8ed1101	234	) {
4fc9fd85	235	ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
e8ed1101 KW	236	}
	237	}
	238
	239	# Test \b, \B in the middle of two nominally word chars, but
	240	# one or both may be considered non-word depending on range
	241	# and charset.
	242	foreach my $other_ord (@{$testcases{'\w'}}) {
	243	next if $other_ord > 255 && ! $upgrade;
	244	my $other = display(chr $other_ord);
	245
	246	# Determine if the other char is a word char in current
	247	# circumstances
	248	my $other_is_word = 1;
31f05a37 KW	249	my $other_reason = "\"$other\" is a $class under /$charset_display";
31f05a37 KW	250	my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
f99a3fe1	251	if (utf8::native_to_unicode($other_ord) > 127
31f05a37	252	&& $charset ne 'u' && $charset ne 'L'
c973bd4f	253	&& (($charset eq "a" \|\| $charset eq "aa")
e8ed1101 KW	254	\|\| ($other_ord < 256 && ($charset eq 'l' \|\| ! $upgrade))))
	255	{
	256	$other_is_word = 0;
31f05a37 KW	257	$other_reason = "\"$other\" is not a $class under /$charset_display";
31f05a37 KW	258	$other_neg_reason = "\"$other\" is a $complement under /$charset_display";
e8ed1101 KW	259	}
	260	my $both_reason = $reason;
	261	$both_reason .= "; $other_reason" if $other_ord != $ord;
4fc9fd85 KW	262	my $both_neg_reason = $neg_reason;
4fc9fd85 KW	263	$both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
e8ed1101 KW	264
	265	# If both are the same wordness, then \b will fail; \B
	266	# succeed
	267	if ($match == $other_is_word) {
	268	$op = '!~';
	269	$neg_op = '=~';
	270	}
	271	else {
	272	$op = '=~';
	273	$neg_op = '!~';
	274	}
	275
	276	foreach my $eval (
31f05a37 KW	277	qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
31f05a37 KW	278	qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
1e50d612 KW	279	) {
	280	ok (eval $eval, $eval . $both_reason);
	281	}
	282	foreach my $eval (
31f05a37 KW	283	qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
31f05a37 KW	284	qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
e8ed1101	285	) {
4fc9fd85	286	ok (eval $eval, $eval . $both_neg_reason);
e8ed1101 KW	287	}
	288
	289	next if $other_ord == $ord;
	290
	291	# These start with the \b or \B. They are included, based
	292	# on source code analysis, to force the testing of the FBC
	293	# (find_by_class) portions of regexec.c.
	294	foreach my $eval (
31f05a37 KW	295	qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
31f05a37 KW	296	qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
1e50d612 KW	297	) {
	298	ok (eval $eval, $eval . $both_reason);
	299	}
	300	foreach my $eval (
31f05a37 KW	301	qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
31f05a37 KW	302	qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
e8ed1101	303	) {
4fc9fd85	304	ok (eval $eval, $eval . $both_neg_reason);
e8ed1101 KW	305	}
	306	}
	307	} # End of each test case in a class
	308	} # End of \w, \s, ...
	309	} # End of utf8 upgraded or not
	310	}
	311
	312	plan(curr_test() - 1);