[perl5.git] / t / re / charset.t

# Test the /a, /d, etc regex modifiers

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
    require './test.pl';
}

use strict;
use warnings;

plan('no_plan');

# Each case is a valid element of its hash key.  Choose, where available, an
# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
my %testcases = (
    '\w' => [ ord("A"), 0xE2, 0x16B ],   # Below expects these to all be alpha
    '\d' => [ ord("0"), 0x0662 ],
    '\s' => [ ord("\t"), 0xA0, 0x1680 ],  # Below expects these to be [:blank:]
    '[:cntrl:]' => [ 0x00, 0x88 ],
    '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be
                                               # [:print:]
    '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ],
    '[:punct:]' => [ ord("!"), 0xBF, 0x055C ],
    '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ],
    '[:xdigit:]' => [ ord("4"), 0xFF15 ],
);

$testcases{'[:digit:]'} = $testcases{'\d'};
$testcases{'[:alnum:]'} = $testcases{'\w'};
$testcases{'[:alpha:]'} = $testcases{'\w'};
$testcases{'[:blank:]'} = $testcases{'\s'};
$testcases{'[:print:]'} = $testcases{'[:graph:]'};
$testcases{'[:space:]'} = $testcases{'\s'};
$testcases{'[:word:]'} = $testcases{'\w'};

# For each possible character set...
foreach my $charset ("a", "d", "l", "u") {

    # And in utf8 or not
    foreach my $upgrade ("", 'utf8::upgrade($a); ') {

        # reverse gets the, \w, \s, \d first.
        for my $class (reverse sort keys %testcases) {

            # The complement of \w is \W; of [:posix:] is [:^posix:]
            my $complement = $class;
            if ($complement !~ s/ ( \[: ) /$1^/x) {
                $complement = uc($class);
            }

            # For each test case
            foreach my $ord (@{$testcases{$class}}) {
                my $char = display(chr($ord));

                # > 255 already implies upgraded.  Skip the ones that don't
                # have an explicit upgrade.  This shows more clearly in the
                # output which tests are in utf8, or not.
                next if $ord > 255 && ! $upgrade;

                my $reason = "";    # Explanation output with each test
                my $match = 1;      # Calculated whether test regex should
                                    # match or not

                # Everything always matches in ASCII, or under /u
                if ($ord < 128 || $charset eq 'u') {
                    $reason = "\"$char\" is a $class under /$charset";
                }
                elsif ($charset eq "a") {
                    $match = 0;
                    $reason = "\"$char\" is non-ASCII, which can't be a $class under /a";
                }
                elsif ($ord > 255) {
                    $reason = "\"$char\" is a $class under /$charset";
                }
                elsif ($charset eq 'l') {

                    # We are using the C locale, which is essentially ASCII,
                    # but under utf8, the above-latin1 chars are treated as
                    # Unicode)
                    $reason = "\"$char\" is not a $class in this locale under /l";
                    $match = 0;
                }
                elsif ($upgrade) {
                    $reason = "\"$char\" is a $class in utf8 under /d";
                }
                else {
                    $reason = "\"$char\" is latin1, which requires utf8 to be a $class under /d";
                    $match = 0;
                }
                $reason = "; $reason" if $reason;

                my $op;
                my $neg_op;
                if ($match) {
                    $op = '=~';
                    $neg_op = '!~';
                }
                else {
                    $op = '!~';
                    $neg_op = '=~';
                }

                # In [...] or not
                foreach my $bracketed (0, 1) {
                    my $lb = "";
                    my $rb = "";
                    if ($bracketed) {

                        # Adds an extra char to the character class to make sure
                        # that the class doesn't get optimized away.
                        $lb = ($bracketed) ? '[_' : "";
                        $rb = ($bracketed) ? ']' : "";
                    }
                    else {  # [:posix:] must be inside outer [ ]
                        next if $class =~ /\[/;
                    }

                    my $length = 10;    # For regexec.c regrepeat() cases by
                                        # matching more than one item
                    # Test both class and its complement, and with one or more
                    # than one item to match.
                    foreach my $eval (
                        qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: $lb$class$rb ) /x],
                        qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb ) /x],
                        qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset: $lb$class$rb\{$length} ) /x],
                        qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb\{$length} ) /x],
                    ) {
                        ok (eval $eval, $eval . $reason);
                    }
                }

                next if $class ne '\w';

                # Test \b, \B at beginning and end of string
                foreach my $eval (
                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: ^ \\b . ) /x],
                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: . \\b \$) /x],
                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: ^ \\B . ) /x],
                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: . \\B \$ ) /x],
                ) {
                    ok (eval $eval, $eval . $reason);
                }

                # Test \b, \B adjacent to a non-word char, both before it and
                # after.  We test with ASCII, Latin1 and Unicode non-word chars
                foreach my $space_ord (@{$testcases{'\s'}}) {

                    # Useless to try to test non-utf8 when the ord itself
                    # forces utf8
                    next if $space_ord > 255 && ! $upgrade;

                    my $space = display(chr $space_ord);

                    foreach my $eval (
                        qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
                        qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
                        qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
                        qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
                    ) {
                        ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
                    }
                }

                # Test \b, \B in the middle of two nominally word chars, but
                # one or both may be considered non-word depending on range
                # and charset.
                foreach my $other_ord (@{$testcases{'\w'}}) {
                    next if $other_ord > 255 && ! $upgrade;
                    my $other = display(chr $other_ord);

                    # Determine if the other char is a word char in current
                    # circumstances
                    my $other_is_word = 1;
                    my $other_reason = "\"$other\" is a $class under /$charset";
                    if ($other_ord > 127
                        && $charset ne 'u'
                        && ($charset eq "a"
                            || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
                    {
                        $other_is_word = 0;
                        $other_reason = "\"$other\" is not a $class under /$charset";
                    }
                    my $both_reason = $reason;
                    $both_reason .= "; $other_reason" if $other_ord != $ord;

                    # If both are the same wordness, then \b will fail; \B
                    # succeed
                    if ($match == $other_is_word) {
                        $op = '!~';
                        $neg_op = '=~';
                    }
                    else {
                        $op = '=~';
                        $neg_op = '!~';
                    }

                    foreach my $eval (
                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: $other \\b $char ) /x],
                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: $char \\b $other ) /x],
                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: $other \\B $char ) /x],
                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: $char \\B $other ) /x],
                    ) {
                        ok (eval $eval, $eval . $both_reason);
                    }

                    next if $other_ord == $ord;

                    # These start with the \b or \B.  They are included, based
                    # on source code analysis, to force the testing of the FBC
                    # (find_by_class) portions of regexec.c.
                    foreach my $eval (
                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: \\b $char ) /x],
                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: \\b $other ) /x],
                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: \\B $char ) /x],
                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: \\B $other ) /x],
                    ) {
                        ok (eval $eval, $eval . $both_reason);
                    }
                }
            } # End of each test case in a class
        } # End of \w, \s, ...
    } # End of utf8 upgraded or not
}

plan(curr_test() - 1);
Commit	Line	Data
e8ed1101 KW	1	# Test the /a, /d, etc regex modifiers
	2
	3	BEGIN {
	4	chdir 't' if -d 't';
	5	@INC = '../lib';
	6	require './test.pl';
	7	}
	8
	9	use strict;
	10	use warnings;
	11
aab0501d KW	12	plan('no_plan');
aab0501d KW	13
e8ed1101 KW	14	# Each case is a valid element of its hash key. Choose, where available, an
	15	# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
	16	my %testcases = (
	17	'\w' => [ ord("A"), 0xE2, 0x16B ], # Below expects these to all be alpha
	18	'\d' => [ ord("0"), 0x0662 ],
	19	'\s' => [ ord("\t"), 0xA0, 0x1680 ], # Below expects these to be [:blank:]
	20	'[:cntrl:]' => [ 0x00, 0x88 ],
	21	'[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be
	22	# [:print:]
	23	'[:lower:]' => [ ord("g"), 0xE3, 0x0127 ],
	24	'[:punct:]' => [ ord("!"), 0xBF, 0x055C ],
	25	'[:upper:]' => [ ord("G"), 0xC3, 0x0126 ],
	26	'[:xdigit:]' => [ ord("4"), 0xFF15 ],
	27	);
	28
	29	$testcases{'[:digit:]'} = $testcases{'\d'};
	30	$testcases{'[:alnum:]'} = $testcases{'\w'};
	31	$testcases{'[:alpha:]'} = $testcases{'\w'};
	32	$testcases{'[:blank:]'} = $testcases{'\s'};
	33	$testcases{'[:print:]'} = $testcases{'[:graph:]'};
	34	$testcases{'[:space:]'} = $testcases{'\s'};
	35	$testcases{'[:word:]'} = $testcases{'\w'};
	36
	37	# For each possible character set...
de87c4fe	38	foreach my $charset ("a", "d", "l", "u") {
e8ed1101 KW	39
	40	# And in utf8 or not
	41	foreach my $upgrade ("", 'utf8::upgrade($a); ') {
	42
	43	# reverse gets the, \w, \s, \d first.
	44	for my $class (reverse sort keys %testcases) {
	45
	46	# The complement of \w is \W; of [:posix:] is [:^posix:]
	47	my $complement = $class;
	48	if ($complement !~ s/ ( \[: ) /$1^/x) {
	49	$complement = uc($class);
	50	}
	51
	52	# For each test case
	53	foreach my $ord (@{$testcases{$class}}) {
	54	my $char = display(chr($ord));
	55
	56	# > 255 already implies upgraded. Skip the ones that don't
9d44f25f	57	# have an explicit upgrade. This shows more clearly in the
e8ed1101 KW	58	# output which tests are in utf8, or not.
	59	next if $ord > 255 && ! $upgrade;
	60
	61	my $reason = ""; # Explanation output with each test
	62	my $match = 1; # Calculated whether test regex should
	63	# match or not
	64
	65	# Everything always matches in ASCII, or under /u
	66	if ($ord < 128 \|\| $charset eq 'u') {
	67	$reason = "\"$char\" is a $class under /$charset";
	68	}
	69	elsif ($charset eq "a") {
	70	$match = 0;
	71	$reason = "\"$char\" is non-ASCII, which can't be a $class under /a";
	72	}
	73	elsif ($ord > 255) {
	74	$reason = "\"$char\" is a $class under /$charset";
	75	}
	76	elsif ($charset eq 'l') {
	77
	78	# We are using the C locale, which is essentially ASCII,
	79	# but under utf8, the above-latin1 chars are treated as
	80	# Unicode)
	81	$reason = "\"$char\" is not a $class in this locale under /l";
	82	$match = 0;
	83	}
	84	elsif ($upgrade) {
	85	$reason = "\"$char\" is a $class in utf8 under /d";
	86	}
	87	else {
	88	$reason = "\"$char\" is latin1, which requires utf8 to be a $class under /d";
	89	$match = 0;
	90	}
	91	$reason = "; $reason" if $reason;
	92
	93	my $op;
	94	my $neg_op;
	95	if ($match) {
	96	$op = '=~';
	97	$neg_op = '!~';
	98	}
	99	else {
	100	$op = '!~';
	101	$neg_op = '=~';
	102	}
	103
	104	# In [...] or not
	105	foreach my $bracketed (0, 1) {
	106	my $lb = "";
	107	my $rb = "";
	108	if ($bracketed) {
	109
	110	# Adds an extra char to the character class to make sure
	111	# that the class doesn't get optimized away.
	112	$lb = ($bracketed) ? '[_' : "";
	113	$rb = ($bracketed) ? ']' : "";
	114	}
	115	else { # [:posix:] must be inside outer [ ]
	116	next if $class =~ /\[/;
	117	}
	118
	119	my $length = 10; # For regexec.c regrepeat() cases by
	120	# matching more than one item
	121	# Test both class and its complement, and with one or more
122	# than one item to match.
123	foreach my $eval (
124	qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: $lb$class$rb ) /x],
125	qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb ) /x],
126	qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset: $lb$class$rb\{$length} ) /x],
127	qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb\{$length} ) /x],
128	) {
129	ok (eval $eval, $eval . $reason);
130	}
131	}
132
133	next if $class ne '\w';
134
135	# Test \b, \B at beginning and end of string
136	foreach my $eval (
137	qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: ^ \\b . ) /x],
138	qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: . \\b \$) /x],
139	qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: ^ \\B . ) /x],
140	qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: . \\B \$ ) /x],
141	) {
142	ok (eval $eval, $eval . $reason);
143	}
144
145	# Test \b, \B adjacent to a non-word char, both before it and
146	# after. We test with ASCII, Latin1 and Unicode non-word chars
147	foreach my $space_ord (@{$testcases{'\s'}}) {
148
149	# Useless to try to test non-utf8 when the ord itself
150	# forces utf8
151	next if $space_ord > 255 && ! $upgrade;
152
153	my $space = display(chr $space_ord);
154
155	foreach my $eval (
156	qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
157	qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
158	qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
159	qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
160	) {
161	ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
162	}
163	}
164
165	# Test \b, \B in the middle of two nominally word chars, but
166	# one or both may be considered non-word depending on range
167	# and charset.
168	foreach my $other_ord (@{$testcases{'\w'}}) {
169	next if $other_ord > 255 && ! $upgrade;
170	my $other = display(chr $other_ord);
171
172	# Determine if the other char is a word char in current
173	# circumstances
174	my $other_is_word = 1;
175	my $other_reason = "\"$other\" is a $class under /$charset";
176	if ($other_ord > 127
177	&& $charset ne 'u'
178	&& ($charset eq "a"
179	\|\| ($other_ord < 256 && ($charset eq 'l' \|\| ! $upgrade))))
180	{
181	$other_is_word = 0;
182	$other_reason = "\"$other\" is not a $class under /$charset";
183	}
184	my $both_reason = $reason;
185	$both_reason .= "; $other_reason" if $other_ord != $ord;
186
187	# If both are the same wordness, then \b will fail; \B
188	# succeed
189	if ($match == $other_is_word) {
190	$op = '!~';
191	$neg_op = '=~';
192	}
193	else {
194	$op = '=~';
195	$neg_op = '!~';
196	}
197
198	foreach my $eval (
199	qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: $other \\b $char ) /x],
200	qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: $char \\b $other ) /x],
201	qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: $other \\B $char ) /x],
202	qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: $char \\B $other ) /x],
203	) {
204	ok (eval $eval, $eval . $both_reason);
205	}
206
207	next if $other_ord == $ord;
208
209	# These start with the \b or \B. They are included, based
210	# on source code analysis, to force the testing of the FBC
211	# (find_by_class) portions of regexec.c.
212	foreach my $eval (
213	qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: \\b $char ) /x],
214	qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: \\b $other ) /x],
215	qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: \\B $char ) /x],
216	qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: \\B $other ) /x],
217	) {
218	ok (eval $eval, $eval . $both_reason);
219	}
220	}
221	} # End of each test case in a class
222	} # End of \w, \s, ...
223	} # End of utf8 upgraded or not
224	}
225
226	plan(curr_test() - 1);