This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Fix utf8 e.g. [\s] under locale
[perl5.git] / t / re / charset.t
CommitLineData
e8ed1101
KW
1# Test the /a, /d, etc regex modifiers
2
3BEGIN {
4 chdir 't' if -d 't';
5 @INC = '../lib';
6 require './test.pl';
7}
8
9use strict;
10use warnings;
11
aab0501d
KW
12plan('no_plan');
13
e8ed1101
KW
14# Each case is a valid element of its hash key. Choose, where available, an
15# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
16my %testcases = (
17 '\w' => [ ord("A"), 0xE2, 0x16B ], # Below expects these to all be alpha
18 '\d' => [ ord("0"), 0x0662 ],
19 '\s' => [ ord("\t"), 0xA0, 0x1680 ], # Below expects these to be [:blank:]
20 '[:cntrl:]' => [ 0x00, 0x88 ],
21 '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be
22 # [:print:]
23 '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ],
24 '[:punct:]' => [ ord("!"), 0xBF, 0x055C ],
25 '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ],
26 '[:xdigit:]' => [ ord("4"), 0xFF15 ],
27);
28
29$testcases{'[:digit:]'} = $testcases{'\d'};
30$testcases{'[:alnum:]'} = $testcases{'\w'};
31$testcases{'[:alpha:]'} = $testcases{'\w'};
32$testcases{'[:blank:]'} = $testcases{'\s'};
33$testcases{'[:print:]'} = $testcases{'[:graph:]'};
34$testcases{'[:space:]'} = $testcases{'\s'};
35$testcases{'[:word:]'} = $testcases{'\w'};
36
37# For each possible character set...
de87c4fe 38foreach my $charset ("a", "d", "l", "u") {
e8ed1101
KW
39
40 # And in utf8 or not
41 foreach my $upgrade ("", 'utf8::upgrade($a); ') {
42
43 # reverse gets the, \w, \s, \d first.
44 for my $class (reverse sort keys %testcases) {
45
46 # The complement of \w is \W; of [:posix:] is [:^posix:]
47 my $complement = $class;
48 if ($complement !~ s/ ( \[: ) /$1^/x) {
49 $complement = uc($class);
50 }
51
52 # For each test case
53 foreach my $ord (@{$testcases{$class}}) {
54 my $char = display(chr($ord));
55
56 # > 255 already implies upgraded. Skip the ones that don't
9d44f25f 57 # have an explicit upgrade. This shows more clearly in the
e8ed1101
KW
58 # output which tests are in utf8, or not.
59 next if $ord > 255 && ! $upgrade;
60
61 my $reason = ""; # Explanation output with each test
62 my $match = 1; # Calculated whether test regex should
63 # match or not
64
65 # Everything always matches in ASCII, or under /u
66 if ($ord < 128 || $charset eq 'u') {
67 $reason = "\"$char\" is a $class under /$charset";
68 }
69 elsif ($charset eq "a") {
70 $match = 0;
71 $reason = "\"$char\" is non-ASCII, which can't be a $class under /a";
72 }
73 elsif ($ord > 255) {
74 $reason = "\"$char\" is a $class under /$charset";
75 }
76 elsif ($charset eq 'l') {
77
78 # We are using the C locale, which is essentially ASCII,
79 # but under utf8, the above-latin1 chars are treated as
80 # Unicode)
81 $reason = "\"$char\" is not a $class in this locale under /l";
82 $match = 0;
83 }
84 elsif ($upgrade) {
85 $reason = "\"$char\" is a $class in utf8 under /d";
86 }
87 else {
88 $reason = "\"$char\" is latin1, which requires utf8 to be a $class under /d";
89 $match = 0;
90 }
91 $reason = "; $reason" if $reason;
92
93 my $op;
94 my $neg_op;
95 if ($match) {
96 $op = '=~';
97 $neg_op = '!~';
98 }
99 else {
100 $op = '!~';
101 $neg_op = '=~';
102 }
103
104 # In [...] or not
105 foreach my $bracketed (0, 1) {
106 my $lb = "";
107 my $rb = "";
108 if ($bracketed) {
109
110 # Adds an extra char to the character class to make sure
111 # that the class doesn't get optimized away.
112 $lb = ($bracketed) ? '[_' : "";
113 $rb = ($bracketed) ? ']' : "";
114 }
115 else { # [:posix:] must be inside outer [ ]
116 next if $class =~ /\[/;
117 }
118
119 my $length = 10; # For regexec.c regrepeat() cases by
120 # matching more than one item
121 # Test both class and its complement, and with one or more
122 # than one item to match.
123 foreach my $eval (
124 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: $lb$class$rb ) /x],
125 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb ) /x],
126 qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset: $lb$class$rb\{$length} ) /x],
127 qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb\{$length} ) /x],
128 ) {
129 ok (eval $eval, $eval . $reason);
130 }
131 }
132
133 next if $class ne '\w';
134
135 # Test \b, \B at beginning and end of string
136 foreach my $eval (
137 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: ^ \\b . ) /x],
138 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: . \\b \$) /x],
139 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: ^ \\B . ) /x],
140 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: . \\B \$ ) /x],
141 ) {
142 ok (eval $eval, $eval . $reason);
143 }
144
145 # Test \b, \B adjacent to a non-word char, both before it and
146 # after. We test with ASCII, Latin1 and Unicode non-word chars
147 foreach my $space_ord (@{$testcases{'\s'}}) {
148
149 # Useless to try to test non-utf8 when the ord itself
150 # forces utf8
151 next if $space_ord > 255 && ! $upgrade;
152
153 my $space = display(chr $space_ord);
154
155 foreach my $eval (
156 qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
157 qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
158 qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
159 qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
160 ) {
161 ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
162 }
163 }
164
165 # Test \b, \B in the middle of two nominally word chars, but
166 # one or both may be considered non-word depending on range
167 # and charset.
168 foreach my $other_ord (@{$testcases{'\w'}}) {
169 next if $other_ord > 255 && ! $upgrade;
170 my $other = display(chr $other_ord);
171
172 # Determine if the other char is a word char in current
173 # circumstances
174 my $other_is_word = 1;
175 my $other_reason = "\"$other\" is a $class under /$charset";
176 if ($other_ord > 127
177 && $charset ne 'u'
178 && ($charset eq "a"
179 || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
180 {
181 $other_is_word = 0;
182 $other_reason = "\"$other\" is not a $class under /$charset";
183 }
184 my $both_reason = $reason;
185 $both_reason .= "; $other_reason" if $other_ord != $ord;
186
187 # If both are the same wordness, then \b will fail; \B
188 # succeed
189 if ($match == $other_is_word) {
190 $op = '!~';
191 $neg_op = '=~';
192 }
193 else {
194 $op = '=~';
195 $neg_op = '!~';
196 }
197
198 foreach my $eval (
199 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: $other \\b $char ) /x],
200 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: $char \\b $other ) /x],
201 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: $other \\B $char ) /x],
202 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: $char \\B $other ) /x],
203 ) {
204 ok (eval $eval, $eval . $both_reason);
205 }
206
207 next if $other_ord == $ord;
208
209 # These start with the \b or \B. They are included, based
210 # on source code analysis, to force the testing of the FBC
211 # (find_by_class) portions of regexec.c.
212 foreach my $eval (
213 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: \\b $char ) /x],
214 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: \\b $other ) /x],
215 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: \\B $char ) /x],
216 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: \\B $other ) /x],
217 ) {
218 ok (eval $eval, $eval . $both_reason);
219 }
220 }
221 } # End of each test case in a class
222 } # End of \w, \s, ...
223 } # End of utf8 upgraded or not
224}
225
226plan(curr_test() - 1);