This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
t/re/charset.t: Fix comment, test names
[perl5.git] / t / re / charset.t
CommitLineData
e8ed1101
KW
1# Test the /a, /d, etc regex modifiers
2
3BEGIN {
4 chdir 't' if -d 't';
5 @INC = '../lib';
6 require './test.pl';
31f05a37 7 require './loc_tools.pl';
e8ed1101
KW
8}
9
10use strict;
11use warnings;
ed35198c 12use Config;
e8ed1101 13
aab0501d
KW
14plan('no_plan');
15
e8ed1101
KW
16# Each case is a valid element of its hash key. Choose, where available, an
17# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
18my %testcases = (
19 '\w' => [ ord("A"), 0xE2, 0x16B ], # Below expects these to all be alpha
20 '\d' => [ ord("0"), 0x0662 ],
21 '\s' => [ ord("\t"), 0xA0, 0x1680 ], # Below expects these to be [:blank:]
22 '[:cntrl:]' => [ 0x00, 0x88 ],
23 '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be
24 # [:print:]
25 '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ],
26 '[:punct:]' => [ ord("!"), 0xBF, 0x055C ],
27 '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ],
28 '[:xdigit:]' => [ ord("4"), 0xFF15 ],
29);
30
31$testcases{'[:digit:]'} = $testcases{'\d'};
32$testcases{'[:alnum:]'} = $testcases{'\w'};
33$testcases{'[:alpha:]'} = $testcases{'\w'};
34$testcases{'[:blank:]'} = $testcases{'\s'};
35$testcases{'[:print:]'} = $testcases{'[:graph:]'};
36$testcases{'[:space:]'} = $testcases{'\s'};
37$testcases{'[:word:]'} = $testcases{'\w'};
38
31f05a37
KW
39my $utf8_locale;
40
c973bd4f 41my @charsets = qw(a d u aa);
569f7fc5 42if (! is_miniperl() && $Config{d_setlocale}) {
09fcee4e
KW
43 require POSIX;
44 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
fff7535c 45 if ($current_locale eq 'C') {
ed35198c
KW
46
47 # test for d_setlocale is repeated here because this one is compile
48 # time, and the one above is run time
49 use if $Config{d_setlocale}, 'locale';
fff7535c 50
0e18027d
KW
51 # Some implementations don't have the 128-255 range characters all
52 # mean nothing under the C locale (an example being VMS). This is
53 # legal, but since we don't know what the right answers should be,
54 # skip the locale tests in that situation.
fff7535c 55 for my $i (128 .. 255) {
31f05a37 56 goto skip_adding_C_locale if chr($i) =~ /[[:print:]]/;
fff7535c
KW
57 }
58 push @charsets, 'l';
31f05a37
KW
59
60 skip_adding_C_locale:
61
62 # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
9b0711ee 63 $utf8_locale = find_utf8_ctype_locale();
31f05a37 64 push @charsets, 'L' if defined $utf8_locale;
fff7535c 65 }
09fcee4e
KW
66}
67
e8ed1101 68# For each possible character set...
158ba892 69foreach my $charset (@charsets) {
31f05a37
KW
70 my $locale;
71 my $charset_mod = lc $charset;
72 my $charset_display;
73 if ($charset_mod eq 'l') {
74 $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
75 ? "C"
76 : $utf8_locale
77 );
78 die "Couldn't change locale" unless $locale;
79 $charset_display = $charset_mod . " ($locale)";
80 }
81 else {
82 $charset_display = $charset_mod;
83 }
e8ed1101
KW
84
85 # And in utf8 or not
86 foreach my $upgrade ("", 'utf8::upgrade($a); ') {
87
88 # reverse gets the, \w, \s, \d first.
89 for my $class (reverse sort keys %testcases) {
90
91 # The complement of \w is \W; of [:posix:] is [:^posix:]
92 my $complement = $class;
93 if ($complement !~ s/ ( \[: ) /$1^/x) {
94 $complement = uc($class);
95 }
96
97 # For each test case
98 foreach my $ord (@{$testcases{$class}}) {
99 my $char = display(chr($ord));
100
101 # > 255 already implies upgraded. Skip the ones that don't
9d44f25f 102 # have an explicit upgrade. This shows more clearly in the
e8ed1101
KW
103 # output which tests are in utf8, or not.
104 next if $ord > 255 && ! $upgrade;
105
106 my $reason = ""; # Explanation output with each test
4fc9fd85 107 my $neg_reason = "";
e8ed1101
KW
108 my $match = 1; # Calculated whether test regex should
109 # match or not
110
6d9a9b17
KW
111 # Everything always matches in ASCII, or under /u, or under /l
112 # with a UTF-8 locale
31f05a37
KW
113 if ($ord < 128 || $charset eq 'u' || $charset eq 'L') {
114 $reason = "\"$char\" is a $class under /$charset_display";
115 $neg_reason = "\"$char\" is not a $complement under /$charset_display";
e8ed1101 116 }
c973bd4f 117 elsif ($charset eq "a" || $charset eq "aa") {
e8ed1101 118 $match = 0;
6d9a9b17
KW
119 $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display";
120 $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display";
e8ed1101
KW
121 }
122 elsif ($ord > 255) {
31f05a37
KW
123 $reason = "\"$char\" is a $class under /$charset_display";
124 $neg_reason = "\"$char\" is not a $complement under /$charset_display";
e8ed1101
KW
125 }
126 elsif ($charset eq 'l') {
127
128 # We are using the C locale, which is essentially ASCII,
129 # but under utf8, the above-latin1 chars are treated as
130 # Unicode)
6d9a9b17
KW
131 $reason = "\"$char\" is not a $class in the C locale under /$charset_mod";
132 $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod";
e8ed1101
KW
133 $match = 0;
134 }
135 elsif ($upgrade) {
6d9a9b17
KW
136 $reason = "\"$char\" is a $class in utf8 under /$charset_display";
137 $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display";
e8ed1101
KW
138 }
139 else {
6d9a9b17
KW
140 $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display";
141 $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)";
e8ed1101
KW
142 $match = 0;
143 }
144 $reason = "; $reason" if $reason;
4fc9fd85 145 $neg_reason = "; $neg_reason" if $neg_reason;
e8ed1101
KW
146
147 my $op;
148 my $neg_op;
149 if ($match) {
150 $op = '=~';
151 $neg_op = '!~';
152 }
153 else {
154 $op = '!~';
155 $neg_op = '=~';
156 }
157
158 # In [...] or not
159 foreach my $bracketed (0, 1) {
160 my $lb = "";
161 my $rb = "";
162 if ($bracketed) {
163
164 # Adds an extra char to the character class to make sure
165 # that the class doesn't get optimized away.
166 $lb = ($bracketed) ? '[_' : "";
167 $rb = ($bracketed) ? ']' : "";
168 }
169 else { # [:posix:] must be inside outer [ ]
170 next if $class =~ /\[/;
171 }
172
173 my $length = 10; # For regexec.c regrepeat() cases by
174 # matching more than one item
175 # Test both class and its complement, and with one or more
176 # than one item to match.
177 foreach my $eval (
31f05a37
KW
178 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
179 qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
1e50d612
KW
180 ) {
181 ok (eval $eval, $eval . $reason);
182 }
183 foreach my $eval (
31f05a37
KW
184 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
185 qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
e8ed1101 186 ) {
4fc9fd85 187 ok (eval $eval, $eval . $neg_reason);
e8ed1101
KW
188 }
189 }
190
191 next if $class ne '\w';
192
193 # Test \b, \B at beginning and end of string
194 foreach my $eval (
31f05a37
KW
195 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
196 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
1e50d612
KW
197 ) {
198 ok (eval $eval, $eval . $reason);
199 }
200 foreach my $eval (
31f05a37
KW
201 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
202 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
e8ed1101 203 ) {
4fc9fd85 204 ok (eval $eval, $eval . $neg_reason);
e8ed1101
KW
205 }
206
207 # Test \b, \B adjacent to a non-word char, both before it and
208 # after. We test with ASCII, Latin1 and Unicode non-word chars
209 foreach my $space_ord (@{$testcases{'\s'}}) {
210
211 # Useless to try to test non-utf8 when the ord itself
212 # forces utf8
213 next if $space_ord > 255 && ! $upgrade;
214
215 my $space = display(chr $space_ord);
216
217 foreach my $eval (
31f05a37
KW
218 qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
219 qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
1e50d612
KW
220 ) {
221 ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
222 }
223 foreach my $eval (
31f05a37
KW
224 qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
225 qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
e8ed1101 226 ) {
4fc9fd85 227 ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
e8ed1101
KW
228 }
229 }
230
231 # Test \b, \B in the middle of two nominally word chars, but
232 # one or both may be considered non-word depending on range
233 # and charset.
234 foreach my $other_ord (@{$testcases{'\w'}}) {
235 next if $other_ord > 255 && ! $upgrade;
236 my $other = display(chr $other_ord);
237
238 # Determine if the other char is a word char in current
239 # circumstances
240 my $other_is_word = 1;
31f05a37
KW
241 my $other_reason = "\"$other\" is a $class under /$charset_display";
242 my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
e8ed1101 243 if ($other_ord > 127
31f05a37 244 && $charset ne 'u' && $charset ne 'L'
c973bd4f 245 && (($charset eq "a" || $charset eq "aa")
e8ed1101
KW
246 || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
247 {
248 $other_is_word = 0;
31f05a37
KW
249 $other_reason = "\"$other\" is not a $class under /$charset_display";
250 $other_neg_reason = "\"$other\" is a $complement under /$charset_display";
e8ed1101
KW
251 }
252 my $both_reason = $reason;
253 $both_reason .= "; $other_reason" if $other_ord != $ord;
4fc9fd85
KW
254 my $both_neg_reason = $neg_reason;
255 $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
e8ed1101
KW
256
257 # If both are the same wordness, then \b will fail; \B
258 # succeed
259 if ($match == $other_is_word) {
260 $op = '!~';
261 $neg_op = '=~';
262 }
263 else {
264 $op = '=~';
265 $neg_op = '!~';
266 }
267
268 foreach my $eval (
31f05a37
KW
269 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
270 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
1e50d612
KW
271 ) {
272 ok (eval $eval, $eval . $both_reason);
273 }
274 foreach my $eval (
31f05a37
KW
275 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
276 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
e8ed1101 277 ) {
4fc9fd85 278 ok (eval $eval, $eval . $both_neg_reason);
e8ed1101
KW
279 }
280
281 next if $other_ord == $ord;
282
283 # These start with the \b or \B. They are included, based
284 # on source code analysis, to force the testing of the FBC
285 # (find_by_class) portions of regexec.c.
286 foreach my $eval (
31f05a37
KW
287 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
288 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
1e50d612
KW
289 ) {
290 ok (eval $eval, $eval . $both_reason);
291 }
292 foreach my $eval (
31f05a37
KW
293 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
294 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
e8ed1101 295 ) {
4fc9fd85 296 ok (eval $eval, $eval . $both_neg_reason);
e8ed1101
KW
297 }
298 }
299 } # End of each test case in a class
300 } # End of \w, \s, ...
301 } # End of utf8 upgraded or not
302}
303
304plan(curr_test() - 1);