This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge branch 'release-5.29.2' into blead
[perl5.git] / t / re / charset.t
CommitLineData
e8ed1101
KW
1# Test the /a, /d, etc regex modifiers
2
3BEGIN {
4 chdir 't' if -d 't';
e8ed1101 5 require './test.pl';
31f05a37 6 require './loc_tools.pl';
624c42e2 7 set_up_inc('../lib', '../dist/if');
e8ed1101
KW
8}
9
10use strict;
11use warnings;
613abc6d
KW
12no warnings 'locale'; # Some /l tests use above-latin1 chars to make sure
13 # they work, even though they warn.
ed35198c 14use Config;
e8ed1101 15
aab0501d
KW
16plan('no_plan');
17
e8ed1101
KW
18# Each case is a valid element of its hash key. Choose, where available, an
19# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
20my %testcases = (
f99a3fe1 21 '\w' => [ ord("A"), utf8::unicode_to_native(0xE2), 0x16B ], # Below expects these to all be alpha
e8ed1101 22 '\d' => [ ord("0"), 0x0662 ],
f99a3fe1
KW
23 '\s' => [ ord("\t"), utf8::unicode_to_native(0xA0), 0x1680 ], # Below expects these to be [:blank:]
24 '[:cntrl:]' => [ utf8::unicode_to_native(0x00), utf8::unicode_to_native(0x88) ],
25 '[:graph:]' => [ ord("&"), utf8::unicode_to_native(0xF7), 0x02C7 ], # Below expects these to be
26 # [:print:]
27 '[:lower:]' => [ ord("g"), utf8::unicode_to_native(0xE3), 0x0127 ],
28 '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), ord('>'), ord('|'), ord('-'), ord(','), ord(';'), ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), ord('"'), ord('('), ord(')'), ord('['), ord(']'), ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), ord("'"), utf8::unicode_to_native(0xBF), 0x055C ],
29 '[:upper:]' => [ ord("G"), utf8::unicode_to_native(0xC3), 0x0126 ],
e8ed1101
KW
30 '[:xdigit:]' => [ ord("4"), 0xFF15 ],
31);
32
33$testcases{'[:digit:]'} = $testcases{'\d'};
34$testcases{'[:alnum:]'} = $testcases{'\w'};
35$testcases{'[:alpha:]'} = $testcases{'\w'};
36$testcases{'[:blank:]'} = $testcases{'\s'};
37$testcases{'[:print:]'} = $testcases{'[:graph:]'};
38$testcases{'[:space:]'} = $testcases{'\s'};
39$testcases{'[:word:]'} = $testcases{'\w'};
40
31f05a37
KW
41my $utf8_locale;
42
c973bd4f 43my @charsets = qw(a d u aa);
624c42e2
N
44my $locales_ok = eval { locales_enabled('LC_CTYPE'); 1 };
45if (! is_miniperl() && $locales_ok) {
09fcee4e
KW
46 require POSIX;
47 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
fff7535c 48 if ($current_locale eq 'C') {
ed35198c
KW
49
50 # test for d_setlocale is repeated here because this one is compile
51 # time, and the one above is run time
52 use if $Config{d_setlocale}, 'locale';
fff7535c 53
0e18027d
KW
54 # Some implementations don't have the 128-255 range characters all
55 # mean nothing under the C locale (an example being VMS). This is
56 # legal, but since we don't know what the right answers should be,
57 # skip the locale tests in that situation.
fff7535c 58 for my $i (128 .. 255) {
f99a3fe1
KW
59 goto skip_adding_C_locale
60 if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/;
fff7535c
KW
61 }
62 push @charsets, 'l';
31f05a37
KW
63
64 skip_adding_C_locale:
65
66 # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
9b0711ee 67 $utf8_locale = find_utf8_ctype_locale();
31f05a37 68 push @charsets, 'L' if defined $utf8_locale;
fff7535c 69 }
09fcee4e
KW
70}
71
e8ed1101 72# For each possible character set...
158ba892 73foreach my $charset (@charsets) {
31f05a37
KW
74 my $locale;
75 my $charset_mod = lc $charset;
76 my $charset_display;
77 if ($charset_mod eq 'l') {
78 $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
79 ? "C"
80 : $utf8_locale
81 );
82 die "Couldn't change locale" unless $locale;
83 $charset_display = $charset_mod . " ($locale)";
84 }
85 else {
86 $charset_display = $charset_mod;
87 }
e8ed1101
KW
88
89 # And in utf8 or not
90 foreach my $upgrade ("", 'utf8::upgrade($a); ') {
91
92 # reverse gets the, \w, \s, \d first.
93 for my $class (reverse sort keys %testcases) {
94
95 # The complement of \w is \W; of [:posix:] is [:^posix:]
96 my $complement = $class;
97 if ($complement !~ s/ ( \[: ) /$1^/x) {
98 $complement = uc($class);
99 }
100
101 # For each test case
102 foreach my $ord (@{$testcases{$class}}) {
f99a3fe1
KW
103 my $char = chr($ord);
104 $char = ($char eq '$') ? '\$' : display($char);
e8ed1101
KW
105
106 # > 255 already implies upgraded. Skip the ones that don't
9d44f25f 107 # have an explicit upgrade. This shows more clearly in the
e8ed1101
KW
108 # output which tests are in utf8, or not.
109 next if $ord > 255 && ! $upgrade;
110
111 my $reason = ""; # Explanation output with each test
4fc9fd85 112 my $neg_reason = "";
e8ed1101
KW
113 my $match = 1; # Calculated whether test regex should
114 # match or not
115
6d9a9b17
KW
116 # Everything always matches in ASCII, or under /u, or under /l
117 # with a UTF-8 locale
f99a3fe1
KW
118 if (utf8::native_to_unicode($ord) < 128
119 || $charset eq 'u'
120 || $charset eq 'L')
121 {
31f05a37
KW
122 $reason = "\"$char\" is a $class under /$charset_display";
123 $neg_reason = "\"$char\" is not a $complement under /$charset_display";
e8ed1101 124 }
c973bd4f 125 elsif ($charset eq "a" || $charset eq "aa") {
e8ed1101 126 $match = 0;
6d9a9b17
KW
127 $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display";
128 $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display";
e8ed1101
KW
129 }
130 elsif ($ord > 255) {
31f05a37
KW
131 $reason = "\"$char\" is a $class under /$charset_display";
132 $neg_reason = "\"$char\" is not a $complement under /$charset_display";
e8ed1101
KW
133 }
134 elsif ($charset eq 'l') {
135
136 # We are using the C locale, which is essentially ASCII,
137 # but under utf8, the above-latin1 chars are treated as
138 # Unicode)
6d9a9b17
KW
139 $reason = "\"$char\" is not a $class in the C locale under /$charset_mod";
140 $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod";
e8ed1101
KW
141 $match = 0;
142 }
143 elsif ($upgrade) {
6d9a9b17
KW
144 $reason = "\"$char\" is a $class in utf8 under /$charset_display";
145 $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display";
e8ed1101
KW
146 }
147 else {
6d9a9b17
KW
148 $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display";
149 $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)";
e8ed1101
KW
150 $match = 0;
151 }
152 $reason = "; $reason" if $reason;
4fc9fd85 153 $neg_reason = "; $neg_reason" if $neg_reason;
e8ed1101
KW
154
155 my $op;
156 my $neg_op;
157 if ($match) {
158 $op = '=~';
159 $neg_op = '!~';
160 }
161 else {
162 $op = '!~';
163 $neg_op = '=~';
164 }
165
166 # In [...] or not
167 foreach my $bracketed (0, 1) {
168 my $lb = "";
169 my $rb = "";
170 if ($bracketed) {
171
172 # Adds an extra char to the character class to make sure
173 # that the class doesn't get optimized away.
174 $lb = ($bracketed) ? '[_' : "";
175 $rb = ($bracketed) ? ']' : "";
176 }
177 else { # [:posix:] must be inside outer [ ]
178 next if $class =~ /\[/;
179 }
180
181 my $length = 10; # For regexec.c regrepeat() cases by
182 # matching more than one item
183 # Test both class and its complement, and with one or more
184 # than one item to match.
185 foreach my $eval (
31f05a37
KW
186 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
187 qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
1e50d612
KW
188 ) {
189 ok (eval $eval, $eval . $reason);
190 }
191 foreach my $eval (
31f05a37
KW
192 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
193 qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
e8ed1101 194 ) {
4fc9fd85 195 ok (eval $eval, $eval . $neg_reason);
e8ed1101
KW
196 }
197 }
198
199 next if $class ne '\w';
200
201 # Test \b, \B at beginning and end of string
202 foreach my $eval (
31f05a37
KW
203 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
204 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
1e50d612
KW
205 ) {
206 ok (eval $eval, $eval . $reason);
207 }
208 foreach my $eval (
31f05a37
KW
209 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
210 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
e8ed1101 211 ) {
4fc9fd85 212 ok (eval $eval, $eval . $neg_reason);
e8ed1101
KW
213 }
214
215 # Test \b, \B adjacent to a non-word char, both before it and
216 # after. We test with ASCII, Latin1 and Unicode non-word chars
217 foreach my $space_ord (@{$testcases{'\s'}}) {
218
219 # Useless to try to test non-utf8 when the ord itself
220 # forces utf8
221 next if $space_ord > 255 && ! $upgrade;
222
223 my $space = display(chr $space_ord);
224
225 foreach my $eval (
31f05a37
KW
226 qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
227 qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
1e50d612
KW
228 ) {
229 ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
230 }
231 foreach my $eval (
31f05a37
KW
232 qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
233 qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
e8ed1101 234 ) {
4fc9fd85 235 ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
e8ed1101
KW
236 }
237 }
238
239 # Test \b, \B in the middle of two nominally word chars, but
240 # one or both may be considered non-word depending on range
241 # and charset.
242 foreach my $other_ord (@{$testcases{'\w'}}) {
243 next if $other_ord > 255 && ! $upgrade;
244 my $other = display(chr $other_ord);
245
246 # Determine if the other char is a word char in current
247 # circumstances
248 my $other_is_word = 1;
31f05a37
KW
249 my $other_reason = "\"$other\" is a $class under /$charset_display";
250 my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
f99a3fe1 251 if (utf8::native_to_unicode($other_ord) > 127
31f05a37 252 && $charset ne 'u' && $charset ne 'L'
c973bd4f 253 && (($charset eq "a" || $charset eq "aa")
e8ed1101
KW
254 || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
255 {
256 $other_is_word = 0;
31f05a37
KW
257 $other_reason = "\"$other\" is not a $class under /$charset_display";
258 $other_neg_reason = "\"$other\" is a $complement under /$charset_display";
e8ed1101
KW
259 }
260 my $both_reason = $reason;
261 $both_reason .= "; $other_reason" if $other_ord != $ord;
4fc9fd85
KW
262 my $both_neg_reason = $neg_reason;
263 $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
e8ed1101
KW
264
265 # If both are the same wordness, then \b will fail; \B
266 # succeed
267 if ($match == $other_is_word) {
268 $op = '!~';
269 $neg_op = '=~';
270 }
271 else {
272 $op = '=~';
273 $neg_op = '!~';
274 }
275
276 foreach my $eval (
31f05a37
KW
277 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
278 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
1e50d612
KW
279 ) {
280 ok (eval $eval, $eval . $both_reason);
281 }
282 foreach my $eval (
31f05a37
KW
283 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
284 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
e8ed1101 285 ) {
4fc9fd85 286 ok (eval $eval, $eval . $both_neg_reason);
e8ed1101
KW
287 }
288
289 next if $other_ord == $ord;
290
291 # These start with the \b or \B. They are included, based
292 # on source code analysis, to force the testing of the FBC
293 # (find_by_class) portions of regexec.c.
294 foreach my $eval (
31f05a37
KW
295 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
296 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
1e50d612
KW
297 ) {
298 ok (eval $eval, $eval . $both_reason);
299 }
300 foreach my $eval (
31f05a37
KW
301 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
302 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
e8ed1101 303 ) {
4fc9fd85 304 ok (eval $eval, $eval . $both_neg_reason);
e8ed1101
KW
305 }
306 }
307 } # End of each test case in a class
308 } # End of \w, \s, ...
309 } # End of utf8 upgraded or not
310}
311
312plan(curr_test() - 1);