This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re/charset.t: Make sure locale is loaded at compile time
[perl5.git] / t / re / charset.t
CommitLineData
e8ed1101
KW
1# Test the /a, /d, etc regex modifiers
2
3BEGIN {
4 chdir 't' if -d 't';
5 @INC = '../lib';
6 require './test.pl';
7}
8
9use strict;
10use warnings;
ed35198c 11use Config;
e8ed1101 12
aab0501d
KW
13plan('no_plan');
14
e8ed1101
KW
15# Each case is a valid element of its hash key. Choose, where available, an
16# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
17my %testcases = (
18 '\w' => [ ord("A"), 0xE2, 0x16B ], # Below expects these to all be alpha
19 '\d' => [ ord("0"), 0x0662 ],
20 '\s' => [ ord("\t"), 0xA0, 0x1680 ], # Below expects these to be [:blank:]
21 '[:cntrl:]' => [ 0x00, 0x88 ],
22 '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be
23 # [:print:]
24 '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ],
25 '[:punct:]' => [ ord("!"), 0xBF, 0x055C ],
26 '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ],
27 '[:xdigit:]' => [ ord("4"), 0xFF15 ],
28);
29
30$testcases{'[:digit:]'} = $testcases{'\d'};
31$testcases{'[:alnum:]'} = $testcases{'\w'};
32$testcases{'[:alpha:]'} = $testcases{'\w'};
33$testcases{'[:blank:]'} = $testcases{'\s'};
34$testcases{'[:print:]'} = $testcases{'[:graph:]'};
35$testcases{'[:space:]'} = $testcases{'\s'};
36$testcases{'[:word:]'} = $testcases{'\w'};
37
c973bd4f 38my @charsets = qw(a d u aa);
569f7fc5 39if (! is_miniperl() && $Config{d_setlocale}) {
09fcee4e
KW
40 require POSIX;
41 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
fff7535c 42 if ($current_locale eq 'C') {
ed35198c
KW
43
44 # test for d_setlocale is repeated here because this one is compile
45 # time, and the one above is run time
46 use if $Config{d_setlocale}, 'locale';
fff7535c
KW
47
48 # Some locale implementations don't have the 128-255 characters all
49 # mean nothing. Skip the locale tests in that situation
50 for my $i (128 .. 255) {
51 goto bad_locale if chr($i) =~ /[[:print:]]/;
52 }
53 push @charsets, 'l';
54 bad_locale:
55 }
09fcee4e
KW
56}
57
e8ed1101 58# For each possible character set...
158ba892 59foreach my $charset (@charsets) {
e8ed1101
KW
60
61 # And in utf8 or not
62 foreach my $upgrade ("", 'utf8::upgrade($a); ') {
63
64 # reverse gets the, \w, \s, \d first.
65 for my $class (reverse sort keys %testcases) {
66
67 # The complement of \w is \W; of [:posix:] is [:^posix:]
68 my $complement = $class;
69 if ($complement !~ s/ ( \[: ) /$1^/x) {
70 $complement = uc($class);
71 }
72
73 # For each test case
74 foreach my $ord (@{$testcases{$class}}) {
75 my $char = display(chr($ord));
76
77 # > 255 already implies upgraded. Skip the ones that don't
9d44f25f 78 # have an explicit upgrade. This shows more clearly in the
e8ed1101
KW
79 # output which tests are in utf8, or not.
80 next if $ord > 255 && ! $upgrade;
81
82 my $reason = ""; # Explanation output with each test
4fc9fd85 83 my $neg_reason = "";
e8ed1101
KW
84 my $match = 1; # Calculated whether test regex should
85 # match or not
86
87 # Everything always matches in ASCII, or under /u
88 if ($ord < 128 || $charset eq 'u') {
89 $reason = "\"$char\" is a $class under /$charset";
4fc9fd85 90 $neg_reason = "\"$char\" is not a $complement under /$charset";
e8ed1101 91 }
c973bd4f 92 elsif ($charset eq "a" || $charset eq "aa") {
e8ed1101
KW
93 $match = 0;
94 $reason = "\"$char\" is non-ASCII, which can't be a $class under /a";
4fc9fd85 95 $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /a";
e8ed1101
KW
96 }
97 elsif ($ord > 255) {
98 $reason = "\"$char\" is a $class under /$charset";
4fc9fd85 99 $neg_reason = "\"$char\" is not a $complement under /$charset";
e8ed1101
KW
100 }
101 elsif ($charset eq 'l') {
102
103 # We are using the C locale, which is essentially ASCII,
104 # but under utf8, the above-latin1 chars are treated as
105 # Unicode)
106 $reason = "\"$char\" is not a $class in this locale under /l";
4fc9fd85 107 $neg_reason = "\"$char\" is a $complement in this locale under /l";
e8ed1101
KW
108 $match = 0;
109 }
110 elsif ($upgrade) {
111 $reason = "\"$char\" is a $class in utf8 under /d";
4fc9fd85 112 $neg_reason = "\"$char\" is not a $complement in utf8 under /d";
e8ed1101
KW
113 }
114 else {
4fc9fd85
KW
115 $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /d";
116 $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /d (unless in utf8)";
e8ed1101
KW
117 $match = 0;
118 }
119 $reason = "; $reason" if $reason;
4fc9fd85 120 $neg_reason = "; $neg_reason" if $neg_reason;
e8ed1101
KW
121
122 my $op;
123 my $neg_op;
124 if ($match) {
125 $op = '=~';
126 $neg_op = '!~';
127 }
128 else {
129 $op = '!~';
130 $neg_op = '=~';
131 }
132
133 # In [...] or not
134 foreach my $bracketed (0, 1) {
135 my $lb = "";
136 my $rb = "";
137 if ($bracketed) {
138
139 # Adds an extra char to the character class to make sure
140 # that the class doesn't get optimized away.
141 $lb = ($bracketed) ? '[_' : "";
142 $rb = ($bracketed) ? ']' : "";
143 }
144 else { # [:posix:] must be inside outer [ ]
145 next if $class =~ /\[/;
146 }
147
148 my $length = 10; # For regexec.c regrepeat() cases by
149 # matching more than one item
150 # Test both class and its complement, and with one or more
151 # than one item to match.
152 foreach my $eval (
153 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: $lb$class$rb ) /x],
e8ed1101 154 qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset: $lb$class$rb\{$length} ) /x],
1e50d612
KW
155 ) {
156 ok (eval $eval, $eval . $reason);
157 }
158 foreach my $eval (
159 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb ) /x],
e8ed1101
KW
160 qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb\{$length} ) /x],
161 ) {
4fc9fd85 162 ok (eval $eval, $eval . $neg_reason);
e8ed1101
KW
163 }
164 }
165
166 next if $class ne '\w';
167
168 # Test \b, \B at beginning and end of string
169 foreach my $eval (
170 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: ^ \\b . ) /x],
171 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: . \\b \$) /x],
1e50d612
KW
172 ) {
173 ok (eval $eval, $eval . $reason);
174 }
175 foreach my $eval (
e8ed1101
KW
176 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: ^ \\B . ) /x],
177 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: . \\B \$ ) /x],
178 ) {
4fc9fd85 179 ok (eval $eval, $eval . $neg_reason);
e8ed1101
KW
180 }
181
182 # Test \b, \B adjacent to a non-word char, both before it and
183 # after. We test with ASCII, Latin1 and Unicode non-word chars
184 foreach my $space_ord (@{$testcases{'\s'}}) {
185
186 # Useless to try to test non-utf8 when the ord itself
187 # forces utf8
188 next if $space_ord > 255 && ! $upgrade;
189
190 my $space = display(chr $space_ord);
191
192 foreach my $eval (
193 qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
194 qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
1e50d612
KW
195 ) {
196 ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
197 }
198 foreach my $eval (
e8ed1101
KW
199 qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
200 qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
201 ) {
4fc9fd85 202 ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
e8ed1101
KW
203 }
204 }
205
206 # Test \b, \B in the middle of two nominally word chars, but
207 # one or both may be considered non-word depending on range
208 # and charset.
209 foreach my $other_ord (@{$testcases{'\w'}}) {
210 next if $other_ord > 255 && ! $upgrade;
211 my $other = display(chr $other_ord);
212
213 # Determine if the other char is a word char in current
214 # circumstances
215 my $other_is_word = 1;
216 my $other_reason = "\"$other\" is a $class under /$charset";
4fc9fd85 217 my $other_neg_reason = "\"$other\" is not a $complement under /$charset";
e8ed1101
KW
218 if ($other_ord > 127
219 && $charset ne 'u'
c973bd4f 220 && (($charset eq "a" || $charset eq "aa")
e8ed1101
KW
221 || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
222 {
223 $other_is_word = 0;
224 $other_reason = "\"$other\" is not a $class under /$charset";
4fc9fd85 225 $other_neg_reason = "\"$other\" is a $complement under /$charset";
e8ed1101
KW
226 }
227 my $both_reason = $reason;
228 $both_reason .= "; $other_reason" if $other_ord != $ord;
4fc9fd85
KW
229 my $both_neg_reason = $neg_reason;
230 $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
e8ed1101
KW
231
232 # If both are the same wordness, then \b will fail; \B
233 # succeed
234 if ($match == $other_is_word) {
235 $op = '!~';
236 $neg_op = '=~';
237 }
238 else {
239 $op = '=~';
240 $neg_op = '!~';
241 }
242
243 foreach my $eval (
244 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: $other \\b $char ) /x],
245 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: $char \\b $other ) /x],
1e50d612
KW
246 ) {
247 ok (eval $eval, $eval . $both_reason);
248 }
249 foreach my $eval (
e8ed1101
KW
250 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: $other \\B $char ) /x],
251 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: $char \\B $other ) /x],
252 ) {
4fc9fd85 253 ok (eval $eval, $eval . $both_neg_reason);
e8ed1101
KW
254 }
255
256 next if $other_ord == $ord;
257
258 # These start with the \b or \B. They are included, based
259 # on source code analysis, to force the testing of the FBC
260 # (find_by_class) portions of regexec.c.
261 foreach my $eval (
262 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: \\b $char ) /x],
263 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: \\b $other ) /x],
1e50d612
KW
264 ) {
265 ok (eval $eval, $eval . $both_reason);
266 }
267 foreach my $eval (
e8ed1101
KW
268 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: \\B $char ) /x],
269 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: \\B $other ) /x],
270 ) {
4fc9fd85 271 ok (eval $eval, $eval . $both_neg_reason);
e8ed1101
KW
272 }
273 }
274 } # End of each test case in a class
275 } # End of \w, \s, ...
276 } # End of utf8 upgraded or not
277}
278
279plan(curr_test() - 1);