Commit | Line | Data |
---|---|---|
e8ed1101 KW |
1 | # Test the /a, /d, etc regex modifiers |
2 | ||
3 | BEGIN { | |
4 | chdir 't' if -d 't'; | |
5 | @INC = '../lib'; | |
6 | require './test.pl'; | |
31f05a37 | 7 | require './loc_tools.pl'; |
e8ed1101 KW |
8 | } |
9 | ||
10 | use strict; | |
11 | use warnings; | |
ed35198c | 12 | use Config; |
e8ed1101 | 13 | |
aab0501d KW |
14 | plan('no_plan'); |
15 | ||
e8ed1101 KW |
16 | # Each case is a valid element of its hash key. Choose, where available, an |
17 | # ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point. | |
18 | my %testcases = ( | |
19 | '\w' => [ ord("A"), 0xE2, 0x16B ], # Below expects these to all be alpha | |
20 | '\d' => [ ord("0"), 0x0662 ], | |
21 | '\s' => [ ord("\t"), 0xA0, 0x1680 ], # Below expects these to be [:blank:] | |
22 | '[:cntrl:]' => [ 0x00, 0x88 ], | |
23 | '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be | |
24 | # [:print:] | |
25 | '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ], | |
26 | '[:punct:]' => [ ord("!"), 0xBF, 0x055C ], | |
27 | '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ], | |
28 | '[:xdigit:]' => [ ord("4"), 0xFF15 ], | |
29 | ); | |
30 | ||
31 | $testcases{'[:digit:]'} = $testcases{'\d'}; | |
32 | $testcases{'[:alnum:]'} = $testcases{'\w'}; | |
33 | $testcases{'[:alpha:]'} = $testcases{'\w'}; | |
34 | $testcases{'[:blank:]'} = $testcases{'\s'}; | |
35 | $testcases{'[:print:]'} = $testcases{'[:graph:]'}; | |
36 | $testcases{'[:space:]'} = $testcases{'\s'}; | |
37 | $testcases{'[:word:]'} = $testcases{'\w'}; | |
38 | ||
31f05a37 KW |
39 | my $utf8_locale; |
40 | ||
c973bd4f | 41 | my @charsets = qw(a d u aa); |
569f7fc5 | 42 | if (! is_miniperl() && $Config{d_setlocale}) { |
09fcee4e KW |
43 | require POSIX; |
44 | my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // ""; | |
fff7535c | 45 | if ($current_locale eq 'C') { |
ed35198c KW |
46 | |
47 | # test for d_setlocale is repeated here because this one is compile | |
48 | # time, and the one above is run time | |
49 | use if $Config{d_setlocale}, 'locale'; | |
fff7535c | 50 | |
0e18027d KW |
51 | # Some implementations don't have the 128-255 range characters all |
52 | # mean nothing under the C locale (an example being VMS). This is | |
53 | # legal, but since we don't know what the right answers should be, | |
54 | # skip the locale tests in that situation. | |
fff7535c | 55 | for my $i (128 .. 255) { |
31f05a37 | 56 | goto skip_adding_C_locale if chr($i) =~ /[[:print:]]/; |
fff7535c KW |
57 | } |
58 | push @charsets, 'l'; | |
31f05a37 KW |
59 | |
60 | skip_adding_C_locale: | |
61 | ||
62 | # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale | |
9b0711ee | 63 | $utf8_locale = find_utf8_ctype_locale(); |
31f05a37 | 64 | push @charsets, 'L' if defined $utf8_locale; |
fff7535c | 65 | } |
09fcee4e KW |
66 | } |
67 | ||
e8ed1101 | 68 | # For each possible character set... |
158ba892 | 69 | foreach my $charset (@charsets) { |
31f05a37 KW |
70 | my $locale; |
71 | my $charset_mod = lc $charset; | |
72 | my $charset_display; | |
73 | if ($charset_mod eq 'l') { | |
74 | $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l') | |
75 | ? "C" | |
76 | : $utf8_locale | |
77 | ); | |
78 | die "Couldn't change locale" unless $locale; | |
79 | $charset_display = $charset_mod . " ($locale)"; | |
80 | } | |
81 | else { | |
82 | $charset_display = $charset_mod; | |
83 | } | |
e8ed1101 KW |
84 | |
85 | # And in utf8 or not | |
86 | foreach my $upgrade ("", 'utf8::upgrade($a); ') { | |
87 | ||
88 | # reverse gets the, \w, \s, \d first. | |
89 | for my $class (reverse sort keys %testcases) { | |
90 | ||
91 | # The complement of \w is \W; of [:posix:] is [:^posix:] | |
92 | my $complement = $class; | |
93 | if ($complement !~ s/ ( \[: ) /$1^/x) { | |
94 | $complement = uc($class); | |
95 | } | |
96 | ||
97 | # For each test case | |
98 | foreach my $ord (@{$testcases{$class}}) { | |
99 | my $char = display(chr($ord)); | |
100 | ||
101 | # > 255 already implies upgraded. Skip the ones that don't | |
9d44f25f | 102 | # have an explicit upgrade. This shows more clearly in the |
e8ed1101 KW |
103 | # output which tests are in utf8, or not. |
104 | next if $ord > 255 && ! $upgrade; | |
105 | ||
106 | my $reason = ""; # Explanation output with each test | |
4fc9fd85 | 107 | my $neg_reason = ""; |
e8ed1101 KW |
108 | my $match = 1; # Calculated whether test regex should |
109 | # match or not | |
110 | ||
6d9a9b17 KW |
111 | # Everything always matches in ASCII, or under /u, or under /l |
112 | # with a UTF-8 locale | |
31f05a37 KW |
113 | if ($ord < 128 || $charset eq 'u' || $charset eq 'L') { |
114 | $reason = "\"$char\" is a $class under /$charset_display"; | |
115 | $neg_reason = "\"$char\" is not a $complement under /$charset_display"; | |
e8ed1101 | 116 | } |
c973bd4f | 117 | elsif ($charset eq "a" || $charset eq "aa") { |
e8ed1101 | 118 | $match = 0; |
6d9a9b17 KW |
119 | $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display"; |
120 | $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display"; | |
e8ed1101 KW |
121 | } |
122 | elsif ($ord > 255) { | |
31f05a37 KW |
123 | $reason = "\"$char\" is a $class under /$charset_display"; |
124 | $neg_reason = "\"$char\" is not a $complement under /$charset_display"; | |
e8ed1101 KW |
125 | } |
126 | elsif ($charset eq 'l') { | |
127 | ||
128 | # We are using the C locale, which is essentially ASCII, | |
129 | # but under utf8, the above-latin1 chars are treated as | |
130 | # Unicode) | |
6d9a9b17 KW |
131 | $reason = "\"$char\" is not a $class in the C locale under /$charset_mod"; |
132 | $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod"; | |
e8ed1101 KW |
133 | $match = 0; |
134 | } | |
135 | elsif ($upgrade) { | |
6d9a9b17 KW |
136 | $reason = "\"$char\" is a $class in utf8 under /$charset_display"; |
137 | $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display"; | |
e8ed1101 KW |
138 | } |
139 | else { | |
6d9a9b17 KW |
140 | $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display"; |
141 | $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)"; | |
e8ed1101 KW |
142 | $match = 0; |
143 | } | |
144 | $reason = "; $reason" if $reason; | |
4fc9fd85 | 145 | $neg_reason = "; $neg_reason" if $neg_reason; |
e8ed1101 KW |
146 | |
147 | my $op; | |
148 | my $neg_op; | |
149 | if ($match) { | |
150 | $op = '=~'; | |
151 | $neg_op = '!~'; | |
152 | } | |
153 | else { | |
154 | $op = '!~'; | |
155 | $neg_op = '=~'; | |
156 | } | |
157 | ||
158 | # In [...] or not | |
159 | foreach my $bracketed (0, 1) { | |
160 | my $lb = ""; | |
161 | my $rb = ""; | |
162 | if ($bracketed) { | |
163 | ||
164 | # Adds an extra char to the character class to make sure | |
165 | # that the class doesn't get optimized away. | |
166 | $lb = ($bracketed) ? '[_' : ""; | |
167 | $rb = ($bracketed) ? ']' : ""; | |
168 | } | |
169 | else { # [:posix:] must be inside outer [ ] | |
170 | next if $class =~ /\[/; | |
171 | } | |
172 | ||
173 | my $length = 10; # For regexec.c regrepeat() cases by | |
174 | # matching more than one item | |
175 | # Test both class and its complement, and with one or more | |
176 | # than one item to match. | |
177 | foreach my $eval ( | |
31f05a37 KW |
178 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x], |
179 | qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x], | |
1e50d612 KW |
180 | ) { |
181 | ok (eval $eval, $eval . $reason); | |
182 | } | |
183 | foreach my $eval ( | |
31f05a37 KW |
184 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x], |
185 | qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x], | |
e8ed1101 | 186 | ) { |
4fc9fd85 | 187 | ok (eval $eval, $eval . $neg_reason); |
e8ed1101 KW |
188 | } |
189 | } | |
190 | ||
191 | next if $class ne '\w'; | |
192 | ||
193 | # Test \b, \B at beginning and end of string | |
194 | foreach my $eval ( | |
31f05a37 KW |
195 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x], |
196 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x], | |
1e50d612 KW |
197 | ) { |
198 | ok (eval $eval, $eval . $reason); | |
199 | } | |
200 | foreach my $eval ( | |
31f05a37 KW |
201 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x], |
202 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x], | |
e8ed1101 | 203 | ) { |
4fc9fd85 | 204 | ok (eval $eval, $eval . $neg_reason); |
e8ed1101 KW |
205 | } |
206 | ||
207 | # Test \b, \B adjacent to a non-word char, both before it and | |
208 | # after. We test with ASCII, Latin1 and Unicode non-word chars | |
209 | foreach my $space_ord (@{$testcases{'\s'}}) { | |
210 | ||
211 | # Useless to try to test non-utf8 when the ord itself | |
212 | # forces utf8 | |
213 | next if $space_ord > 255 && ! $upgrade; | |
214 | ||
215 | my $space = display(chr $space_ord); | |
216 | ||
217 | foreach my $eval ( | |
31f05a37 KW |
218 | qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], |
219 | qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], | |
1e50d612 KW |
220 | ) { |
221 | ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w"); | |
222 | } | |
223 | foreach my $eval ( | |
31f05a37 KW |
224 | qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], |
225 | qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], | |
e8ed1101 | 226 | ) { |
4fc9fd85 | 227 | ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w"); |
e8ed1101 KW |
228 | } |
229 | } | |
230 | ||
231 | # Test \b, \B in the middle of two nominally word chars, but | |
232 | # one or both may be considered non-word depending on range | |
233 | # and charset. | |
234 | foreach my $other_ord (@{$testcases{'\w'}}) { | |
235 | next if $other_ord > 255 && ! $upgrade; | |
236 | my $other = display(chr $other_ord); | |
237 | ||
238 | # Determine if the other char is a word char in current | |
239 | # circumstances | |
240 | my $other_is_word = 1; | |
31f05a37 KW |
241 | my $other_reason = "\"$other\" is a $class under /$charset_display"; |
242 | my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display"; | |
e8ed1101 | 243 | if ($other_ord > 127 |
31f05a37 | 244 | && $charset ne 'u' && $charset ne 'L' |
c973bd4f | 245 | && (($charset eq "a" || $charset eq "aa") |
e8ed1101 KW |
246 | || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade)))) |
247 | { | |
248 | $other_is_word = 0; | |
31f05a37 KW |
249 | $other_reason = "\"$other\" is not a $class under /$charset_display"; |
250 | $other_neg_reason = "\"$other\" is a $complement under /$charset_display"; | |
e8ed1101 KW |
251 | } |
252 | my $both_reason = $reason; | |
253 | $both_reason .= "; $other_reason" if $other_ord != $ord; | |
4fc9fd85 KW |
254 | my $both_neg_reason = $neg_reason; |
255 | $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord; | |
e8ed1101 KW |
256 | |
257 | # If both are the same wordness, then \b will fail; \B | |
258 | # succeed | |
259 | if ($match == $other_is_word) { | |
260 | $op = '!~'; | |
261 | $neg_op = '=~'; | |
262 | } | |
263 | else { | |
264 | $op = '=~'; | |
265 | $neg_op = '!~'; | |
266 | } | |
267 | ||
268 | foreach my $eval ( | |
31f05a37 KW |
269 | qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x], |
270 | qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x], | |
1e50d612 KW |
271 | ) { |
272 | ok (eval $eval, $eval . $both_reason); | |
273 | } | |
274 | foreach my $eval ( | |
31f05a37 KW |
275 | qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x], |
276 | qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x], | |
e8ed1101 | 277 | ) { |
4fc9fd85 | 278 | ok (eval $eval, $eval . $both_neg_reason); |
e8ed1101 KW |
279 | } |
280 | ||
281 | next if $other_ord == $ord; | |
282 | ||
283 | # These start with the \b or \B. They are included, based | |
284 | # on source code analysis, to force the testing of the FBC | |
285 | # (find_by_class) portions of regexec.c. | |
286 | foreach my $eval ( | |
31f05a37 KW |
287 | qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x], |
288 | qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x], | |
1e50d612 KW |
289 | ) { |
290 | ok (eval $eval, $eval . $both_reason); | |
291 | } | |
292 | foreach my $eval ( | |
31f05a37 KW |
293 | qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x], |
294 | qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x], | |
e8ed1101 | 295 | ) { |
4fc9fd85 | 296 | ok (eval $eval, $eval . $both_neg_reason); |
e8ed1101 KW |
297 | } |
298 | } | |
299 | } # End of each test case in a class | |
300 | } # End of \w, \s, ... | |
301 | } # End of utf8 upgraded or not | |
302 | } | |
303 | ||
304 | plan(curr_test() - 1); |