Commit | Line | Data |
---|---|---|
e8ed1101 KW |
1 | # Test the /a, /d, etc regex modifiers |
2 | ||
3 | BEGIN { | |
4 | chdir 't' if -d 't'; | |
e8ed1101 | 5 | require './test.pl'; |
31f05a37 | 6 | require './loc_tools.pl'; |
624c42e2 | 7 | set_up_inc('../lib', '../dist/if'); |
e8ed1101 KW |
8 | } |
9 | ||
10 | use strict; | |
11 | use warnings; | |
613abc6d KW |
12 | no warnings 'locale'; # Some /l tests use above-latin1 chars to make sure |
13 | # they work, even though they warn. | |
ed35198c | 14 | use Config; |
e8ed1101 | 15 | |
aab0501d KW |
16 | plan('no_plan'); |
17 | ||
e8ed1101 KW |
18 | # Each case is a valid element of its hash key. Choose, where available, an |
19 | # ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point. | |
20 | my %testcases = ( | |
f99a3fe1 | 21 | '\w' => [ ord("A"), utf8::unicode_to_native(0xE2), 0x16B ], # Below expects these to all be alpha |
e8ed1101 | 22 | '\d' => [ ord("0"), 0x0662 ], |
f99a3fe1 KW |
23 | '\s' => [ ord("\t"), utf8::unicode_to_native(0xA0), 0x1680 ], # Below expects these to be [:blank:] |
24 | '[:cntrl:]' => [ utf8::unicode_to_native(0x00), utf8::unicode_to_native(0x88) ], | |
25 | '[:graph:]' => [ ord("&"), utf8::unicode_to_native(0xF7), 0x02C7 ], # Below expects these to be | |
26 | # [:print:] | |
27 | '[:lower:]' => [ ord("g"), utf8::unicode_to_native(0xE3), 0x0127 ], | |
28 | '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), ord('>'), ord('|'), ord('-'), ord(','), ord(';'), ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), ord('"'), ord('('), ord(')'), ord('['), ord(']'), ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), ord("'"), utf8::unicode_to_native(0xBF), 0x055C ], | |
29 | '[:upper:]' => [ ord("G"), utf8::unicode_to_native(0xC3), 0x0126 ], | |
e8ed1101 KW |
30 | '[:xdigit:]' => [ ord("4"), 0xFF15 ], |
31 | ); | |
32 | ||
33 | $testcases{'[:digit:]'} = $testcases{'\d'}; | |
34 | $testcases{'[:alnum:]'} = $testcases{'\w'}; | |
35 | $testcases{'[:alpha:]'} = $testcases{'\w'}; | |
36 | $testcases{'[:blank:]'} = $testcases{'\s'}; | |
37 | $testcases{'[:print:]'} = $testcases{'[:graph:]'}; | |
38 | $testcases{'[:space:]'} = $testcases{'\s'}; | |
39 | $testcases{'[:word:]'} = $testcases{'\w'}; | |
40 | ||
31f05a37 KW |
41 | my $utf8_locale; |
42 | ||
c973bd4f | 43 | my @charsets = qw(a d u aa); |
624c42e2 N |
44 | my $locales_ok = eval { locales_enabled('LC_CTYPE'); 1 }; |
45 | if (! is_miniperl() && $locales_ok) { | |
09fcee4e KW |
46 | require POSIX; |
47 | my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // ""; | |
fff7535c | 48 | if ($current_locale eq 'C') { |
ed35198c KW |
49 | |
50 | # test for d_setlocale is repeated here because this one is compile | |
51 | # time, and the one above is run time | |
52 | use if $Config{d_setlocale}, 'locale'; | |
fff7535c | 53 | |
0e18027d KW |
54 | # Some implementations don't have the 128-255 range characters all |
55 | # mean nothing under the C locale (an example being VMS). This is | |
56 | # legal, but since we don't know what the right answers should be, | |
57 | # skip the locale tests in that situation. | |
fff7535c | 58 | for my $i (128 .. 255) { |
f99a3fe1 KW |
59 | goto skip_adding_C_locale |
60 | if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/; | |
fff7535c KW |
61 | } |
62 | push @charsets, 'l'; | |
31f05a37 KW |
63 | |
64 | skip_adding_C_locale: | |
65 | ||
66 | # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale | |
9b0711ee | 67 | $utf8_locale = find_utf8_ctype_locale(); |
31f05a37 | 68 | push @charsets, 'L' if defined $utf8_locale; |
fff7535c | 69 | } |
09fcee4e KW |
70 | } |
71 | ||
e8ed1101 | 72 | # For each possible character set... |
158ba892 | 73 | foreach my $charset (@charsets) { |
31f05a37 KW |
74 | my $locale; |
75 | my $charset_mod = lc $charset; | |
76 | my $charset_display; | |
77 | if ($charset_mod eq 'l') { | |
78 | $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l') | |
79 | ? "C" | |
80 | : $utf8_locale | |
81 | ); | |
82 | die "Couldn't change locale" unless $locale; | |
83 | $charset_display = $charset_mod . " ($locale)"; | |
84 | } | |
85 | else { | |
86 | $charset_display = $charset_mod; | |
87 | } | |
e8ed1101 KW |
88 | |
89 | # And in utf8 or not | |
90 | foreach my $upgrade ("", 'utf8::upgrade($a); ') { | |
91 | ||
92 | # reverse gets the, \w, \s, \d first. | |
93 | for my $class (reverse sort keys %testcases) { | |
94 | ||
95 | # The complement of \w is \W; of [:posix:] is [:^posix:] | |
96 | my $complement = $class; | |
97 | if ($complement !~ s/ ( \[: ) /$1^/x) { | |
98 | $complement = uc($class); | |
99 | } | |
100 | ||
101 | # For each test case | |
102 | foreach my $ord (@{$testcases{$class}}) { | |
f99a3fe1 KW |
103 | my $char = chr($ord); |
104 | $char = ($char eq '$') ? '\$' : display($char); | |
e8ed1101 KW |
105 | |
106 | # > 255 already implies upgraded. Skip the ones that don't | |
9d44f25f | 107 | # have an explicit upgrade. This shows more clearly in the |
e8ed1101 KW |
108 | # output which tests are in utf8, or not. |
109 | next if $ord > 255 && ! $upgrade; | |
110 | ||
111 | my $reason = ""; # Explanation output with each test | |
4fc9fd85 | 112 | my $neg_reason = ""; |
e8ed1101 KW |
113 | my $match = 1; # Calculated whether test regex should |
114 | # match or not | |
115 | ||
6d9a9b17 KW |
116 | # Everything always matches in ASCII, or under /u, or under /l |
117 | # with a UTF-8 locale | |
f99a3fe1 KW |
118 | if (utf8::native_to_unicode($ord) < 128 |
119 | || $charset eq 'u' | |
120 | || $charset eq 'L') | |
121 | { | |
31f05a37 KW |
122 | $reason = "\"$char\" is a $class under /$charset_display"; |
123 | $neg_reason = "\"$char\" is not a $complement under /$charset_display"; | |
e8ed1101 | 124 | } |
c973bd4f | 125 | elsif ($charset eq "a" || $charset eq "aa") { |
e8ed1101 | 126 | $match = 0; |
6d9a9b17 KW |
127 | $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display"; |
128 | $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display"; | |
e8ed1101 KW |
129 | } |
130 | elsif ($ord > 255) { | |
31f05a37 KW |
131 | $reason = "\"$char\" is a $class under /$charset_display"; |
132 | $neg_reason = "\"$char\" is not a $complement under /$charset_display"; | |
e8ed1101 KW |
133 | } |
134 | elsif ($charset eq 'l') { | |
135 | ||
136 | # We are using the C locale, which is essentially ASCII, | |
137 | # but under utf8, the above-latin1 chars are treated as | |
138 | # Unicode) | |
6d9a9b17 KW |
139 | $reason = "\"$char\" is not a $class in the C locale under /$charset_mod"; |
140 | $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod"; | |
e8ed1101 KW |
141 | $match = 0; |
142 | } | |
143 | elsif ($upgrade) { | |
6d9a9b17 KW |
144 | $reason = "\"$char\" is a $class in utf8 under /$charset_display"; |
145 | $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display"; | |
e8ed1101 KW |
146 | } |
147 | else { | |
6d9a9b17 KW |
148 | $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display"; |
149 | $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)"; | |
e8ed1101 KW |
150 | $match = 0; |
151 | } | |
152 | $reason = "; $reason" if $reason; | |
4fc9fd85 | 153 | $neg_reason = "; $neg_reason" if $neg_reason; |
e8ed1101 KW |
154 | |
155 | my $op; | |
156 | my $neg_op; | |
157 | if ($match) { | |
158 | $op = '=~'; | |
159 | $neg_op = '!~'; | |
160 | } | |
161 | else { | |
162 | $op = '!~'; | |
163 | $neg_op = '=~'; | |
164 | } | |
165 | ||
166 | # In [...] or not | |
167 | foreach my $bracketed (0, 1) { | |
168 | my $lb = ""; | |
169 | my $rb = ""; | |
170 | if ($bracketed) { | |
171 | ||
172 | # Adds an extra char to the character class to make sure | |
173 | # that the class doesn't get optimized away. | |
174 | $lb = ($bracketed) ? '[_' : ""; | |
175 | $rb = ($bracketed) ? ']' : ""; | |
176 | } | |
177 | else { # [:posix:] must be inside outer [ ] | |
178 | next if $class =~ /\[/; | |
179 | } | |
180 | ||
181 | my $length = 10; # For regexec.c regrepeat() cases by | |
182 | # matching more than one item | |
183 | # Test both class and its complement, and with one or more | |
184 | # than one item to match. | |
185 | foreach my $eval ( | |
31f05a37 KW |
186 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x], |
187 | qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x], | |
1e50d612 KW |
188 | ) { |
189 | ok (eval $eval, $eval . $reason); | |
190 | } | |
191 | foreach my $eval ( | |
31f05a37 KW |
192 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x], |
193 | qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x], | |
e8ed1101 | 194 | ) { |
4fc9fd85 | 195 | ok (eval $eval, $eval . $neg_reason); |
e8ed1101 KW |
196 | } |
197 | } | |
198 | ||
199 | next if $class ne '\w'; | |
200 | ||
201 | # Test \b, \B at beginning and end of string | |
202 | foreach my $eval ( | |
31f05a37 KW |
203 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x], |
204 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x], | |
1e50d612 KW |
205 | ) { |
206 | ok (eval $eval, $eval . $reason); | |
207 | } | |
208 | foreach my $eval ( | |
31f05a37 KW |
209 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x], |
210 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x], | |
e8ed1101 | 211 | ) { |
4fc9fd85 | 212 | ok (eval $eval, $eval . $neg_reason); |
e8ed1101 KW |
213 | } |
214 | ||
215 | # Test \b, \B adjacent to a non-word char, both before it and | |
216 | # after. We test with ASCII, Latin1 and Unicode non-word chars | |
217 | foreach my $space_ord (@{$testcases{'\s'}}) { | |
218 | ||
219 | # Useless to try to test non-utf8 when the ord itself | |
220 | # forces utf8 | |
221 | next if $space_ord > 255 && ! $upgrade; | |
222 | ||
223 | my $space = display(chr $space_ord); | |
224 | ||
225 | foreach my $eval ( | |
31f05a37 KW |
226 | qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], |
227 | qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], | |
1e50d612 KW |
228 | ) { |
229 | ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w"); | |
230 | } | |
231 | foreach my $eval ( | |
31f05a37 KW |
232 | qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], |
233 | qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], | |
e8ed1101 | 234 | ) { |
4fc9fd85 | 235 | ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w"); |
e8ed1101 KW |
236 | } |
237 | } | |
238 | ||
239 | # Test \b, \B in the middle of two nominally word chars, but | |
240 | # one or both may be considered non-word depending on range | |
241 | # and charset. | |
242 | foreach my $other_ord (@{$testcases{'\w'}}) { | |
243 | next if $other_ord > 255 && ! $upgrade; | |
244 | my $other = display(chr $other_ord); | |
245 | ||
246 | # Determine if the other char is a word char in current | |
247 | # circumstances | |
248 | my $other_is_word = 1; | |
31f05a37 KW |
249 | my $other_reason = "\"$other\" is a $class under /$charset_display"; |
250 | my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display"; | |
f99a3fe1 | 251 | if (utf8::native_to_unicode($other_ord) > 127 |
31f05a37 | 252 | && $charset ne 'u' && $charset ne 'L' |
c973bd4f | 253 | && (($charset eq "a" || $charset eq "aa") |
e8ed1101 KW |
254 | || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade)))) |
255 | { | |
256 | $other_is_word = 0; | |
31f05a37 KW |
257 | $other_reason = "\"$other\" is not a $class under /$charset_display"; |
258 | $other_neg_reason = "\"$other\" is a $complement under /$charset_display"; | |
e8ed1101 KW |
259 | } |
260 | my $both_reason = $reason; | |
261 | $both_reason .= "; $other_reason" if $other_ord != $ord; | |
4fc9fd85 KW |
262 | my $both_neg_reason = $neg_reason; |
263 | $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord; | |
e8ed1101 KW |
264 | |
265 | # If both are the same wordness, then \b will fail; \B | |
266 | # succeed | |
267 | if ($match == $other_is_word) { | |
268 | $op = '!~'; | |
269 | $neg_op = '=~'; | |
270 | } | |
271 | else { | |
272 | $op = '=~'; | |
273 | $neg_op = '!~'; | |
274 | } | |
275 | ||
276 | foreach my $eval ( | |
31f05a37 KW |
277 | qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x], |
278 | qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x], | |
1e50d612 KW |
279 | ) { |
280 | ok (eval $eval, $eval . $both_reason); | |
281 | } | |
282 | foreach my $eval ( | |
31f05a37 KW |
283 | qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x], |
284 | qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x], | |
e8ed1101 | 285 | ) { |
4fc9fd85 | 286 | ok (eval $eval, $eval . $both_neg_reason); |
e8ed1101 KW |
287 | } |
288 | ||
289 | next if $other_ord == $ord; | |
290 | ||
291 | # These start with the \b or \B. They are included, based | |
292 | # on source code analysis, to force the testing of the FBC | |
293 | # (find_by_class) portions of regexec.c. | |
294 | foreach my $eval ( | |
31f05a37 KW |
295 | qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x], |
296 | qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x], | |
1e50d612 KW |
297 | ) { |
298 | ok (eval $eval, $eval . $both_reason); | |
299 | } | |
300 | foreach my $eval ( | |
31f05a37 KW |
301 | qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x], |
302 | qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x], | |
e8ed1101 | 303 | ) { |
4fc9fd85 | 304 | ok (eval $eval, $eval . $both_neg_reason); |
e8ed1101 KW |
305 | } |
306 | } | |
307 | } # End of each test case in a class | |
308 | } # End of \w, \s, ... | |
309 | } # End of utf8 upgraded or not | |
310 | } | |
311 | ||
312 | plan(curr_test() - 1); |