1 # Test the /a, /d, etc regex modifiers
6 set_up_inc('../lib', '../dist/if');
7 require './loc_tools.pl';
12 no warnings 'locale'; # Some /l tests use above-latin1 chars to make sure
13 # they work, even though they warn.
18 # Each case is a valid element of its hash key. Choose, where available, an
19 # ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
21 '\w' => [ ord("A"), utf8::unicode_to_native(0xE2), 0x16B ], # Below expects these to all be alpha
22 '\d' => [ ord("0"), 0x0662 ],
23 '\s' => [ ord("\t"), utf8::unicode_to_native(0xA0), 0x1680 ], # Below expects these to be [:blank:]
24 '[:cntrl:]' => [ utf8::unicode_to_native(0x00), utf8::unicode_to_native(0x88) ],
25 '[:graph:]' => [ ord("&"), utf8::unicode_to_native(0xF7), 0x02C7 ], # Below expects these to be
27 '[:lower:]' => [ ord("g"), utf8::unicode_to_native(0xE3), 0x0127 ],
28 '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), ord('>'), ord('|'), ord('-'), ord(','), ord(';'), ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), ord('"'), ord('('), ord(')'), ord('['), ord(']'), ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), ord("'"), utf8::unicode_to_native(0xBF), 0x055C ],
29 '[:upper:]' => [ ord("G"), utf8::unicode_to_native(0xC3), 0x0126 ],
30 '[:xdigit:]' => [ ord("4"), 0xFF15 ],
33 $testcases{'[:digit:]'} = $testcases{'\d'};
34 $testcases{'[:alnum:]'} = $testcases{'\w'};
35 $testcases{'[:alpha:]'} = $testcases{'\w'};
36 $testcases{'[:blank:]'} = $testcases{'\s'};
37 $testcases{'[:print:]'} = $testcases{'[:graph:]'};
38 $testcases{'[:space:]'} = $testcases{'\s'};
39 $testcases{'[:word:]'} = $testcases{'\w'};
43 my @charsets = qw(a d u aa);
44 my $locales_ok = eval { locales_enabled('LC_CTYPE'); 1 };
45 if (! is_miniperl() && $locales_ok) {
47 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
48 if ($current_locale eq 'C') {
50 # test for d_setlocale is repeated here because this one is compile
51 # time, and the one above is run time
52 use if $Config{d_setlocale}, 'locale';
54 # Some implementations don't have the 128-255 range characters all
55 # mean nothing under the C locale (an example being VMS). This is
56 # legal, but since we don't know what the right answers should be,
57 # skip the locale tests in that situation.
58 for my $i (128 .. 255) {
59 goto skip_adding_C_locale
60 if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/;
66 # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
67 $utf8_locale = find_utf8_ctype_locale();
68 push @charsets, 'L' if defined $utf8_locale;
72 # For each possible character set...
73 foreach my $charset (@charsets) {
75 my $charset_mod = lc $charset;
77 if ($charset_mod eq 'l') {
78 $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
82 die "Couldn't change locale" unless $locale;
83 $charset_display = $charset_mod . " ($locale)";
86 $charset_display = $charset_mod;
90 foreach my $upgrade ("", 'utf8::upgrade($a); ') {
92 # reverse gets the, \w, \s, \d first.
93 for my $class (reverse sort keys %testcases) {
95 # The complement of \w is \W; of [:posix:] is [:^posix:]
96 my $complement = $class;
97 if ($complement !~ s/ ( \[: ) /$1^/x) {
98 $complement = uc($class);
102 foreach my $ord (@{$testcases{$class}}) {
103 my $char = chr($ord);
104 $char = ($char eq '$') ? '\$' : display($char);
106 # > 255 already implies upgraded. Skip the ones that don't
107 # have an explicit upgrade. This shows more clearly in the
108 # output which tests are in utf8, or not.
109 next if $ord > 255 && ! $upgrade;
111 my $reason = ""; # Explanation output with each test
113 my $match = 1; # Calculated whether test regex should
116 # Everything always matches in ASCII, or under /u, or under /l
117 # with a UTF-8 locale
118 if (utf8::native_to_unicode($ord) < 128
122 $reason = "\"$char\" is a $class under /$charset_display";
123 $neg_reason = "\"$char\" is not a $complement under /$charset_display";
125 elsif ($charset eq "a" || $charset eq "aa") {
127 $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display";
128 $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display";
131 $reason = "\"$char\" is a $class under /$charset_display";
132 $neg_reason = "\"$char\" is not a $complement under /$charset_display";
134 elsif ($charset eq 'l') {
136 # We are using the C locale, which is essentially ASCII,
137 # but under utf8, the above-latin1 chars are treated as
139 $reason = "\"$char\" is not a $class in the C locale under /$charset_mod";
140 $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod";
144 $reason = "\"$char\" is a $class in utf8 under /$charset_display";
145 $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display";
148 $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display";
149 $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)";
152 $reason = "; $reason" if $reason;
153 $neg_reason = "; $neg_reason" if $neg_reason;
167 foreach my $bracketed (0, 1) {
172 # Adds an extra char to the character class to make sure
173 # that the class doesn't get optimized away.
174 $lb = ($bracketed) ? '[_' : "";
175 $rb = ($bracketed) ? ']' : "";
177 else { # [:posix:] must be inside outer [ ]
178 next if $class =~ /\[/;
181 my $length = 10; # For regexec.c regrepeat() cases by
182 # matching more than one item
183 # Test both class and its complement, and with one or more
184 # than one item to match.
186 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
187 qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
189 ok (eval $eval, $eval . $reason);
192 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
193 qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
195 ok (eval $eval, $eval . $neg_reason);
199 next if $class ne '\w';
201 # Test \b, \B at beginning and end of string
203 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
204 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
206 ok (eval $eval, $eval . $reason);
209 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
210 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
212 ok (eval $eval, $eval . $neg_reason);
215 # Test \b, \B adjacent to a non-word char, both before it and
216 # after. We test with ASCII, Latin1 and Unicode non-word chars
217 foreach my $space_ord (@{$testcases{'\s'}}) {
219 # Useless to try to test non-utf8 when the ord itself
221 next if $space_ord > 255 && ! $upgrade;
223 my $space = display(chr $space_ord);
226 qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
227 qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
229 ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
232 qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
233 qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
235 ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
239 # Test \b, \B in the middle of two nominally word chars, but
240 # one or both may be considered non-word depending on range
242 foreach my $other_ord (@{$testcases{'\w'}}) {
243 next if $other_ord > 255 && ! $upgrade;
244 my $other = display(chr $other_ord);
246 # Determine if the other char is a word char in current
248 my $other_is_word = 1;
249 my $other_reason = "\"$other\" is a $class under /$charset_display";
250 my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
251 if (utf8::native_to_unicode($other_ord) > 127
252 && $charset ne 'u' && $charset ne 'L'
253 && (($charset eq "a" || $charset eq "aa")
254 || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
257 $other_reason = "\"$other\" is not a $class under /$charset_display";
258 $other_neg_reason = "\"$other\" is a $complement under /$charset_display";
260 my $both_reason = $reason;
261 $both_reason .= "; $other_reason" if $other_ord != $ord;
262 my $both_neg_reason = $neg_reason;
263 $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
265 # If both are the same wordness, then \b will fail; \B
267 if ($match == $other_is_word) {
277 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
278 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
280 ok (eval $eval, $eval . $both_reason);
283 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
284 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
286 ok (eval $eval, $eval . $both_neg_reason);
289 next if $other_ord == $ord;
291 # These start with the \b or \B. They are included, based
292 # on source code analysis, to force the testing of the FBC
293 # (find_by_class) portions of regexec.c.
295 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
296 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
298 ok (eval $eval, $eval . $both_reason);
301 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
302 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
304 ok (eval $eval, $eval . $both_neg_reason);
307 } # End of each test case in a class
308 } # End of \w, \s, ...
309 } # End of utf8 upgraded or not
312 plan(curr_test() - 1);