Commit | Line | Data |
---|---|---|
e8ed1101 KW |
1 | # Test the /a, /d, etc regex modifiers |
2 | ||
3 | BEGIN { | |
4 | chdir 't' if -d 't'; | |
5 | @INC = '../lib'; | |
6 | require './test.pl'; | |
7 | } | |
8 | ||
9 | use strict; | |
10 | use warnings; | |
11 | ||
aab0501d KW |
12 | plan('no_plan'); |
13 | ||
e8ed1101 KW |
14 | # Each case is a valid element of its hash key. Choose, where available, an |
15 | # ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point. | |
16 | my %testcases = ( | |
17 | '\w' => [ ord("A"), 0xE2, 0x16B ], # Below expects these to all be alpha | |
18 | '\d' => [ ord("0"), 0x0662 ], | |
19 | '\s' => [ ord("\t"), 0xA0, 0x1680 ], # Below expects these to be [:blank:] | |
20 | '[:cntrl:]' => [ 0x00, 0x88 ], | |
21 | '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be | |
22 | # [:print:] | |
23 | '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ], | |
24 | '[:punct:]' => [ ord("!"), 0xBF, 0x055C ], | |
25 | '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ], | |
26 | '[:xdigit:]' => [ ord("4"), 0xFF15 ], | |
27 | ); | |
28 | ||
29 | $testcases{'[:digit:]'} = $testcases{'\d'}; | |
30 | $testcases{'[:alnum:]'} = $testcases{'\w'}; | |
31 | $testcases{'[:alpha:]'} = $testcases{'\w'}; | |
32 | $testcases{'[:blank:]'} = $testcases{'\s'}; | |
33 | $testcases{'[:print:]'} = $testcases{'[:graph:]'}; | |
34 | $testcases{'[:space:]'} = $testcases{'\s'}; | |
35 | $testcases{'[:word:]'} = $testcases{'\w'}; | |
36 | ||
37 | # For each possible character set... | |
de87c4fe | 38 | foreach my $charset ("a", "d", "l", "u") { |
e8ed1101 KW |
39 | |
40 | # And in utf8 or not | |
41 | foreach my $upgrade ("", 'utf8::upgrade($a); ') { | |
42 | ||
43 | # reverse gets the, \w, \s, \d first. | |
44 | for my $class (reverse sort keys %testcases) { | |
45 | ||
46 | # The complement of \w is \W; of [:posix:] is [:^posix:] | |
47 | my $complement = $class; | |
48 | if ($complement !~ s/ ( \[: ) /$1^/x) { | |
49 | $complement = uc($class); | |
50 | } | |
51 | ||
52 | # For each test case | |
53 | foreach my $ord (@{$testcases{$class}}) { | |
54 | my $char = display(chr($ord)); | |
55 | ||
56 | # > 255 already implies upgraded. Skip the ones that don't | |
9d44f25f | 57 | # have an explicit upgrade. This shows more clearly in the |
e8ed1101 KW |
58 | # output which tests are in utf8, or not. |
59 | next if $ord > 255 && ! $upgrade; | |
60 | ||
61 | my $reason = ""; # Explanation output with each test | |
62 | my $match = 1; # Calculated whether test regex should | |
63 | # match or not | |
64 | ||
65 | # Everything always matches in ASCII, or under /u | |
66 | if ($ord < 128 || $charset eq 'u') { | |
67 | $reason = "\"$char\" is a $class under /$charset"; | |
68 | } | |
69 | elsif ($charset eq "a") { | |
70 | $match = 0; | |
71 | $reason = "\"$char\" is non-ASCII, which can't be a $class under /a"; | |
72 | } | |
73 | elsif ($ord > 255) { | |
74 | $reason = "\"$char\" is a $class under /$charset"; | |
75 | } | |
76 | elsif ($charset eq 'l') { | |
77 | ||
78 | # We are using the C locale, which is essentially ASCII, | |
79 | # but under utf8, the above-latin1 chars are treated as | |
80 | # Unicode) | |
81 | $reason = "\"$char\" is not a $class in this locale under /l"; | |
82 | $match = 0; | |
83 | } | |
84 | elsif ($upgrade) { | |
85 | $reason = "\"$char\" is a $class in utf8 under /d"; | |
86 | } | |
87 | else { | |
88 | $reason = "\"$char\" is latin1, which requires utf8 to be a $class under /d"; | |
89 | $match = 0; | |
90 | } | |
91 | $reason = "; $reason" if $reason; | |
92 | ||
93 | my $op; | |
94 | my $neg_op; | |
95 | if ($match) { | |
96 | $op = '=~'; | |
97 | $neg_op = '!~'; | |
98 | } | |
99 | else { | |
100 | $op = '!~'; | |
101 | $neg_op = '=~'; | |
102 | } | |
103 | ||
104 | # In [...] or not | |
105 | foreach my $bracketed (0, 1) { | |
106 | my $lb = ""; | |
107 | my $rb = ""; | |
108 | if ($bracketed) { | |
109 | ||
110 | # Adds an extra char to the character class to make sure | |
111 | # that the class doesn't get optimized away. | |
112 | $lb = ($bracketed) ? '[_' : ""; | |
113 | $rb = ($bracketed) ? ']' : ""; | |
114 | } | |
115 | else { # [:posix:] must be inside outer [ ] | |
116 | next if $class =~ /\[/; | |
117 | } | |
118 | ||
119 | my $length = 10; # For regexec.c regrepeat() cases by | |
120 | # matching more than one item | |
121 | # Test both class and its complement, and with one or more | |
122 | # than one item to match. | |
123 | foreach my $eval ( | |
124 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: $lb$class$rb ) /x], | |
125 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb ) /x], | |
126 | qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset: $lb$class$rb\{$length} ) /x], | |
127 | qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb\{$length} ) /x], | |
128 | ) { | |
129 | ok (eval $eval, $eval . $reason); | |
130 | } | |
131 | } | |
132 | ||
133 | next if $class ne '\w'; | |
134 | ||
135 | # Test \b, \B at beginning and end of string | |
136 | foreach my $eval ( | |
137 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: ^ \\b . ) /x], | |
138 | qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: . \\b \$) /x], | |
139 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: ^ \\B . ) /x], | |
140 | qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: . \\B \$ ) /x], | |
141 | ) { | |
142 | ok (eval $eval, $eval . $reason); | |
143 | } | |
144 | ||
145 | # Test \b, \B adjacent to a non-word char, both before it and | |
146 | # after. We test with ASCII, Latin1 and Unicode non-word chars | |
147 | foreach my $space_ord (@{$testcases{'\s'}}) { | |
148 | ||
149 | # Useless to try to test non-utf8 when the ord itself | |
150 | # forces utf8 | |
151 | next if $space_ord > 255 && ! $upgrade; | |
152 | ||
153 | my $space = display(chr $space_ord); | |
154 | ||
155 | foreach my $eval ( | |
156 | qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x], | |
157 | qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x], | |
158 | qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x], | |
159 | qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x], | |
160 | ) { | |
161 | ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w"); | |
162 | } | |
163 | } | |
164 | ||
165 | # Test \b, \B in the middle of two nominally word chars, but | |
166 | # one or both may be considered non-word depending on range | |
167 | # and charset. | |
168 | foreach my $other_ord (@{$testcases{'\w'}}) { | |
169 | next if $other_ord > 255 && ! $upgrade; | |
170 | my $other = display(chr $other_ord); | |
171 | ||
172 | # Determine if the other char is a word char in current | |
173 | # circumstances | |
174 | my $other_is_word = 1; | |
175 | my $other_reason = "\"$other\" is a $class under /$charset"; | |
176 | if ($other_ord > 127 | |
177 | && $charset ne 'u' | |
178 | && ($charset eq "a" | |
179 | || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade)))) | |
180 | { | |
181 | $other_is_word = 0; | |
182 | $other_reason = "\"$other\" is not a $class under /$charset"; | |
183 | } | |
184 | my $both_reason = $reason; | |
185 | $both_reason .= "; $other_reason" if $other_ord != $ord; | |
186 | ||
187 | # If both are the same wordness, then \b will fail; \B | |
188 | # succeed | |
189 | if ($match == $other_is_word) { | |
190 | $op = '!~'; | |
191 | $neg_op = '=~'; | |
192 | } | |
193 | else { | |
194 | $op = '=~'; | |
195 | $neg_op = '!~'; | |
196 | } | |
197 | ||
198 | foreach my $eval ( | |
199 | qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: $other \\b $char ) /x], | |
200 | qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: $char \\b $other ) /x], | |
201 | qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: $other \\B $char ) /x], | |
202 | qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: $char \\B $other ) /x], | |
203 | ) { | |
204 | ok (eval $eval, $eval . $both_reason); | |
205 | } | |
206 | ||
207 | next if $other_ord == $ord; | |
208 | ||
209 | # These start with the \b or \B. They are included, based | |
210 | # on source code analysis, to force the testing of the FBC | |
211 | # (find_by_class) portions of regexec.c. | |
212 | foreach my $eval ( | |
213 | qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: \\b $char ) /x], | |
214 | qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: \\b $other ) /x], | |
215 | qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: \\B $char ) /x], | |
216 | qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: \\B $other ) /x], | |
217 | ) { | |
218 | ok (eval $eval, $eval . $both_reason); | |
219 | } | |
220 | } | |
221 | } # End of each test case in a class | |
222 | } # End of \w, \s, ... | |
223 | } # End of utf8 upgraded or not | |
224 | } | |
225 | ||
226 | plan(curr_test() - 1); |