Commit | Line | Data |
---|---|---|
a2d9a01a KW |
1 | # Grind out a lot of combinatoric tests for folding. Still missing are |
2 | # testing backreferences and tries. | |
3 | ||
4 | use charnames ":full"; | |
5 | ||
6 | binmode STDOUT, ":utf8"; | |
7 | ||
8 | BEGIN { | |
9 | chdir 't' if -d 't'; | |
10 | @INC = '../lib'; | |
11 | require './test.pl'; | |
12 | } | |
13 | ||
14 | use strict; | |
15 | use warnings; | |
16 | ||
17 | # Tests both unicode and not, so make sure not implicitly testing unicode | |
18 | no feature 'unicode_strings'; | |
19 | ||
20 | # Case-insensitive matching is a large and complicated issue. Perl does not | |
21 | # implement it fully, properly. For example, it doesn't include normalization | |
22 | # as part of the equation. To test every conceivable combination is clearly | |
23 | # impossible; these tests are mostly drawn from visual inspection of the code | |
24 | # and experience, trying to exercise all areas. | |
25 | ||
26 | # There are three basic ranges of characters that Perl may treat differently: | |
27 | # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are | |
28 | # referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants | |
29 | # are all controls that fold to themselves. | |
30 | my $ASCII = 1; | |
31 | ||
32 | # 2) Other characters that fit into a byte but are different in utf8 than not; | |
33 | # here referred to, taking some liberties, as Latin1. | |
34 | my $Latin1 = 2; | |
35 | ||
36 | # 3) Characters that won't fit in a byte; here referred to as Unicode | |
37 | my $Unicode = 3; | |
38 | ||
39 | # Within these basic groups are equivalence classes that testing any character | |
40 | # in is likely to lead to the same results as any other character. This is | |
41 | # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is | |
42 | # set. | |
43 | my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS}; | |
44 | ||
45 | sub range_type { | |
46 | my $ord = shift; | |
47 | ||
48 | return $ASCII if $ord < 128; | |
49 | return $Latin1 if $ord < 256; | |
50 | return $Unicode; | |
51 | } | |
52 | ||
53 | sub numerically { | |
54 | return $a <=> $b | |
55 | } | |
56 | ||
57 | my %tests; | |
58 | my %simple_folds; | |
59 | my %multi_folds; | |
60 | ||
61 | # First, analyze the current Unicode's folding rules | |
62 | my %folded_from; | |
63 | my $file="../lib/unicore/CaseFolding.txt"; | |
64 | open my $fh, "<", $file or die "Failed to read '$file': $!"; | |
65 | while (<$fh>) { | |
66 | chomp; | |
67 | ||
68 | # Lines look like (though without the initial '#') | |
69 | #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
70 | ||
71 | my ($line, $comment) = split / \s+ \# \s+ /x, $_; | |
72 | next if $line eq "" || substr($line, 0, 1) eq '#'; | |
73 | my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line; | |
74 | ||
75 | my $from = hex $hex_from; | |
76 | ||
77 | if ($fold_type eq 'F') { | |
78 | next; # XXX TODO multi-char folds | |
79 | my $from_range_type = range_type($from); | |
80 | @folded = map { hex $_ } @folded; | |
81 | ||
82 | # Include three code points that are handled internally by the regex | |
83 | # engine specially, plus all non-Unicode multi folds (which actually | |
84 | # the only one is already included in the three, but this makes sure) | |
85 | # And if any member of the fold is not the same range type as the | |
86 | # source, add it directly to the tests. It needs to be an array of an | |
87 | # array, so that it is distinguished from multiple single folds | |
88 | if ($from == 0xDF || $from == 0x390 || $from == 0x3B0 | |
89 | || $from_range_type != $Unicode | |
90 | || grep { range_type($_) != $from_range_type } @folded) | |
91 | { | |
92 | $tests{$from} = [ [ @folded ] ]; | |
93 | } | |
94 | else { | |
95 | ||
96 | # Must be Unicode here, so chr is automatically utf8. Get the | |
97 | # number of bytes in each. This is because the optimizer cares | |
98 | # about length differences. | |
99 | my $from_length = length encode('utf-8', chr($from)); | |
100 | my $to_length = length encode('utf-8', pack 'U*', @folded); | |
101 | push @{$multi_folds{$from_length}{$to_length}}, { $from => [ @folded ] }; | |
102 | } | |
103 | } | |
104 | ||
105 | # Perl only deals with C and F folds | |
106 | next if $fold_type ne 'C'; | |
107 | ||
108 | # C folds are single-char $from to single-char $folded | |
109 | push @{$folded_from{hex $folded[0]}}, $from; | |
110 | } | |
111 | ||
112 | # Now try to sort the single char folds into equivalence classes of that are | |
113 | # likely to have identical successes and failures. Any fold that crosses | |
114 | # range types is suspect, and is automatically tested. Otherwise, store by | |
115 | # the number of characters that participate in a fold. Likely all folds in a | |
116 | # range type that fold to each other like B->b->B will have identical success | |
117 | # and failure; similarly all folds that have three characters participating | |
118 | # are likely to have the same successes and failures, etc. | |
119 | foreach my $folded (sort numerically keys %folded_from) { | |
120 | my $target_range_type = range_type($folded); | |
121 | my $count = @{$folded_from{$folded}}; | |
122 | ||
123 | # Automatically test any fold that crosses range types | |
124 | if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}}) | |
125 | { | |
126 | $tests{$folded} = $folded_from{$folded}; | |
127 | } | |
128 | else { | |
129 | push @{$simple_folds{$target_range_type}{$count}}, | |
130 | { $folded => $folded_from{$folded} }; | |
131 | } | |
132 | } | |
133 | ||
134 | foreach my $from_length (keys %multi_folds) { | |
135 | foreach my $fold_length (keys %{$multi_folds{$from_length}}) { | |
136 | #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length}; | |
137 | foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) { | |
138 | #print __LINE__, ": $from_length, $fold_length, $test:\n"; | |
139 | my ($target, $pattern) = each %$test; | |
140 | #print __LINE__, ": $target: $pattern\n"; | |
141 | $tests{$target} = $pattern; | |
142 | last if $skip_apparently_redundant; | |
143 | } | |
144 | } | |
145 | } | |
146 | ||
147 | # Add in tests for single character folds. Add tests for each range type, | |
148 | # and within those tests for each number of characters participating in a | |
149 | # fold. Thus B->b has two characters participating. But K->k and Kelvin | |
150 | # Sign->k has three characters participating. So we would make sure that | |
151 | # there is a test for 3 chars, 4 chars, ... . (Note that the 'k' example is a | |
152 | # bad one because it crosses range types, so is automatically tested. In the | |
153 | # Unicode range there are various of these 3 and 4 char classes, but aren't as | |
154 | # easily described as the 'k' one.) | |
155 | foreach my $type (keys %simple_folds) { | |
156 | foreach my $count (keys %{$simple_folds{$type}}) { | |
157 | foreach my $test (@{$simple_folds{$type}{$count}}) { | |
158 | my ($target, $pattern) = each %$test; | |
159 | $tests{$target} = $pattern; | |
160 | last if $skip_apparently_redundant; | |
161 | } | |
162 | } | |
163 | } | |
164 | ||
165 | # For each range type, test additionally a character that folds to itself | |
166 | $tests{0x3A} = [ 0x3A ]; | |
167 | $tests{0xF7} = [ 0xF7 ]; | |
168 | $tests{0x2C7} = [ 0x2C7 ]; | |
169 | ||
170 | my $clump_execs = 10000; # Speed up by building an 'exec' of many tests | |
171 | my @eval_tests; | |
172 | ||
173 | # For use by pairs() in generating combinations | |
174 | sub prefix { | |
175 | my $p = shift; | |
176 | map [ $p, $_ ], @_ | |
177 | } | |
178 | ||
179 | # Returns all ordered combinations of pairs of elements from the input array. | |
180 | # It doesn't return pairs like (a, a), (b, b). Change the slice to an array | |
181 | # to do that. This was just to have fewer tests. | |
182 | sub pairs (@) { | |
183 | #print __LINE__, ": ", join(" XXX ", @_), "\n"; | |
184 | map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ | |
185 | } | |
186 | ||
187 | ||
188 | # Finally ready to do the tests | |
189 | my $count=1; | |
190 | foreach my $test (sort { numerically } keys %tests) { | |
191 | ||
192 | my $previous_target; | |
193 | my $previous_pattern; | |
194 | my @pairs = pairs(sort numerically $test, @{$tests{$test}}); | |
195 | ||
196 | # Each fold can be viewed as a closure of all the characters that | |
197 | # participate in it. Look at each possible pairing from a closure, with the | |
198 | # first member of the pair the target string to match against, and the | |
199 | # second member forming the pattern. Thus each fold member gets tested as | |
200 | # the string, and the pattern with every other member in the opposite role. | |
201 | while (my $pair = shift @pairs) { | |
202 | my ($target, $pattern) = @$pair; | |
203 | ||
204 | # When testing a char that doesn't fold, we can get the same | |
205 | # permutation twice; so skip all but the first. | |
206 | next if $previous_target | |
207 | && $previous_target == $target | |
208 | && $previous_pattern == $pattern; | |
209 | ($previous_target, $previous_pattern) = ($target, $pattern); | |
210 | ||
211 | # Each side may be either a single char or a string. Extract each into an | |
212 | # array (perhaps of length 1) | |
213 | my @target, my @pattern; | |
214 | @target = (ref $target) ? @$target : $target; | |
215 | @pattern = (ref $pattern) ? @$pattern : $pattern; | |
216 | ||
217 | # Have to convert non-utf8 chars to native char set | |
218 | @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target; | |
219 | @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern; | |
220 | ||
221 | # Get in hex form. | |
222 | my @x_target = map { sprintf "\\x{%04X}", $_ } @target; | |
223 | my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern; | |
224 | ||
225 | my $target_above_latin1 = grep { $_ > 255 } @target; | |
226 | my $pattern_above_latin1 = grep { $_ > 255 } @pattern; | |
227 | my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0]; | |
228 | ||
229 | # We don't test multi-char folding into other multi-chars. We are testing | |
230 | # a code point that folds to or from other characters. Find the single | |
231 | # code point for diagnostic purposes. (If both are single, choose the | |
232 | # target string) | |
233 | my $ord = @target == 1 ? $target[0] : $pattern[0]; | |
234 | my $progress = sprintf "\"%s\" and /%s/", | |
235 | join("", @x_target), | |
236 | join("", @x_pattern); | |
237 | #print $progress, "\n"; | |
238 | #diag $progress; | |
239 | ||
240 | # Now grind out tests, using various combinations. | |
241 | foreach my $uni_semantics ("", 'u') { # Both non- and uni semantics | |
242 | foreach my $utf8_target (0, 1) { # Both utf8 and not, for | |
243 | # code points < 256 | |
244 | my $upgrade_target = ""; | |
245 | ||
246 | # These must already be in utf8 because the string to match has | |
247 | # something above latin1. So impossible to test if to not to be in | |
248 | # utf8; and otherwise, no upgrade is needed. | |
249 | next if $target_above_latin1 && ! $utf8_target; | |
250 | $upgrade_target = '; utf8::upgrade($c)' if ! $target_above_latin1 && $utf8_target; | |
251 | ||
35bae598 KW |
252 | foreach my $uni_pattern (0, 1) { |
253 | next if $pattern_above_latin1 && ! $uni_pattern; | |
a2d9a01a | 254 | my $upgrade_pattern = ""; |
35bae598 | 255 | $upgrade_pattern = '; use re "/u"' if ! $pattern_above_latin1 && $uni_pattern; |
a2d9a01a KW |
256 | |
257 | my $lhs = join "", @x_target; | |
258 | my @rhs = @x_pattern; | |
259 | #print "$lhs: ", "/@rhs/\n"; | |
260 | ||
1ef17b72 | 261 | foreach my $bracketed (0, 1) { # Put rhs in [...], or not |
a2d9a01a KW |
262 | foreach my $inverted (0,1) { |
263 | next if $inverted && ! $bracketed; | |
264 | ||
265 | # In some cases, add an extra character that doesn't fold, and | |
266 | # looks ok in the output. | |
267 | my $extra_char = "_"; | |
268 | foreach my $prepend ("", $extra_char) { | |
269 | foreach my $append ("", $extra_char) { | |
270 | # Append a char for after quantifier, as results vary if no | |
271 | # char appended. | |
272 | ||
273 | # Assemble the rhs. Put each character in a separate | |
274 | # bracketed if using charclasses. This creates a stress on | |
275 | # the code to span a match across multiple elements | |
276 | my $rhs = ""; | |
277 | foreach my $rhs_char (@rhs) { | |
278 | $rhs .= '[' if $bracketed; | |
279 | $rhs .= '^' if $inverted; | |
280 | $rhs .= $rhs_char; | |
281 | ||
282 | # Add a character to the class, so class doesn't get | |
283 | # optimized out | |
284 | $rhs .= '_]' if $bracketed; | |
285 | } | |
286 | ||
287 | # Add one of: no capturing parens | |
288 | # a single set | |
289 | # a nested set | |
290 | # Use quantifiers and extra variable width matches inside | |
291 | # them to keep some optimizations from happening | |
292 | foreach my $parend (0, 1, 2) { | |
293 | my $interior = (! $parend) | |
294 | ? $rhs | |
295 | : ($parend == 1) | |
296 | ? "(${rhs},?)" | |
297 | : "((${rhs})+,?)"; | |
298 | foreach my $quantifier ("", '?', '*', '+', '{1,3}') { | |
299 | ||
300 | # A ? or * quantifier normally causes the thing to be | |
301 | # able to match a null string | |
302 | my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*'; | |
303 | ||
304 | # But since we only quantify the last character in a | |
305 | # multiple fold, the other characters will have width, | |
306 | # except if we are quantifying the whole rhs | |
307 | my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend); | |
308 | ||
309 | foreach my $l_anchor ("", '^') { # '\A' didn't change result) | |
310 | foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result) | |
311 | ||
312 | # The folded part can match the null string if it | |
313 | # isn't required to have width, and there's not | |
314 | # something on one or both sides that force it to. | |
315 | my $must_match = ! $can_match_null || ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append); | |
316 | #next unless $must_match; | |
317 | my $quantified = "(?$uni_semantics:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; | |
318 | my $op; | |
35bae598 | 319 | if ($must_match && ! $utf8_target && ! $uni_pattern && ! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self) { |
a2d9a01a KW |
320 | $op = 0; |
321 | } else { | |
322 | $op = 1; | |
323 | } | |
324 | $op = ! $op if $must_match && $inverted; | |
325 | $op = ($op) ? '=~' : '!~'; | |
326 | ||
35bae598 KW |
327 | my $stuff .= " utf8_target=$utf8_target, uni_semantics=$uni_semantics, uni_pattern=$uni_pattern, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor"; |
328 | my $eval = "my \$c = \"$prepend$lhs$append\"$upgrade_target; $upgrade_pattern; \$c $op /$quantified/i;"; | |
a2d9a01a KW |
329 | |
330 | # XXX Doesn't currently test multi-char folds | |
331 | next if @pattern != 1; | |
332 | #next if ! $must_match; | |
333 | push @eval_tests, qq[ok(eval '$eval', '$eval')]; | |
334 | $count++; | |
335 | ||
336 | # Group tests | |
337 | if (@eval_tests >= $clump_execs) { | |
338 | eval join ";\n", @eval_tests; | |
339 | undef @eval_tests; | |
340 | } | |
341 | } | |
342 | } | |
343 | } | |
344 | } | |
345 | } | |
346 | } | |
347 | } | |
348 | } | |
349 | } | |
350 | } | |
351 | } | |
352 | } | |
353 | } | |
354 | ||
355 | # Finish up any tests not already done | |
356 | eval join ";\n", @eval_tests; | |
357 | ||
358 | plan($count-1); | |
359 | ||
360 | 1 |