Commit | Line | Data |
---|---|---|
371a505e | 1 | # Grind out a lot of combinatoric tests for folding. |
a2d9a01a | 2 | |
a2d9a01a KW |
3 | binmode STDOUT, ":utf8"; |
4 | ||
5 | BEGIN { | |
6 | chdir 't' if -d 't'; | |
7 | @INC = '../lib'; | |
8 | require './test.pl'; | |
a59efd0a | 9 | skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX"); |
a2d9a01a KW |
10 | } |
11 | ||
4fd7f5ff NC |
12 | use charnames ":full"; |
13 | ||
29d01a3e | 14 | my $DEBUG = 0; # Outputs extra information for debugging this .t |
abf4d645 | 15 | |
a2d9a01a KW |
16 | use strict; |
17 | use warnings; | |
d08723ac | 18 | use Encode; |
a59efd0a | 19 | use POSIX; |
a2d9a01a KW |
20 | |
21 | # Tests both unicode and not, so make sure not implicitly testing unicode | |
22 | no feature 'unicode_strings'; | |
23 | ||
24 | # Case-insensitive matching is a large and complicated issue. Perl does not | |
25 | # implement it fully, properly. For example, it doesn't include normalization | |
26 | # as part of the equation. To test every conceivable combination is clearly | |
27 | # impossible; these tests are mostly drawn from visual inspection of the code | |
28 | # and experience, trying to exercise all areas. | |
29 | ||
30 | # There are three basic ranges of characters that Perl may treat differently: | |
31 | # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are | |
32 | # referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants | |
33 | # are all controls that fold to themselves. | |
34 | my $ASCII = 1; | |
35 | ||
36 | # 2) Other characters that fit into a byte but are different in utf8 than not; | |
37 | # here referred to, taking some liberties, as Latin1. | |
38 | my $Latin1 = 2; | |
39 | ||
40 | # 3) Characters that won't fit in a byte; here referred to as Unicode | |
41 | my $Unicode = 3; | |
42 | ||
43 | # Within these basic groups are equivalence classes that testing any character | |
44 | # in is likely to lead to the same results as any other character. This is | |
45 | # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is | |
46 | # set. | |
47 | my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS}; | |
48 | ||
67fc5dca DL |
49 | # Additionally parts of this test run a lot of subtests, outputting the |
50 | # resulting TAP can be expensive so the tests are summarised internally. The | |
51 | # PERL_DEBUG_FULL_TEST environment variable can be set to produce the full | |
52 | # output for debugging purposes. | |
53 | ||
a2d9a01a KW |
54 | sub range_type { |
55 | my $ord = shift; | |
56 | ||
57 | return $ASCII if $ord < 128; | |
58 | return $Latin1 if $ord < 256; | |
59 | return $Unicode; | |
60 | } | |
61 | ||
62 | sub numerically { | |
63 | return $a <=> $b | |
64 | } | |
65 | ||
bb2fbce0 | 66 | sub run_test($$$$) { |
7a0a13a6 | 67 | my ($test, $count, $todo, $debug) = @_; |
abf4d645 KW |
68 | |
69 | $debug = "" unless $DEBUG; | |
7a0a13a6 | 70 | $todo = "Known problem" if $todo; |
abf4d645 | 71 | |
bb2fbce0 DM |
72 | TODO: { |
73 | local $::TODO = $todo ? "Known problem" : undef; | |
74 | ok(eval $test, "$test; $debug"); | |
75 | } | |
abf4d645 KW |
76 | } |
77 | ||
29d01a3e | 78 | my %tests; # The final set of tests. keys are the code points to test |
a2d9a01a KW |
79 | my %simple_folds; |
80 | my %multi_folds; | |
81 | ||
82 | # First, analyze the current Unicode's folding rules | |
83 | my %folded_from; | |
84 | my $file="../lib/unicore/CaseFolding.txt"; | |
85 | open my $fh, "<", $file or die "Failed to read '$file': $!"; | |
86 | while (<$fh>) { | |
87 | chomp; | |
88 | ||
89 | # Lines look like (though without the initial '#') | |
90 | #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
91 | ||
92 | my ($line, $comment) = split / \s+ \# \s+ /x, $_; | |
93 | next if $line eq "" || substr($line, 0, 1) eq '#'; | |
94 | my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line; | |
95 | ||
96 | my $from = hex $hex_from; | |
97 | ||
98 | if ($fold_type eq 'F') { | |
27f6057f KW |
99 | my $from_range_type = range_type($from); |
100 | ||
101 | # If we were testing comprehensively, we would try every combination | |
102 | # of upper and lower case in the fold, but it is quite likely that if | |
103 | # the code can handle all combinations if it can handle the cases | |
104 | # where everything is upper and when everything is lower. Because of | |
105 | # complement matching, we need to do both. And we use the | |
106 | # reverse-fold instead of uppercase. | |
a2d9a01a | 107 | @folded = map { hex $_ } @folded; |
27f6057f KW |
108 | # XXX better to use reverse fold of these instead of uc |
109 | my @uc_folded = map { ord uc chr $_ } @folded; | |
a2d9a01a KW |
110 | |
111 | # Include three code points that are handled internally by the regex | |
29d01a3e | 112 | # engine specially, plus all non-above-255 multi folds (which actually |
a2d9a01a KW |
113 | # the only one is already included in the three, but this makes sure) |
114 | # And if any member of the fold is not the same range type as the | |
115 | # source, add it directly to the tests. It needs to be an array of an | |
116 | # array, so that it is distinguished from multiple single folds | |
117 | if ($from == 0xDF || $from == 0x390 || $from == 0x3B0 | |
118 | || $from_range_type != $Unicode | |
119 | || grep { range_type($_) != $from_range_type } @folded) | |
120 | { | |
27f6057f | 121 | $tests{$from} = [ [ @folded ], [ @uc_folded ] ]; |
a2d9a01a KW |
122 | } |
123 | else { | |
124 | ||
27f6057f KW |
125 | # The only multi-char non-utf8 fold is DF, which is handled above, |
126 | # so here chr() must be utf8. Get the number of bytes in each. | |
127 | # This is because the optimizer cares about length differences. | |
128 | my $from_length = length encode('UTF-8', chr($from)); | |
129 | my $to_length = length encode('UTF-8', pack 'U*', @folded); | |
130 | push @{$multi_folds{$from_length}{$to_length}}, { $from => [ [ @folded ], [ @uc_folded ] ] }; | |
a2d9a01a KW |
131 | } |
132 | } | |
133 | ||
134 | # Perl only deals with C and F folds | |
135 | next if $fold_type ne 'C'; | |
136 | ||
d2025f57 KW |
137 | # C folds are single-char $from to single-char $folded, in chr terms |
138 | # folded_from{'s'} = [ 'S', \N{LATIN SMALL LETTER LONG S} ] | |
a2d9a01a KW |
139 | push @{$folded_from{hex $folded[0]}}, $from; |
140 | } | |
141 | ||
d2025f57 | 142 | # Now try to sort the single char folds into equivalence classes that are |
a2d9a01a KW |
143 | # likely to have identical successes and failures. Any fold that crosses |
144 | # range types is suspect, and is automatically tested. Otherwise, store by | |
145 | # the number of characters that participate in a fold. Likely all folds in a | |
146 | # range type that fold to each other like B->b->B will have identical success | |
147 | # and failure; similarly all folds that have three characters participating | |
148 | # are likely to have the same successes and failures, etc. | |
149 | foreach my $folded (sort numerically keys %folded_from) { | |
150 | my $target_range_type = range_type($folded); | |
151 | my $count = @{$folded_from{$folded}}; | |
152 | ||
153 | # Automatically test any fold that crosses range types | |
154 | if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}}) | |
155 | { | |
156 | $tests{$folded} = $folded_from{$folded}; | |
157 | } | |
158 | else { | |
159 | push @{$simple_folds{$target_range_type}{$count}}, | |
160 | { $folded => $folded_from{$folded} }; | |
a7caa9e8 | 161 | } |
a2d9a01a KW |
162 | } |
163 | ||
164 | foreach my $from_length (keys %multi_folds) { | |
165 | foreach my $fold_length (keys %{$multi_folds{$from_length}}) { | |
166 | #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length}; | |
167 | foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) { | |
168 | #print __LINE__, ": $from_length, $fold_length, $test:\n"; | |
169 | my ($target, $pattern) = each %$test; | |
170 | #print __LINE__, ": $target: $pattern\n"; | |
171 | $tests{$target} = $pattern; | |
172 | last if $skip_apparently_redundant; | |
173 | } | |
174 | } | |
175 | } | |
176 | ||
177 | # Add in tests for single character folds. Add tests for each range type, | |
178 | # and within those tests for each number of characters participating in a | |
179 | # fold. Thus B->b has two characters participating. But K->k and Kelvin | |
180 | # Sign->k has three characters participating. So we would make sure that | |
181 | # there is a test for 3 chars, 4 chars, ... . (Note that the 'k' example is a | |
182 | # bad one because it crosses range types, so is automatically tested. In the | |
183 | # Unicode range there are various of these 3 and 4 char classes, but aren't as | |
184 | # easily described as the 'k' one.) | |
185 | foreach my $type (keys %simple_folds) { | |
186 | foreach my $count (keys %{$simple_folds{$type}}) { | |
187 | foreach my $test (@{$simple_folds{$type}{$count}}) { | |
188 | my ($target, $pattern) = each %$test; | |
189 | $tests{$target} = $pattern; | |
190 | last if $skip_apparently_redundant; | |
191 | } | |
192 | } | |
193 | } | |
194 | ||
195 | # For each range type, test additionally a character that folds to itself | |
196 | $tests{0x3A} = [ 0x3A ]; | |
197 | $tests{0xF7} = [ 0xF7 ]; | |
198 | $tests{0x2C7} = [ 0x2C7 ]; | |
199 | ||
a2d9a01a | 200 | |
2f7f8cb1 KW |
201 | # To cut down on the number of tests |
202 | my $has_tested_aa_above_latin1; | |
203 | my $has_tested_latin1_aa; | |
6eea66eb | 204 | my $has_tested_ascii_aa; |
17580e7a | 205 | my $has_tested_l_above_latin1; |
ceb92b9b | 206 | my $has_tested_above_latin1_l; |
6eea66eb | 207 | my $has_tested_ascii_l; |
963bd580 KW |
208 | my $has_tested_above_latin1_d; |
209 | my $has_tested_ascii_d; | |
f16e8484 | 210 | my $has_tested_non_latin1_d; |
419d8974 KW |
211 | my $has_tested_above_latin1_a; |
212 | my $has_tested_ascii_a; | |
213 | my $has_tested_non_latin1_a; | |
2f7f8cb1 | 214 | |
a2d9a01a KW |
215 | # For use by pairs() in generating combinations |
216 | sub prefix { | |
217 | my $p = shift; | |
a7caa9e8 | 218 | map [ $p, $_ ], @_ |
a2d9a01a KW |
219 | } |
220 | ||
221 | # Returns all ordered combinations of pairs of elements from the input array. | |
222 | # It doesn't return pairs like (a, a), (b, b). Change the slice to an array | |
223 | # to do that. This was just to have fewer tests. | |
a7caa9e8 | 224 | sub pairs (@) { |
a2d9a01a | 225 | #print __LINE__, ": ", join(" XXX ", @_), "\n"; |
a7caa9e8 | 226 | map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ |
a2d9a01a KW |
227 | } |
228 | ||
419d8974 | 229 | my @charsets = qw(d u a aa); |
a59efd0a KW |
230 | my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // ""; |
231 | push @charsets, 'l' if $current_locale eq 'C'; | |
a2d9a01a KW |
232 | |
233 | # Finally ready to do the tests | |
abf4d645 | 234 | my $count=0; |
a2d9a01a KW |
235 | foreach my $test (sort { numerically } keys %tests) { |
236 | ||
237 | my $previous_target; | |
238 | my $previous_pattern; | |
239 | my @pairs = pairs(sort numerically $test, @{$tests{$test}}); | |
240 | ||
241 | # Each fold can be viewed as a closure of all the characters that | |
242 | # participate in it. Look at each possible pairing from a closure, with the | |
243 | # first member of the pair the target string to match against, and the | |
244 | # second member forming the pattern. Thus each fold member gets tested as | |
245 | # the string, and the pattern with every other member in the opposite role. | |
246 | while (my $pair = shift @pairs) { | |
247 | my ($target, $pattern) = @$pair; | |
248 | ||
249 | # When testing a char that doesn't fold, we can get the same | |
250 | # permutation twice; so skip all but the first. | |
251 | next if $previous_target | |
252 | && $previous_target == $target | |
253 | && $previous_pattern == $pattern; | |
254 | ($previous_target, $previous_pattern) = ($target, $pattern); | |
255 | ||
256 | # Each side may be either a single char or a string. Extract each into an | |
257 | # array (perhaps of length 1) | |
258 | my @target, my @pattern; | |
259 | @target = (ref $target) ? @$target : $target; | |
260 | @pattern = (ref $pattern) ? @$pattern : $pattern; | |
261 | ||
262 | # Have to convert non-utf8 chars to native char set | |
263 | @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target; | |
264 | @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern; | |
265 | ||
266 | # Get in hex form. | |
267 | my @x_target = map { sprintf "\\x{%04X}", $_ } @target; | |
268 | my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern; | |
269 | ||
270 | my $target_above_latin1 = grep { $_ > 255 } @target; | |
271 | my $pattern_above_latin1 = grep { $_ > 255 } @pattern; | |
2f7f8cb1 KW |
272 | my $target_has_ascii = grep { $_ < 128 } @target; |
273 | my $pattern_has_ascii = grep { $_ < 128 } @pattern; | |
6eea66eb KW |
274 | my $target_only_ascii = ! grep { $_ > 127 } @target; |
275 | my $pattern_only_ascii = ! grep { $_ > 127 } @pattern; | |
17580e7a | 276 | my $target_has_latin1 = grep { $_ < 256 } @target; |
f16e8484 KW |
277 | my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target; |
278 | my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern; | |
17580e7a | 279 | my $pattern_has_latin1 = grep { $_ < 256 } @pattern; |
a2d9a01a KW |
280 | my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0]; |
281 | ||
282 | # We don't test multi-char folding into other multi-chars. We are testing | |
283 | # a code point that folds to or from other characters. Find the single | |
284 | # code point for diagnostic purposes. (If both are single, choose the | |
285 | # target string) | |
286 | my $ord = @target == 1 ? $target[0] : $pattern[0]; | |
7fea222d KW |
287 | my $progress = sprintf "%04X: \"%s\" and /%s/", |
288 | $test, | |
a2d9a01a KW |
289 | join("", @x_target), |
290 | join("", @x_pattern); | |
291 | #print $progress, "\n"; | |
292 | #diag $progress; | |
293 | ||
294 | # Now grind out tests, using various combinations. | |
a59efd0a | 295 | foreach my $charset (@charsets) { |
2f7f8cb1 | 296 | |
63fb01f9 KW |
297 | # To cut down somewhat on the enormous quantity of tests this currently |
298 | # runs, skip some for some of the character sets whose results aren't | |
299 | # likely to differ from others. But run all tests on the code points | |
300 | # that don't fold, plus one other set in each range group. | |
301 | if (! $is_self) { | |
302 | ||
b0d6380c KW |
303 | # /aa should only affect things with folds in the ASCII range. But, try |
304 | # it on one set in the other ranges just to make sure it doesn't break | |
305 | # them. | |
306 | if ($charset eq 'aa') { | |
307 | if (! $target_has_ascii && ! $pattern_has_ascii) { | |
308 | if ($target_above_latin1 || $pattern_above_latin1) { | |
309 | next if defined $has_tested_aa_above_latin1 | |
310 | && $has_tested_aa_above_latin1 != $test; | |
311 | $has_tested_aa_above_latin1 = $test; | |
312 | } | |
313 | next if defined $has_tested_latin1_aa | |
314 | && $has_tested_latin1_aa != $test; | |
315 | $has_tested_latin1_aa = $test; | |
2f7f8cb1 | 316 | } |
f02497ec KW |
317 | elsif ($target_only_ascii && $pattern_only_ascii) { |
318 | ||
319 | # And, except for one set just to make sure, skip tests | |
320 | # where both elements in the pair are ASCII. If one works for | |
321 | # aa, the others are likely too. This skips tests where the | |
322 | # fold is from non-ASCII to ASCII, but this part of the test | |
323 | # is just about the ASCII components. | |
324 | next if defined $has_tested_ascii_l | |
325 | && $has_tested_ascii_l != $test; | |
326 | $has_tested_ascii_l = $test; | |
327 | } | |
2f7f8cb1 | 328 | } |
b0d6380c | 329 | elsif ($charset eq 'l') { |
6ed220eb KW |
330 | |
331 | # For l, don't need to test beyond one set those things that are | |
7b4853d1 KW |
332 | # all above latin1, because unlikely to have different successes |
333 | # than /u | |
b0d6380c | 334 | if (! $target_has_latin1 && ! $pattern_has_latin1) { |
ceb92b9b KW |
335 | next if defined $has_tested_above_latin1_l |
336 | && $has_tested_above_latin1_l != $test; | |
337 | $has_tested_above_latin1_l = $test; | |
b0d6380c | 338 | } |
6eea66eb KW |
339 | elsif ($target_only_ascii && $pattern_only_ascii) { |
340 | ||
341 | # And, except for one set just to make sure, skip tests | |
f02497ec KW |
342 | # where both elements in the pair are ASCII. This is |
343 | # essentially the same reasoning as above for /aa. | |
6eea66eb KW |
344 | next if defined $has_tested_ascii_l |
345 | && $has_tested_ascii_l != $test; | |
346 | $has_tested_ascii_l = $test; | |
347 | } | |
17580e7a | 348 | } |
f16e8484 KW |
349 | elsif ($charset eq 'd') { |
350 | # Similarly for d. Beyond one test (besides self) each, we don't | |
351 | # test pairs that are both ascii; or both above latin1, or are | |
352 | # combinations of ascii and above latin1. | |
353 | if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { | |
354 | if ($target_has_ascii && $pattern_has_ascii) { | |
963bd580 KW |
355 | next if defined $has_tested_ascii_d |
356 | && $has_tested_ascii_d != $test; | |
f16e8484 KW |
357 | $has_tested_ascii_d = $test |
358 | } | |
359 | elsif (! $target_has_latin1 && ! $pattern_has_latin1) { | |
360 | next if defined $has_tested_above_latin1_d | |
361 | && $has_tested_above_latin1_d != $test; | |
362 | $has_tested_above_latin1_d = $test; | |
363 | } | |
364 | else { | |
365 | next if defined $has_tested_non_latin1_d | |
366 | && $has_tested_non_latin1_d != $test; | |
367 | $has_tested_non_latin1_d = $test; | |
368 | } | |
963bd580 KW |
369 | } |
370 | } | |
419d8974 KW |
371 | elsif ($charset eq 'a') { |
372 | # Similarly for a. This should match identically to /u, so wasn't | |
373 | # tested at all until a bug was found that was thereby missed. | |
374 | # As a compromise, beyond one test (besides self) each, we don't | |
375 | # test pairs that are both ascii; or both above latin1, or are | |
376 | # combinations of ascii and above latin1. | |
377 | if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { | |
378 | if ($target_has_ascii && $pattern_has_ascii) { | |
379 | next if defined $has_tested_ascii_a | |
380 | && $has_tested_ascii_a != $test; | |
381 | $has_tested_ascii_a = $test | |
382 | } | |
383 | elsif (! $target_has_latin1 && ! $pattern_has_latin1) { | |
384 | next if defined $has_tested_above_latin1_a | |
385 | && $has_tested_above_latin1_a != $test; | |
386 | $has_tested_above_latin1_a = $test; | |
387 | } | |
388 | else { | |
389 | next if defined $has_tested_non_latin1_a | |
390 | && $has_tested_non_latin1_a != $test; | |
391 | $has_tested_non_latin1_a = $test; | |
392 | } | |
393 | } | |
394 | } | |
17580e7a | 395 | } |
2f7f8cb1 | 396 | |
a2d9a01a KW |
397 | foreach my $utf8_target (0, 1) { # Both utf8 and not, for |
398 | # code points < 256 | |
399 | my $upgrade_target = ""; | |
400 | ||
401 | # These must already be in utf8 because the string to match has | |
402 | # something above latin1. So impossible to test if to not to be in | |
403 | # utf8; and otherwise, no upgrade is needed. | |
404 | next if $target_above_latin1 && ! $utf8_target; | |
d08723ac | 405 | $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target; |
a2d9a01a | 406 | |
d08723ac KW |
407 | foreach my $utf8_pattern (0, 1) { |
408 | next if $pattern_above_latin1 && ! $utf8_pattern; | |
17580e7a KW |
409 | |
410 | # Our testing of 'l' uses the POSIX locale, which is ASCII-only | |
411 | my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/); | |
a2d9a01a | 412 | my $upgrade_pattern = ""; |
d08723ac | 413 | $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern; |
a2d9a01a KW |
414 | |
415 | my $lhs = join "", @x_target; | |
aa3ca102 | 416 | my $lhs_str = eval qq{"$lhs"}; fail($@) if $@; |
a2d9a01a | 417 | my @rhs = @x_pattern; |
371a505e | 418 | my $rhs = join "", @rhs; |
2f7f8cb1 | 419 | my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self) |
17580e7a KW |
420 | || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii) |
421 | || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1); | |
371a505e KW |
422 | |
423 | # Do simple tests of referencing capture buffers, named and | |
424 | # numbered. | |
425 | my $op = '=~'; | |
426 | $op = '!~' if $should_fail; | |
d2025f57 | 427 | |
7a0a13a6 KW |
428 | # I'm afraid this was derived from trial and error. |
429 | my $todo = ($test == 0xdf | |
430 | && $lhs =~ /DF/ | |
431 | && $uni_semantics | |
419d8974 KW |
432 | && ($charset eq 'u' || $charset eq 'a' || $charset eq 'd') |
433 | && ! (($charset eq 'u' || $charset eq 'a') | |
434 | && (($upgrade_target eq "") != ($upgrade_pattern eq ""))) | |
7a0a13a6 KW |
435 | && ! ($charset eq 'd' && (! $upgrade_target || ! $upgrade_pattern)) |
436 | ); | |
371a505e | 437 | my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
bb2fbce0 | 438 | run_test($eval, ++$count, $todo, ""); |
abf4d645 | 439 | |
371a505e | 440 | $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
bb2fbce0 | 441 | run_test($eval, ++$count, $todo, ""); |
abf4d645 | 442 | |
371a505e KW |
443 | if ($lhs ne $rhs) { |
444 | $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; | |
bb2fbce0 | 445 | run_test($eval, ++$count, "", ""); |
abf4d645 | 446 | |
371a505e | 447 | $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
bb2fbce0 | 448 | run_test($eval, ++$count, "", ""); |
371a505e | 449 | } |
371a505e | 450 | |
6f9cf5ec DM |
451 | # XXX Doesn't currently test multi-char folds in pattern |
452 | next if @pattern != 1; | |
453 | ||
67fc5dca DL |
454 | my $okays = 0; |
455 | my $this_iteration = 0; | |
456 | ||
1ef17b72 | 457 | foreach my $bracketed (0, 1) { # Put rhs in [...], or not |
a2d9a01a | 458 | foreach my $inverted (0,1) { |
d2025f57 | 459 | next if $inverted && ! $bracketed; # inversion only valid in [^...] |
a2d9a01a KW |
460 | |
461 | # In some cases, add an extra character that doesn't fold, and | |
462 | # looks ok in the output. | |
463 | my $extra_char = "_"; | |
464 | foreach my $prepend ("", $extra_char) { | |
465 | foreach my $append ("", $extra_char) { | |
a2d9a01a KW |
466 | |
467 | # Assemble the rhs. Put each character in a separate | |
468 | # bracketed if using charclasses. This creates a stress on | |
469 | # the code to span a match across multiple elements | |
470 | my $rhs = ""; | |
471 | foreach my $rhs_char (@rhs) { | |
472 | $rhs .= '[' if $bracketed; | |
473 | $rhs .= '^' if $inverted; | |
474 | $rhs .= $rhs_char; | |
475 | ||
476 | # Add a character to the class, so class doesn't get | |
477 | # optimized out | |
478 | $rhs .= '_]' if $bracketed; | |
479 | } | |
480 | ||
481 | # Add one of: no capturing parens | |
482 | # a single set | |
483 | # a nested set | |
484 | # Use quantifiers and extra variable width matches inside | |
485 | # them to keep some optimizations from happening | |
486 | foreach my $parend (0, 1, 2) { | |
487 | my $interior = (! $parend) | |
488 | ? $rhs | |
489 | : ($parend == 1) | |
490 | ? "(${rhs},?)" | |
491 | : "((${rhs})+,?)"; | |
492 | foreach my $quantifier ("", '?', '*', '+', '{1,3}') { | |
493 | ||
494 | # A ? or * quantifier normally causes the thing to be | |
495 | # able to match a null string | |
496 | my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*'; | |
497 | ||
498 | # But since we only quantify the last character in a | |
499 | # multiple fold, the other characters will have width, | |
500 | # except if we are quantifying the whole rhs | |
501 | my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend); | |
502 | ||
503 | foreach my $l_anchor ("", '^') { # '\A' didn't change result) | |
504 | foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result) | |
505 | ||
506 | # The folded part can match the null string if it | |
507 | # isn't required to have width, and there's not | |
508 | # something on one or both sides that force it to. | |
2f7f8cb1 KW |
509 | my $both_sides = ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append); |
510 | my $must_match = ! $can_match_null || $both_sides; | |
511 | # for performance, but doing this missed many failures | |
a2d9a01a | 512 | #next unless $must_match; |
d08723ac | 513 | my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; |
a2d9a01a | 514 | my $op; |
d08723ac | 515 | if ($must_match && $should_fail) { |
a2d9a01a KW |
516 | $op = 0; |
517 | } else { | |
518 | $op = 1; | |
519 | } | |
520 | $op = ! $op if $must_match && $inverted; | |
27f6057f KW |
521 | |
522 | if ($inverted && @target > 1) { | |
523 | # When doing an inverted match against a | |
524 | # multi-char target, and there is not something on | |
525 | # the left to anchor the match, if it shouldn't | |
526 | # succeed, skip, as what will happen (when working | |
527 | # correctly) is that it will match the first | |
528 | # position correctly, and then be inverted to not | |
529 | # match; then it will go to the second position | |
530 | # where it won't match, but get inverted to match, | |
531 | # and hence succeeding. | |
532 | next if ! ($l_anchor || $prepend) && ! $op; | |
533 | ||
534 | # Can't ever match for latin1 code points non-uni | |
535 | # semantics that have a inverted multi-char fold | |
536 | # when there is something on both sides and the | |
537 | # quantifier isn't such as to span the required | |
538 | # width, which is 2 or 3. | |
539 | $op = 0 if $ord < 255 | |
540 | && ! $uni_semantics | |
541 | && $both_sides | |
542 | && ( ! $quantifier || $quantifier eq '?') | |
543 | && $parend < 2; | |
544 | ||
545 | # Similarly can't ever match when inverting a multi-char | |
546 | # fold for /aa and the quantifier isn't sufficient | |
547 | # to allow it to span to both sides. | |
548 | $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2; | |
549 | ||
550 | # Or for /l | |
551 | $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2; | |
552 | } | |
553 | ||
85882dd7 DM |
554 | |
555 | my $desc = "my \$c = \"$prepend$lhs$append\"; " | |
556 | . "my \$p = qr/$quantified/i;" | |
557 | . "$upgrade_target$upgrade_pattern " | |
558 | . "\$c " . ($op ? "=~" : "!~") . " \$p; "; | |
559 | if ($DEBUG) { | |
560 | $desc .= ( | |
561 | "; uni_semantics=$uni_semantics, " | |
562 | . "should_fail=$should_fail, " | |
563 | . "bracketed=$bracketed, " | |
564 | . "prepend=$prepend, " | |
565 | . "append=$append, " | |
566 | . "parend=$parend, " | |
567 | . "quantifier=$quantifier, " | |
568 | . "l_anchor=$l_anchor, " | |
569 | . "r_anchor=$r_anchor; " | |
570 | . "pattern_above_latin1=$pattern_above_latin1; " | |
571 | . "utf8_pattern=$utf8_pattern" | |
572 | ); | |
573 | } | |
574 | ||
aa3ca102 DM |
575 | my $c = "$prepend$lhs_str$append"; |
576 | my $p = qr/$quantified/i; | |
577 | utf8::upgrade($c) if length($upgrade_target); | |
578 | utf8::upgrade($p) if length($upgrade_pattern); | |
85882dd7 | 579 | my $res = $op ? ($c =~ $p): ($c !~ $p); |
aa3ca102 | 580 | |
67fc5dca DL |
581 | if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) { |
582 | # Failed or debug; output the result | |
583 | $count++; | |
584 | ok($res, $desc); | |
585 | } else { | |
586 | # Just count the test as passed | |
587 | $okays++; | |
588 | } | |
589 | $this_iteration++; | |
a2d9a01a KW |
590 | } |
591 | } | |
592 | } | |
593 | } | |
594 | } | |
595 | } | |
596 | } | |
597 | } | |
67fc5dca DL |
598 | |
599 | unless($ENV{PERL_DEBUG_FULL_TEST}) { | |
600 | $count++; | |
601 | is $okays, $this_iteration, "Subtests okay for " | |
602 | . "charset=$charset, utf8_pattern=$utf8_pattern"; | |
603 | } | |
a2d9a01a KW |
604 | } |
605 | } | |
606 | } | |
607 | } | |
608 | } | |
609 | ||
abf4d645 | 610 | plan($count); |
a2d9a01a KW |
611 | |
612 | 1 |