Commit | Line | Data |
---|---|---|
371a505e | 1 | # Grind out a lot of combinatoric tests for folding. |
a2d9a01a | 2 | |
a2d9a01a KW |
3 | binmode STDOUT, ":utf8"; |
4 | ||
5 | BEGIN { | |
6 | chdir 't' if -d 't'; | |
7 | @INC = '../lib'; | |
8 | require './test.pl'; | |
a59efd0a | 9 | skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX"); |
a2d9a01a KW |
10 | } |
11 | ||
4fd7f5ff NC |
12 | use charnames ":full"; |
13 | ||
29d01a3e | 14 | my $DEBUG = 0; # Outputs extra information for debugging this .t |
abf4d645 | 15 | |
a2d9a01a KW |
16 | use strict; |
17 | use warnings; | |
d08723ac | 18 | use Encode; |
a59efd0a | 19 | use POSIX; |
a2d9a01a | 20 | |
ae937040 KW |
21 | # Special-cased characters in the .c's that we want to make sure get tested. |
22 | my %be_sure_to_test = ( | |
23 | "\xDF" => 1, # LATIN_SMALL_LETTER_SHARP_S | |
24 | "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S | |
25 | "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS | |
26 | "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS | |
27 | "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA | |
28 | "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA | |
29 | ); | |
30 | ||
31 | ||
a2d9a01a KW |
32 | # Tests both unicode and not, so make sure not implicitly testing unicode |
33 | no feature 'unicode_strings'; | |
34 | ||
35 | # Case-insensitive matching is a large and complicated issue. Perl does not | |
36 | # implement it fully, properly. For example, it doesn't include normalization | |
37 | # as part of the equation. To test every conceivable combination is clearly | |
38 | # impossible; these tests are mostly drawn from visual inspection of the code | |
39 | # and experience, trying to exercise all areas. | |
40 | ||
41 | # There are three basic ranges of characters that Perl may treat differently: | |
42 | # 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are | |
43 | # referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants | |
44 | # are all controls that fold to themselves. | |
45 | my $ASCII = 1; | |
46 | ||
47 | # 2) Other characters that fit into a byte but are different in utf8 than not; | |
48 | # here referred to, taking some liberties, as Latin1. | |
49 | my $Latin1 = 2; | |
50 | ||
51 | # 3) Characters that won't fit in a byte; here referred to as Unicode | |
52 | my $Unicode = 3; | |
53 | ||
54 | # Within these basic groups are equivalence classes that testing any character | |
55 | # in is likely to lead to the same results as any other character. This is | |
56 | # used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is | |
57 | # set. | |
58 | my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS}; | |
59 | ||
67fc5dca DL |
60 | # Additionally parts of this test run a lot of subtests, outputting the |
61 | # resulting TAP can be expensive so the tests are summarised internally. The | |
62 | # PERL_DEBUG_FULL_TEST environment variable can be set to produce the full | |
63 | # output for debugging purposes. | |
64 | ||
a2d9a01a | 65 | sub range_type { |
ae937040 | 66 | my $ord = ord shift; |
a2d9a01a KW |
67 | |
68 | return $ASCII if $ord < 128; | |
69 | return $Latin1 if $ord < 256; | |
70 | return $Unicode; | |
71 | } | |
72 | ||
73 | sub numerically { | |
74 | return $a <=> $b | |
75 | } | |
76 | ||
5f01d936 KW |
77 | # Significant time is saved by not outputting each test but grouping the |
78 | # output into subtests | |
79 | my $okays; # Number of ok's in current subtest | |
80 | my $this_iteration; # Number of possible tests in current subtest | |
81 | my $count=0; # Number of subtests = number of total tests | |
82 | ||
83 | sub run_test($$$) { | |
84 | my ($test, $todo, $debug) = @_; | |
abf4d645 KW |
85 | |
86 | $debug = "" unless $DEBUG; | |
5f01d936 KW |
87 | my $res = eval $test; |
88 | ||
89 | if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) { | |
90 | # Failed or debug; output the result | |
91 | $count++; | |
92 | ok($res, "$test; $debug"); | |
93 | } else { | |
94 | # Just count the test as passed | |
95 | $okays++; | |
96 | } | |
97 | $this_iteration++; | |
abf4d645 KW |
98 | } |
99 | ||
ae937040 KW |
100 | my %has_test_by_participants; # Makes sure has tests for each range and each |
101 | # number of characters that fold to the same | |
102 | # thing | |
103 | my %has_test_by_byte_count; # Makes sure has tests for each combination of | |
104 | # n bytes folds to m bytes | |
105 | ||
106 | my %tests; # The set of tests. | |
107 | # Each key is a code point that folds to something else. | |
108 | # Each value is a list of things that the key folds to. If the 'thing' is a | |
109 | # single code point, it is that ordinal. If it is a multi-char fold, it is an | |
110 | # ordered list of the code points in that fold. Here's an example for 'S': | |
111 | # '83' => [ 115, 383 ] | |
112 | # | |
113 | # And one for a multi-char fold: \xDF | |
114 | # 223 => [ | |
115 | # [ # 'ss' | |
116 | # 83, | |
117 | # 83 | |
118 | # ], | |
119 | # [ # 'SS' | |
120 | # 115, | |
121 | # 115 | |
122 | # ], | |
123 | # [ # LATIN SMALL LETTER LONG S | |
124 | # 383, | |
125 | # 383 | |
126 | # ], | |
127 | # 7838 # LATIN_CAPITAL_LETTER_SHARP_S | |
128 | # ], | |
129 | ||
130 | my %inverse_folds; # keys are strings of the folded-to; | |
131 | # values are lists of characters that fold to them | |
132 | ||
133 | sub add_test($@) { | |
134 | my ($to, @from) = @_; | |
135 | ||
136 | # Called to cause the input to be tested by adding to %tests. @from is | |
137 | # the list of characters that fold to the string $to. @from should be | |
138 | # sorted so the lowest code point is first.... | |
139 | # The input is in string form; %tests uses code points, so have to | |
140 | # convert. | |
141 | ||
142 | my $to_chars = length $to; | |
143 | my @test_to; # List of tests for $to | |
144 | ||
145 | if ($to_chars == 1) { | |
146 | @test_to = ord $to; | |
147 | } | |
148 | else { | |
149 | push @test_to, [ map { ord $_ } split "", $to ]; | |
150 | ||
151 | # For multi-char folds, we also test that things that can fold to each | |
152 | # individual character in the fold also work. If we were testing | |
153 | # comprehensively, we would try every combination of upper and lower | |
154 | # case in the fold, but it will have to suffice to avoid running | |
155 | # forever to make sure that each thing that folds to these is tested | |
156 | # at least once. Because of complement matching, we need to do both | |
157 | # the folded, and the folded-from. | |
158 | # We first look at each character in the multi-char fold, and save how | |
159 | # many characters fold to it; and also the maximum number of such | |
160 | # folds | |
161 | my @folds_to_count; # 0th char in fold is index 0 ... | |
162 | my $max_folds_to = 0; | |
163 | ||
164 | for (my $i = 0; $i < $to_chars; $i++) { | |
165 | my $to_char = substr($to, $i, 1); | |
166 | if (exists $inverse_folds{$to_char}) { | |
167 | $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}}; | |
168 | $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i]; | |
169 | } | |
170 | else { | |
171 | $folds_to_count[$i] = 0; | |
172 | } | |
173 | } | |
174 | ||
175 | # We will need to generate as many tests as the maximum number of | |
176 | # folds, so that each fold will have at least one test. | |
177 | for (my $i = 0; $i < $max_folds_to; $i++) { | |
178 | my @this_test_to; # Assemble a single test | |
179 | ||
180 | # For each character in the multi-char fold ... | |
181 | for (my $j = 0; $j < $to_chars; $j++) { | |
182 | my $this_char = substr($to, $j, 1); | |
183 | ||
184 | # Use its corresponding inverse fold, if available. | |
185 | if ($i < $folds_to_count[$j]) { | |
186 | push @this_test_to, ord $inverse_folds{$this_char}[$i]; | |
187 | } | |
188 | else { # Or else itself. | |
189 | push @this_test_to, ord $this_char; | |
190 | } | |
191 | } | |
192 | ||
193 | # Add this test to the list | |
194 | push @test_to, [ @this_test_to ]; | |
195 | } | |
196 | ||
197 | # Here, have assembled all the tests for the multi-char fold. Sort so | |
198 | # lowest code points are first for consistency and aesthetics in | |
199 | # output. We know there are at least two characters in the fold, but | |
200 | # I haven't bothered to worry about sorting on an optional third | |
201 | # character if the first two are identical. | |
202 | @test_to = sort { ($a->[0] == $b->[0]) | |
203 | ? $a->[1] <=> $b->[1] | |
204 | : $a->[0] <=> $b->[0] | |
205 | } @test_to; | |
206 | } | |
207 | ||
208 | ||
209 | # This test is from n bytes to m bytes. Record that so won't try to add | |
210 | # another test that does the same. | |
211 | use bytes; | |
212 | my $to_bytes = length $to; | |
213 | foreach my $from_map (@from) { | |
214 | $has_test_by_byte_count{length $from_map}{$to_bytes} = $to; | |
215 | } | |
216 | no bytes; | |
217 | ||
218 | my $ord_smallest_from = ord shift @from; | |
219 | if (exists $tests{$ord_smallest_from}) { | |
220 | die "There are already tests for $ord_smallest_from" | |
221 | }; | |
222 | ||
223 | # Add in the fold tests, | |
224 | push @{$tests{$ord_smallest_from}}, @test_to; | |
225 | ||
226 | # Then any remaining froms in the equivalence class. | |
227 | push @{$tests{$ord_smallest_from}}, map { ord $_ } @from; | |
228 | } | |
229 | ||
230 | # Read the Unicode rules file and construct inverse mappings from it | |
a2d9a01a | 231 | |
a2d9a01a KW |
232 | my $file="../lib/unicore/CaseFolding.txt"; |
233 | open my $fh, "<", $file or die "Failed to read '$file': $!"; | |
ae937040 | 234 | |
a2d9a01a KW |
235 | while (<$fh>) { |
236 | chomp; | |
237 | ||
238 | # Lines look like (though without the initial '#') | |
239 | #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
240 | ||
241 | my ($line, $comment) = split / \s+ \# \s+ /x, $_; | |
ae937040 KW |
242 | next if $line eq "" || $line =~ /^#/; |
243 | my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line; | |
a2d9a01a | 244 | |
ae937040 KW |
245 | next if $fold_type eq 'T'; # Perl doesn't do Turkish folding |
246 | next if $fold_type eq 'S'; # If Unicode's tables are correct, the F | |
247 | # should be a superset of S | |
a2d9a01a | 248 | |
ae937040 KW |
249 | my $folded_str = pack ("U0U*", map { hex $_ } @hex_folded); |
250 | push @{$inverse_folds{$folded_str}}, chr hex $hex_from; | |
a2d9a01a KW |
251 | } |
252 | ||
ae937040 KW |
253 | # Analyze the data and generate tests to get adequate test coverage. We sort |
254 | # things so that smallest code points are done first. | |
255 | TO: | |
256 | foreach my $to (sort { (length $a == length $b) | |
257 | ? $a cmp $b | |
258 | : length $a <=> length $b | |
259 | } keys %inverse_folds) { | |
260 | ||
261 | # Within each fold, sort so that the smallest code points are done first | |
262 | @{$inverse_folds{$to}} = sort { $a cmp $b } @{$inverse_folds{$to}}; | |
263 | my @from = @{$inverse_folds{$to}}; | |
264 | ||
265 | # Just add it to the tests if doing complete coverage | |
266 | if (! $skip_apparently_redundant) { | |
267 | add_test($to, @from); | |
268 | next TO; | |
a2d9a01a | 269 | } |
ae937040 KW |
270 | |
271 | my $to_chars = length $to; | |
272 | my $to_range_type = range_type(substr($to, 0, 1)); | |
273 | ||
274 | # If this is required to be tested, do so. We check for these first, as | |
275 | # they will take up slots of byte-to-byte combinations that we otherwise | |
276 | # would have to have other tests to get. | |
277 | foreach my $from_map (@from) { | |
278 | if (exists $be_sure_to_test{$from_map}) { | |
279 | add_test($to, @from); | |
280 | next TO; | |
281 | } | |
a7caa9e8 | 282 | } |
a2d9a01a | 283 | |
ae937040 KW |
284 | # If the fold contains heterogeneous range types, is suspect and should be |
285 | # tested. | |
286 | if ($to_chars > 1) { | |
287 | foreach my $char (split "", $to) { | |
288 | if (range_type($char) != $to_range_type) { | |
289 | add_test($to, @from); | |
290 | next TO; | |
291 | } | |
292 | } | |
293 | } | |
294 | ||
295 | # If the mapping crosses range types, is suspect and should be tested | |
296 | foreach my $from_map (@from) { | |
297 | if (range_type($from_map) != $to_range_type) { | |
298 | add_test($to, @from); | |
299 | next TO; | |
a2d9a01a KW |
300 | } |
301 | } | |
a2d9a01a | 302 | |
ae937040 KW |
303 | # Here, all components of the mapping are in the same range type. For |
304 | # single character folds, we test one case in each range type that has 2 | |
305 | # particpants, 3 particpants, etc. | |
306 | if ($to_chars == 1) { | |
307 | if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) { | |
308 | add_test($to, @from); | |
309 | $has_test_by_participants{scalar @from}{$to_range_type} = $to; | |
310 | next TO; | |
311 | } | |
312 | } | |
313 | ||
314 | # We also test all combinations of mappings from m to n bytes. This is | |
315 | # because the regex optimizer cares. (Don't bother worrying about that | |
316 | # Latin1 chars will occupy a different number of bytes under utf8, as | |
317 | # there are plenty of other cases that catch these byte numbers.) | |
318 | use bytes; | |
319 | my $to_bytes = length $to; | |
320 | foreach my $from_map (@from) { | |
321 | if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) { | |
322 | add_test($to, @from); | |
323 | next TO; | |
a2d9a01a KW |
324 | } |
325 | } | |
326 | } | |
327 | ||
328 | # For each range type, test additionally a character that folds to itself | |
ae937040 KW |
329 | add_test(chr 0x3A, chr 0x3A); |
330 | add_test(chr 0xF7, chr 0xF7); | |
331 | add_test(chr 0x2C7, chr 0x2C7); | |
a2d9a01a | 332 | |
2f7f8cb1 KW |
333 | # To cut down on the number of tests |
334 | my $has_tested_aa_above_latin1; | |
335 | my $has_tested_latin1_aa; | |
6eea66eb | 336 | my $has_tested_ascii_aa; |
17580e7a | 337 | my $has_tested_l_above_latin1; |
ceb92b9b | 338 | my $has_tested_above_latin1_l; |
6eea66eb | 339 | my $has_tested_ascii_l; |
963bd580 KW |
340 | my $has_tested_above_latin1_d; |
341 | my $has_tested_ascii_d; | |
f16e8484 | 342 | my $has_tested_non_latin1_d; |
419d8974 KW |
343 | my $has_tested_above_latin1_a; |
344 | my $has_tested_ascii_a; | |
345 | my $has_tested_non_latin1_a; | |
2f7f8cb1 | 346 | |
a2d9a01a KW |
347 | # For use by pairs() in generating combinations |
348 | sub prefix { | |
349 | my $p = shift; | |
a7caa9e8 | 350 | map [ $p, $_ ], @_ |
a2d9a01a KW |
351 | } |
352 | ||
353 | # Returns all ordered combinations of pairs of elements from the input array. | |
354 | # It doesn't return pairs like (a, a), (b, b). Change the slice to an array | |
355 | # to do that. This was just to have fewer tests. | |
a7caa9e8 | 356 | sub pairs (@) { |
a2d9a01a | 357 | #print __LINE__, ": ", join(" XXX ", @_), "\n"; |
a7caa9e8 | 358 | map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_ |
a2d9a01a KW |
359 | } |
360 | ||
419d8974 | 361 | my @charsets = qw(d u a aa); |
a59efd0a | 362 | my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // ""; |
24282c4e KW |
363 | if ($current_locale eq 'C') { |
364 | use locale; | |
365 | ||
6cc59f4c | 366 | # Some locale implementations don't have the range 128-255 characters all |
24282c4e KW |
367 | # mean nothing. Skip the locale tests in that situation. |
368 | for my $i (128 .. 255) { | |
369 | my $char = chr($i); | |
370 | goto bad_locale if uc($char) ne $char || lc($char) ne $char; | |
371 | } | |
372 | push @charsets, 'l'; | |
373 | bad_locale: | |
374 | } | |
a2d9a01a KW |
375 | |
376 | # Finally ready to do the tests | |
a2d9a01a KW |
377 | foreach my $test (sort { numerically } keys %tests) { |
378 | ||
379 | my $previous_target; | |
380 | my $previous_pattern; | |
381 | my @pairs = pairs(sort numerically $test, @{$tests{$test}}); | |
382 | ||
383 | # Each fold can be viewed as a closure of all the characters that | |
384 | # participate in it. Look at each possible pairing from a closure, with the | |
385 | # first member of the pair the target string to match against, and the | |
386 | # second member forming the pattern. Thus each fold member gets tested as | |
387 | # the string, and the pattern with every other member in the opposite role. | |
388 | while (my $pair = shift @pairs) { | |
389 | my ($target, $pattern) = @$pair; | |
390 | ||
391 | # When testing a char that doesn't fold, we can get the same | |
392 | # permutation twice; so skip all but the first. | |
393 | next if $previous_target | |
394 | && $previous_target == $target | |
395 | && $previous_pattern == $pattern; | |
396 | ($previous_target, $previous_pattern) = ($target, $pattern); | |
397 | ||
398 | # Each side may be either a single char or a string. Extract each into an | |
399 | # array (perhaps of length 1) | |
400 | my @target, my @pattern; | |
401 | @target = (ref $target) ? @$target : $target; | |
402 | @pattern = (ref $pattern) ? @$pattern : $pattern; | |
403 | ||
6cc59f4c KW |
404 | # We are testing just folds to/from a single character. If our pairs |
405 | # happens to generate multi/multi, skip. | |
406 | next if @target > 1 && @pattern > 1; | |
407 | ||
a2d9a01a KW |
408 | # Have to convert non-utf8 chars to native char set |
409 | @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target; | |
410 | @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern; | |
411 | ||
412 | # Get in hex form. | |
413 | my @x_target = map { sprintf "\\x{%04X}", $_ } @target; | |
414 | my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern; | |
415 | ||
416 | my $target_above_latin1 = grep { $_ > 255 } @target; | |
417 | my $pattern_above_latin1 = grep { $_ > 255 } @pattern; | |
2f7f8cb1 KW |
418 | my $target_has_ascii = grep { $_ < 128 } @target; |
419 | my $pattern_has_ascii = grep { $_ < 128 } @pattern; | |
6eea66eb KW |
420 | my $target_only_ascii = ! grep { $_ > 127 } @target; |
421 | my $pattern_only_ascii = ! grep { $_ > 127 } @pattern; | |
17580e7a | 422 | my $target_has_latin1 = grep { $_ < 256 } @target; |
f16e8484 KW |
423 | my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target; |
424 | my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern; | |
17580e7a | 425 | my $pattern_has_latin1 = grep { $_ < 256 } @pattern; |
a2d9a01a KW |
426 | my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0]; |
427 | ||
428 | # We don't test multi-char folding into other multi-chars. We are testing | |
429 | # a code point that folds to or from other characters. Find the single | |
430 | # code point for diagnostic purposes. (If both are single, choose the | |
431 | # target string) | |
432 | my $ord = @target == 1 ? $target[0] : $pattern[0]; | |
7fea222d KW |
433 | my $progress = sprintf "%04X: \"%s\" and /%s/", |
434 | $test, | |
a2d9a01a KW |
435 | join("", @x_target), |
436 | join("", @x_pattern); | |
5473c576 | 437 | #note $progress; |
a2d9a01a KW |
438 | |
439 | # Now grind out tests, using various combinations. | |
a59efd0a | 440 | foreach my $charset (@charsets) { |
5f01d936 KW |
441 | $okays = 0; |
442 | $this_iteration = 0; | |
2f7f8cb1 | 443 | |
63fb01f9 KW |
444 | # To cut down somewhat on the enormous quantity of tests this currently |
445 | # runs, skip some for some of the character sets whose results aren't | |
446 | # likely to differ from others. But run all tests on the code points | |
447 | # that don't fold, plus one other set in each range group. | |
448 | if (! $is_self) { | |
449 | ||
b0d6380c KW |
450 | # /aa should only affect things with folds in the ASCII range. But, try |
451 | # it on one set in the other ranges just to make sure it doesn't break | |
452 | # them. | |
453 | if ($charset eq 'aa') { | |
454 | if (! $target_has_ascii && ! $pattern_has_ascii) { | |
455 | if ($target_above_latin1 || $pattern_above_latin1) { | |
456 | next if defined $has_tested_aa_above_latin1 | |
457 | && $has_tested_aa_above_latin1 != $test; | |
458 | $has_tested_aa_above_latin1 = $test; | |
459 | } | |
460 | next if defined $has_tested_latin1_aa | |
461 | && $has_tested_latin1_aa != $test; | |
462 | $has_tested_latin1_aa = $test; | |
2f7f8cb1 | 463 | } |
f02497ec KW |
464 | elsif ($target_only_ascii && $pattern_only_ascii) { |
465 | ||
466 | # And, except for one set just to make sure, skip tests | |
467 | # where both elements in the pair are ASCII. If one works for | |
468 | # aa, the others are likely too. This skips tests where the | |
469 | # fold is from non-ASCII to ASCII, but this part of the test | |
470 | # is just about the ASCII components. | |
471 | next if defined $has_tested_ascii_l | |
472 | && $has_tested_ascii_l != $test; | |
473 | $has_tested_ascii_l = $test; | |
474 | } | |
2f7f8cb1 | 475 | } |
b0d6380c | 476 | elsif ($charset eq 'l') { |
6ed220eb KW |
477 | |
478 | # For l, don't need to test beyond one set those things that are | |
7b4853d1 KW |
479 | # all above latin1, because unlikely to have different successes |
480 | # than /u | |
b0d6380c | 481 | if (! $target_has_latin1 && ! $pattern_has_latin1) { |
ceb92b9b KW |
482 | next if defined $has_tested_above_latin1_l |
483 | && $has_tested_above_latin1_l != $test; | |
484 | $has_tested_above_latin1_l = $test; | |
b0d6380c | 485 | } |
6eea66eb KW |
486 | elsif ($target_only_ascii && $pattern_only_ascii) { |
487 | ||
488 | # And, except for one set just to make sure, skip tests | |
f02497ec KW |
489 | # where both elements in the pair are ASCII. This is |
490 | # essentially the same reasoning as above for /aa. | |
6eea66eb KW |
491 | next if defined $has_tested_ascii_l |
492 | && $has_tested_ascii_l != $test; | |
493 | $has_tested_ascii_l = $test; | |
494 | } | |
17580e7a | 495 | } |
f16e8484 KW |
496 | elsif ($charset eq 'd') { |
497 | # Similarly for d. Beyond one test (besides self) each, we don't | |
498 | # test pairs that are both ascii; or both above latin1, or are | |
499 | # combinations of ascii and above latin1. | |
500 | if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { | |
501 | if ($target_has_ascii && $pattern_has_ascii) { | |
963bd580 KW |
502 | next if defined $has_tested_ascii_d |
503 | && $has_tested_ascii_d != $test; | |
f16e8484 KW |
504 | $has_tested_ascii_d = $test |
505 | } | |
506 | elsif (! $target_has_latin1 && ! $pattern_has_latin1) { | |
507 | next if defined $has_tested_above_latin1_d | |
508 | && $has_tested_above_latin1_d != $test; | |
509 | $has_tested_above_latin1_d = $test; | |
510 | } | |
511 | else { | |
512 | next if defined $has_tested_non_latin1_d | |
513 | && $has_tested_non_latin1_d != $test; | |
514 | $has_tested_non_latin1_d = $test; | |
515 | } | |
963bd580 KW |
516 | } |
517 | } | |
419d8974 KW |
518 | elsif ($charset eq 'a') { |
519 | # Similarly for a. This should match identically to /u, so wasn't | |
520 | # tested at all until a bug was found that was thereby missed. | |
521 | # As a compromise, beyond one test (besides self) each, we don't | |
522 | # test pairs that are both ascii; or both above latin1, or are | |
523 | # combinations of ascii and above latin1. | |
524 | if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) { | |
525 | if ($target_has_ascii && $pattern_has_ascii) { | |
526 | next if defined $has_tested_ascii_a | |
527 | && $has_tested_ascii_a != $test; | |
528 | $has_tested_ascii_a = $test | |
529 | } | |
530 | elsif (! $target_has_latin1 && ! $pattern_has_latin1) { | |
531 | next if defined $has_tested_above_latin1_a | |
532 | && $has_tested_above_latin1_a != $test; | |
533 | $has_tested_above_latin1_a = $test; | |
534 | } | |
535 | else { | |
536 | next if defined $has_tested_non_latin1_a | |
537 | && $has_tested_non_latin1_a != $test; | |
538 | $has_tested_non_latin1_a = $test; | |
539 | } | |
540 | } | |
541 | } | |
17580e7a | 542 | } |
2f7f8cb1 | 543 | |
a2d9a01a KW |
544 | foreach my $utf8_target (0, 1) { # Both utf8 and not, for |
545 | # code points < 256 | |
546 | my $upgrade_target = ""; | |
547 | ||
548 | # These must already be in utf8 because the string to match has | |
549 | # something above latin1. So impossible to test if to not to be in | |
550 | # utf8; and otherwise, no upgrade is needed. | |
551 | next if $target_above_latin1 && ! $utf8_target; | |
d08723ac | 552 | $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target; |
a2d9a01a | 553 | |
d08723ac KW |
554 | foreach my $utf8_pattern (0, 1) { |
555 | next if $pattern_above_latin1 && ! $utf8_pattern; | |
17580e7a KW |
556 | |
557 | # Our testing of 'l' uses the POSIX locale, which is ASCII-only | |
558 | my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/); | |
a2d9a01a | 559 | my $upgrade_pattern = ""; |
d08723ac | 560 | $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern; |
a2d9a01a KW |
561 | |
562 | my $lhs = join "", @x_target; | |
aa3ca102 | 563 | my $lhs_str = eval qq{"$lhs"}; fail($@) if $@; |
a2d9a01a | 564 | my @rhs = @x_pattern; |
371a505e | 565 | my $rhs = join "", @rhs; |
2f7f8cb1 | 566 | my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self) |
17580e7a KW |
567 | || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii) |
568 | || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1); | |
371a505e KW |
569 | |
570 | # Do simple tests of referencing capture buffers, named and | |
571 | # numbered. | |
572 | my $op = '=~'; | |
573 | $op = '!~' if $should_fail; | |
d2025f57 | 574 | |
6d7cd591 | 575 | my $todo = 0; # No longer any todo's |
371a505e | 576 | my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
5f01d936 | 577 | run_test($eval, $todo, ""); |
abf4d645 | 578 | |
371a505e | 579 | $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
5f01d936 | 580 | run_test($eval, $todo, ""); |
abf4d645 | 581 | |
371a505e KW |
582 | if ($lhs ne $rhs) { |
583 | $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; | |
5f01d936 | 584 | run_test($eval, "", ""); |
abf4d645 | 585 | |
371a505e | 586 | $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; |
5f01d936 | 587 | run_test($eval, "", ""); |
371a505e | 588 | } |
371a505e | 589 | |
fbebf34e KW |
590 | # See if works on what could be a simple trie. |
591 | $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|xyz/i$charset;$upgrade_target$upgrade_pattern \$c $op \$p"; | |
5f01d936 | 592 | run_test($eval, "", ""); |
67fc5dca | 593 | |
1ef17b72 | 594 | foreach my $bracketed (0, 1) { # Put rhs in [...], or not |
eee0f761 KW |
595 | next if $bracketed && @pattern != 1; # bracketed makes these |
596 | # or's instead of a sequence | |
a2d9a01a | 597 | foreach my $inverted (0,1) { |
d2025f57 | 598 | next if $inverted && ! $bracketed; # inversion only valid in [^...] |
827f5bb8 KW |
599 | next if $inverted && @target != 1; # [perl #89750] multi-char |
600 | # not valid in [^...] | |
a2d9a01a KW |
601 | |
602 | # In some cases, add an extra character that doesn't fold, and | |
603 | # looks ok in the output. | |
604 | my $extra_char = "_"; | |
605 | foreach my $prepend ("", $extra_char) { | |
606 | foreach my $append ("", $extra_char) { | |
a2d9a01a KW |
607 | |
608 | # Assemble the rhs. Put each character in a separate | |
609 | # bracketed if using charclasses. This creates a stress on | |
610 | # the code to span a match across multiple elements | |
611 | my $rhs = ""; | |
612 | foreach my $rhs_char (@rhs) { | |
613 | $rhs .= '[' if $bracketed; | |
614 | $rhs .= '^' if $inverted; | |
615 | $rhs .= $rhs_char; | |
616 | ||
617 | # Add a character to the class, so class doesn't get | |
618 | # optimized out | |
619 | $rhs .= '_]' if $bracketed; | |
620 | } | |
621 | ||
622 | # Add one of: no capturing parens | |
623 | # a single set | |
624 | # a nested set | |
625 | # Use quantifiers and extra variable width matches inside | |
626 | # them to keep some optimizations from happening | |
627 | foreach my $parend (0, 1, 2) { | |
628 | my $interior = (! $parend) | |
629 | ? $rhs | |
630 | : ($parend == 1) | |
631 | ? "(${rhs},?)" | |
632 | : "((${rhs})+,?)"; | |
633 | foreach my $quantifier ("", '?', '*', '+', '{1,3}') { | |
634 | ||
eee0f761 KW |
635 | # Perhaps should be TODOs, as are unimplemented, but |
636 | # maybe will never be implemented | |
637 | next if @pattern != 1 && $quantifier; | |
638 | ||
a2d9a01a KW |
639 | # A ? or * quantifier normally causes the thing to be |
640 | # able to match a null string | |
641 | my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*'; | |
642 | ||
643 | # But since we only quantify the last character in a | |
644 | # multiple fold, the other characters will have width, | |
645 | # except if we are quantifying the whole rhs | |
646 | my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend); | |
647 | ||
648 | foreach my $l_anchor ("", '^') { # '\A' didn't change result) | |
649 | foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result) | |
650 | ||
651 | # The folded part can match the null string if it | |
652 | # isn't required to have width, and there's not | |
653 | # something on one or both sides that force it to. | |
2f7f8cb1 KW |
654 | my $both_sides = ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append); |
655 | my $must_match = ! $can_match_null || $both_sides; | |
656 | # for performance, but doing this missed many failures | |
a2d9a01a | 657 | #next unless $must_match; |
d08723ac | 658 | my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; |
a2d9a01a | 659 | my $op; |
d08723ac | 660 | if ($must_match && $should_fail) { |
a2d9a01a KW |
661 | $op = 0; |
662 | } else { | |
663 | $op = 1; | |
664 | } | |
665 | $op = ! $op if $must_match && $inverted; | |
27f6057f KW |
666 | |
667 | if ($inverted && @target > 1) { | |
668 | # When doing an inverted match against a | |
669 | # multi-char target, and there is not something on | |
670 | # the left to anchor the match, if it shouldn't | |
671 | # succeed, skip, as what will happen (when working | |
672 | # correctly) is that it will match the first | |
673 | # position correctly, and then be inverted to not | |
674 | # match; then it will go to the second position | |
675 | # where it won't match, but get inverted to match, | |
676 | # and hence succeeding. | |
677 | next if ! ($l_anchor || $prepend) && ! $op; | |
678 | ||
679 | # Can't ever match for latin1 code points non-uni | |
680 | # semantics that have a inverted multi-char fold | |
681 | # when there is something on both sides and the | |
682 | # quantifier isn't such as to span the required | |
683 | # width, which is 2 or 3. | |
684 | $op = 0 if $ord < 255 | |
685 | && ! $uni_semantics | |
686 | && $both_sides | |
687 | && ( ! $quantifier || $quantifier eq '?') | |
688 | && $parend < 2; | |
689 | ||
690 | # Similarly can't ever match when inverting a multi-char | |
691 | # fold for /aa and the quantifier isn't sufficient | |
692 | # to allow it to span to both sides. | |
693 | $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2; | |
694 | ||
695 | # Or for /l | |
696 | $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2; | |
697 | } | |
698 | ||
85882dd7 DM |
699 | |
700 | my $desc = "my \$c = \"$prepend$lhs$append\"; " | |
701 | . "my \$p = qr/$quantified/i;" | |
702 | . "$upgrade_target$upgrade_pattern " | |
703 | . "\$c " . ($op ? "=~" : "!~") . " \$p; "; | |
704 | if ($DEBUG) { | |
705 | $desc .= ( | |
706 | "; uni_semantics=$uni_semantics, " | |
707 | . "should_fail=$should_fail, " | |
708 | . "bracketed=$bracketed, " | |
709 | . "prepend=$prepend, " | |
710 | . "append=$append, " | |
711 | . "parend=$parend, " | |
712 | . "quantifier=$quantifier, " | |
713 | . "l_anchor=$l_anchor, " | |
714 | . "r_anchor=$r_anchor; " | |
715 | . "pattern_above_latin1=$pattern_above_latin1; " | |
716 | . "utf8_pattern=$utf8_pattern" | |
717 | ); | |
718 | } | |
719 | ||
aa3ca102 DM |
720 | my $c = "$prepend$lhs_str$append"; |
721 | my $p = qr/$quantified/i; | |
722 | utf8::upgrade($c) if length($upgrade_target); | |
723 | utf8::upgrade($p) if length($upgrade_pattern); | |
85882dd7 | 724 | my $res = $op ? ($c =~ $p): ($c !~ $p); |
aa3ca102 | 725 | |
67fc5dca DL |
726 | if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) { |
727 | # Failed or debug; output the result | |
728 | $count++; | |
729 | ok($res, $desc); | |
730 | } else { | |
731 | # Just count the test as passed | |
732 | $okays++; | |
733 | } | |
734 | $this_iteration++; | |
a2d9a01a KW |
735 | } |
736 | } | |
737 | } | |
738 | } | |
739 | } | |
740 | } | |
741 | } | |
742 | } | |
743 | } | |
744 | } | |
5f01d936 KW |
745 | unless($ENV{PERL_DEBUG_FULL_TEST}) { |
746 | $count++; | |
747 | is $okays, $this_iteration, "$okays subtests ok for" | |
748 | . " /$charset," | |
749 | . ' target="' . join("", @x_target) . '",' | |
750 | . ' pat="' . join("", @x_pattern) . '"'; | |
751 | } | |
a2d9a01a KW |
752 | } |
753 | } | |
754 | } | |
755 | ||
abf4d645 | 756 | plan($count); |
a2d9a01a KW |
757 | |
758 | 1 |