This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add warning message for locale/Unicode intermixing
[perl5.git] / t / re / fold_grind.t
CommitLineData
371a505e 1# Grind out a lot of combinatoric tests for folding.
a2d9a01a 2
a2d9a01a
KW
3binmode STDOUT, ":utf8";
4
5BEGIN {
6 chdir 't' if -d 't';
7 @INC = '../lib';
173ee337 8 require './test.pl'; require './charset_tools.pl';
569f7fc5 9 require Config; import Config;
a59efd0a 10 skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
fa2edc1a
JH
11 if ($^O eq 'dec_osf') {
12 skip_all("$^O cannot handle this test");
13 }
31f05a37 14 require './loc_tools.pl';
a2d9a01a
KW
15}
16
4fd7f5ff
NC
17use charnames ":full";
18
29d01a3e 19my $DEBUG = 0; # Outputs extra information for debugging this .t
abf4d645 20
a2d9a01a
KW
21use strict;
22use warnings;
ab0b796c 23no warnings 'locale'; # Plenty of these would otherwise get generated
d08723ac 24use Encode;
a59efd0a 25use POSIX;
a2d9a01a 26
ae937040
KW
27# Special-cased characters in the .c's that we want to make sure get tested.
28my %be_sure_to_test = (
29 "\xDF" => 1, # LATIN_SMALL_LETTER_SHARP_S
30 "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S
31 "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
32 "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
33 "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
34 "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
35 );
36
37
a2d9a01a
KW
38# Tests both unicode and not, so make sure not implicitly testing unicode
39no feature 'unicode_strings';
40
41# Case-insensitive matching is a large and complicated issue. Perl does not
42# implement it fully, properly. For example, it doesn't include normalization
43# as part of the equation. To test every conceivable combination is clearly
44# impossible; these tests are mostly drawn from visual inspection of the code
45# and experience, trying to exercise all areas.
46
47# There are three basic ranges of characters that Perl may treat differently:
48# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
49# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants
50# are all controls that fold to themselves.
51my $ASCII = 1;
52
53# 2) Other characters that fit into a byte but are different in utf8 than not;
54# here referred to, taking some liberties, as Latin1.
55my $Latin1 = 2;
56
57# 3) Characters that won't fit in a byte; here referred to as Unicode
58my $Unicode = 3;
59
60# Within these basic groups are equivalence classes that testing any character
61# in is likely to lead to the same results as any other character. This is
62# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
63# set.
64my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
65
67fc5dca
DL
66# Additionally parts of this test run a lot of subtests, outputting the
67# resulting TAP can be expensive so the tests are summarised internally. The
68# PERL_DEBUG_FULL_TEST environment variable can be set to produce the full
69# output for debugging purposes.
70
a2d9a01a 71sub range_type {
ae937040 72 my $ord = ord shift;
a2d9a01a
KW
73
74 return $ASCII if $ord < 128;
75 return $Latin1 if $ord < 256;
76 return $Unicode;
77}
78
79sub numerically {
80 return $a <=> $b
81}
82
9f39198c 83my $list_all_tests = $ENV{PERL_DEBUG_FULL_TEST} || $DEBUG;
c09dddb1 84$| = 1 if $list_all_tests;
7109081a 85
5f01d936
KW
86# Significant time is saved by not outputting each test but grouping the
87# output into subtests
88my $okays; # Number of ok's in current subtest
89my $this_iteration; # Number of possible tests in current subtest
538e84ed 90my $count = 0; # Number of subtests = number of total tests
5f01d936 91
31f05a37
KW
92sub run_test($$$$) {
93 my ($test, $todo, $do_we_output_locale_name, $debug) = @_;
abf4d645
KW
94
95 $debug = "" unless $DEBUG;
5f01d936
KW
96 my $res = eval $test;
97
31f05a37
KW
98 if ($do_we_output_locale_name) {
99 $do_we_output_locale_name = 'setlocale(LC_CTYPE, "'
100 . POSIX::setlocale(&POSIX::LC_CTYPE)
101 . '"); ';
102 }
7109081a 103 if (!$res || $list_all_tests) {
5f01d936
KW
104 # Failed or debug; output the result
105 $count++;
31f05a37 106 ok($res, "$do_we_output_locale_name$test; $debug");
5f01d936
KW
107 } else {
108 # Just count the test as passed
109 $okays++;
110 }
111 $this_iteration++;
abf4d645
KW
112}
113
ae937040
KW
114my %has_test_by_participants; # Makes sure has tests for each range and each
115 # number of characters that fold to the same
116 # thing
117my %has_test_by_byte_count; # Makes sure has tests for each combination of
118 # n bytes folds to m bytes
119
120my %tests; # The set of tests.
121# Each key is a code point that folds to something else.
122# Each value is a list of things that the key folds to. If the 'thing' is a
123# single code point, it is that ordinal. If it is a multi-char fold, it is an
124# ordered list of the code points in that fold. Here's an example for 'S':
125# '83' => [ 115, 383 ]
126#
127# And one for a multi-char fold: \xDF
128# 223 => [
129# [ # 'ss'
130# 83,
131# 83
132# ],
133# [ # 'SS'
134# 115,
135# 115
136# ],
137# [ # LATIN SMALL LETTER LONG S
138# 383,
139# 383
140# ],
141# 7838 # LATIN_CAPITAL_LETTER_SHARP_S
142# ],
143
a3f3bf18
KW
144my %folds; # keys are code points that fold;
145 # values are each a list of code points the key folds to
ae937040
KW
146my %inverse_folds; # keys are strings of the folded-to;
147 # values are lists of characters that fold to them
148
149sub add_test($@) {
150 my ($to, @from) = @_;
151
152 # Called to cause the input to be tested by adding to %tests. @from is
153 # the list of characters that fold to the string $to. @from should be
154 # sorted so the lowest code point is first....
155 # The input is in string form; %tests uses code points, so have to
156 # convert.
157
158 my $to_chars = length $to;
159 my @test_to; # List of tests for $to
160
161 if ($to_chars == 1) {
162 @test_to = ord $to;
163 }
164 else {
165 push @test_to, [ map { ord $_ } split "", $to ];
166
167 # For multi-char folds, we also test that things that can fold to each
168 # individual character in the fold also work. If we were testing
169 # comprehensively, we would try every combination of upper and lower
170 # case in the fold, but it will have to suffice to avoid running
171 # forever to make sure that each thing that folds to these is tested
43d32dff
KW
172 # at least once. Because of complement matching ([^...]), we need to
173 # do both the folded, and the folded-from.
ae937040
KW
174 # We first look at each character in the multi-char fold, and save how
175 # many characters fold to it; and also the maximum number of such
176 # folds
177 my @folds_to_count; # 0th char in fold is index 0 ...
178 my $max_folds_to = 0;
179
180 for (my $i = 0; $i < $to_chars; $i++) {
181 my $to_char = substr($to, $i, 1);
182 if (exists $inverse_folds{$to_char}) {
183 $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}};
184 $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i];
185 }
186 else {
187 $folds_to_count[$i] = 0;
188 }
189 }
190
191 # We will need to generate as many tests as the maximum number of
192 # folds, so that each fold will have at least one test.
43d32dff
KW
193 # For example, consider character X which folds to the three character
194 # string 'xyz'. If 2 things fold to x (X and x), 4 to y (Y, Y'
195 # (Y-prime), Y'' (Y-prime-prime), and y), and 1 thing to z (itself), 4
196 # tests will be generated:
197 # xyz
198 # XYz
199 # xY'z
200 # xY''z
ae937040
KW
201 for (my $i = 0; $i < $max_folds_to; $i++) {
202 my @this_test_to; # Assemble a single test
203
204 # For each character in the multi-char fold ...
205 for (my $j = 0; $j < $to_chars; $j++) {
206 my $this_char = substr($to, $j, 1);
207
208 # Use its corresponding inverse fold, if available.
209 if ($i < $folds_to_count[$j]) {
210 push @this_test_to, ord $inverse_folds{$this_char}[$i];
211 }
212 else { # Or else itself.
213 push @this_test_to, ord $this_char;
214 }
215 }
216
217 # Add this test to the list
218 push @test_to, [ @this_test_to ];
219 }
220
221 # Here, have assembled all the tests for the multi-char fold. Sort so
222 # lowest code points are first for consistency and aesthetics in
223 # output. We know there are at least two characters in the fold, but
224 # I haven't bothered to worry about sorting on an optional third
225 # character if the first two are identical.
226 @test_to = sort { ($a->[0] == $b->[0])
227 ? $a->[1] <=> $b->[1]
228 : $a->[0] <=> $b->[0]
229 } @test_to;
230 }
231
232
233 # This test is from n bytes to m bytes. Record that so won't try to add
234 # another test that does the same.
235 use bytes;
236 my $to_bytes = length $to;
237 foreach my $from_map (@from) {
238 $has_test_by_byte_count{length $from_map}{$to_bytes} = $to;
239 }
240 no bytes;
241
242 my $ord_smallest_from = ord shift @from;
243 if (exists $tests{$ord_smallest_from}) {
244 die "There are already tests for $ord_smallest_from"
245 };
246
247 # Add in the fold tests,
248 push @{$tests{$ord_smallest_from}}, @test_to;
249
250 # Then any remaining froms in the equivalence class.
251 push @{$tests{$ord_smallest_from}}, map { ord $_ } @from;
252}
253
83521fcf 254# Get the Unicode rules and construct inverse mappings from them
a2d9a01a 255
83521fcf 256use Unicode::UCD;
a2d9a01a 257my $file="../lib/unicore/CaseFolding.txt";
ae937040 258
83521fcf
KW
259# Use the Unicode data file if we are on an ASCII platform (which its data is
260# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
261# available. This avoids being affected by potential bugs introduced by other
262# layers of Perl
263if (ord('A') == 65
264 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
265 && open my $fh, "<", $file)
266{
267 while (<$fh>) {
268 chomp;
269
270 # Lines look like (though without the initial '#')
271 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
272
273 # Get rid of comments, ignore blank or comment-only lines
274 my $line = $_ =~ s/ (?: \s* \# .* )? $ //rx;
275 next unless length $line;
276 my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line;
277
278 next if $fold_type =~ / ^ [IT] $/x; # Perl doesn't do Turkish folding
279 next if $fold_type eq 'S'; # If Unicode's tables are correct, the F
280 # should be a superset of S
281
a3f3bf18
KW
282 my $from = hex $hex_from;
283 my @to = map { hex $_ } @hex_folded;
284 @{$folds{$from}} = @to;
285 my $folded_str = pack ("U0U*", @to);
286 push @{$inverse_folds{$folded_str}}, chr $from;
83521fcf
KW
287 }
288}
289else { # Here, can't use the .txt file: read the Unicode rules file and
290 # construct inverse mappings from it
291
292 my ($invlist_ref, $invmap_ref, undef, $default)
293 = Unicode::UCD::prop_invmap('Case_Folding');
294 for my $i (0 .. @$invlist_ref - 1 - 1) {
295 next if $invmap_ref->[$i] == $default;
296
297 # Make into an array if not so already, so can treat uniformly below
298 $invmap_ref->[$i] = [ $invmap_ref->[$i] ] if ! ref $invmap_ref->[$i];
299
300 # Each subsequent element of the range requires adjustment of +1 from
301 # the previous element
302 my $adjust = -1;
303 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
304 $adjust++;
a3f3bf18
KW
305 my @to = map { $_ + $adjust } @{$invmap_ref->[$i]};
306 push @{$folds{$j}}, @to;
307 my $folded_str = pack "U0U*", @to;
4922c5b0
KW
308 #note (sprintf "%d: %04X: %s", __LINE__, $j, join " ",
309 # map { sprintf "%04X", $_ + $adjust } @{$invmap_ref->[$i]});
83521fcf
KW
310 push @{$inverse_folds{$folded_str}}, chr $j;
311 }
312 }
a2d9a01a
KW
313}
314
ae937040
KW
315# Analyze the data and generate tests to get adequate test coverage. We sort
316# things so that smallest code points are done first.
317TO:
318foreach my $to (sort { (length $a == length $b)
319 ? $a cmp $b
320 : length $a <=> length $b
4922c5b0
KW
321 } keys %inverse_folds)
322{
ae937040
KW
323
324 # Within each fold, sort so that the smallest code points are done first
325 @{$inverse_folds{$to}} = sort { $a cmp $b } @{$inverse_folds{$to}};
326 my @from = @{$inverse_folds{$to}};
327
328 # Just add it to the tests if doing complete coverage
329 if (! $skip_apparently_redundant) {
330 add_test($to, @from);
331 next TO;
a2d9a01a 332 }
ae937040
KW
333
334 my $to_chars = length $to;
335 my $to_range_type = range_type(substr($to, 0, 1));
336
337 # If this is required to be tested, do so. We check for these first, as
338 # they will take up slots of byte-to-byte combinations that we otherwise
339 # would have to have other tests to get.
340 foreach my $from_map (@from) {
341 if (exists $be_sure_to_test{$from_map}) {
342 add_test($to, @from);
343 next TO;
344 }
a7caa9e8 345 }
a2d9a01a 346
ae937040
KW
347 # If the fold contains heterogeneous range types, is suspect and should be
348 # tested.
349 if ($to_chars > 1) {
350 foreach my $char (split "", $to) {
351 if (range_type($char) != $to_range_type) {
352 add_test($to, @from);
353 next TO;
354 }
355 }
356 }
357
358 # If the mapping crosses range types, is suspect and should be tested
359 foreach my $from_map (@from) {
360 if (range_type($from_map) != $to_range_type) {
361 add_test($to, @from);
362 next TO;
a2d9a01a
KW
363 }
364 }
a2d9a01a 365
ae937040
KW
366 # Here, all components of the mapping are in the same range type. For
367 # single character folds, we test one case in each range type that has 2
368 # particpants, 3 particpants, etc.
369 if ($to_chars == 1) {
370 if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) {
371 add_test($to, @from);
372 $has_test_by_participants{scalar @from}{$to_range_type} = $to;
373 next TO;
374 }
375 }
376
377 # We also test all combinations of mappings from m to n bytes. This is
378 # because the regex optimizer cares. (Don't bother worrying about that
379 # Latin1 chars will occupy a different number of bytes under utf8, as
380 # there are plenty of other cases that catch these byte numbers.)
381 use bytes;
382 my $to_bytes = length $to;
383 foreach my $from_map (@from) {
384 if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) {
385 add_test($to, @from);
386 next TO;
a2d9a01a
KW
387 }
388 }
389}
390
391# For each range type, test additionally a character that folds to itself
ae937040
KW
392add_test(chr 0x3A, chr 0x3A);
393add_test(chr 0xF7, chr 0xF7);
394add_test(chr 0x2C7, chr 0x2C7);
a2d9a01a 395
2f7f8cb1
KW
396# To cut down on the number of tests
397my $has_tested_aa_above_latin1;
398my $has_tested_latin1_aa;
6eea66eb 399my $has_tested_ascii_aa;
17580e7a 400my $has_tested_l_above_latin1;
ceb92b9b 401my $has_tested_above_latin1_l;
6eea66eb 402my $has_tested_ascii_l;
963bd580
KW
403my $has_tested_above_latin1_d;
404my $has_tested_ascii_d;
f16e8484 405my $has_tested_non_latin1_d;
419d8974
KW
406my $has_tested_above_latin1_a;
407my $has_tested_ascii_a;
408my $has_tested_non_latin1_a;
2f7f8cb1 409
a2d9a01a
KW
410# For use by pairs() in generating combinations
411sub prefix {
412 my $p = shift;
a7caa9e8 413 map [ $p, $_ ], @_
a2d9a01a
KW
414}
415
416# Returns all ordered combinations of pairs of elements from the input array.
417# It doesn't return pairs like (a, a), (b, b). Change the slice to an array
418# to do that. This was just to have fewer tests.
a7caa9e8 419sub pairs (@) {
4922c5b0 420 #print __LINE__, ": ", join(" XXX ", map { sprintf "%04X", $_ } @_), "\n";
a7caa9e8 421 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
a2d9a01a
KW
422}
423
31f05a37
KW
424my $utf8_locale;
425
419d8974 426my @charsets = qw(d u a aa);
569f7fc5 427if($Config{d_setlocale}) {
1b4a62a4 428 my $current_locale = POSIX::setlocale( &POSIX::LC_CTYPE, "C") // "";
569f7fc5 429 if ($current_locale eq 'C') {
5f1269ab 430 use locale;
569f7fc5 431
0e18027d
KW
432 # Some implementations don't have the 128-255 range characters all
433 # mean nothing under the C locale (an example being VMS). This is
434 # legal, but since we don't know what the right answers should be,
435 # skip the locale tests in that situation.
569f7fc5
JR
436 for my $i (128 .. 255) {
437 my $char = chr($i);
31f05a37 438 goto skip_C_locale_tests if uc($char) ne $char || lc($char) ne $char;
569f7fc5
JR
439 }
440 push @charsets, 'l';
31f05a37
KW
441
442 skip_C_locale_tests:
443
444 # Look for utf8 locale. We use the pseudo-modifier 'L' to indicate
445 # that we really want /l, but change to a UTF-8 locale.
9b0711ee 446 $utf8_locale = find_utf8_ctype_locale();
31f05a37 447 push @charsets, 'L' if defined $utf8_locale;
24282c4e 448 }
24282c4e 449}
a2d9a01a
KW
450
451# Finally ready to do the tests
a2d9a01a
KW
452foreach my $test (sort { numerically } keys %tests) {
453
454 my $previous_target;
455 my $previous_pattern;
456 my @pairs = pairs(sort numerically $test, @{$tests{$test}});
457
458 # Each fold can be viewed as a closure of all the characters that
459 # participate in it. Look at each possible pairing from a closure, with the
460 # first member of the pair the target string to match against, and the
461 # second member forming the pattern. Thus each fold member gets tested as
462 # the string, and the pattern with every other member in the opposite role.
463 while (my $pair = shift @pairs) {
464 my ($target, $pattern) = @$pair;
465
466 # When testing a char that doesn't fold, we can get the same
467 # permutation twice; so skip all but the first.
468 next if $previous_target
469 && $previous_target == $target
470 && $previous_pattern == $pattern;
471 ($previous_target, $previous_pattern) = ($target, $pattern);
472
473 # Each side may be either a single char or a string. Extract each into an
474 # array (perhaps of length 1)
475 my @target, my @pattern;
476 @target = (ref $target) ? @$target : $target;
477 @pattern = (ref $pattern) ? @$pattern : $pattern;
478
6cc59f4c
KW
479 # We are testing just folds to/from a single character. If our pairs
480 # happens to generate multi/multi, skip.
481 next if @target > 1 && @pattern > 1;
482
a2d9a01a
KW
483 # Have to convert non-utf8 chars to native char set
484 @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
485 @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
486
487 # Get in hex form.
488 my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
489 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
490
491 my $target_above_latin1 = grep { $_ > 255 } @target;
492 my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
2f7f8cb1
KW
493 my $target_has_ascii = grep { $_ < 128 } @target;
494 my $pattern_has_ascii = grep { $_ < 128 } @pattern;
6eea66eb
KW
495 my $target_only_ascii = ! grep { $_ > 127 } @target;
496 my $pattern_only_ascii = ! grep { $_ > 127 } @pattern;
17580e7a 497 my $target_has_latin1 = grep { $_ < 256 } @target;
f16e8484
KW
498 my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target;
499 my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern;
17580e7a 500 my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
a2d9a01a
KW
501 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
502
503 # We don't test multi-char folding into other multi-chars. We are testing
504 # a code point that folds to or from other characters. Find the single
505 # code point for diagnostic purposes. (If both are single, choose the
506 # target string)
507 my $ord = @target == 1 ? $target[0] : $pattern[0];
7fea222d
KW
508 my $progress = sprintf "%04X: \"%s\" and /%s/",
509 $test,
a2d9a01a
KW
510 join("", @x_target),
511 join("", @x_pattern);
5473c576 512 #note $progress;
a2d9a01a
KW
513
514 # Now grind out tests, using various combinations.
a59efd0a 515 foreach my $charset (@charsets) {
31f05a37
KW
516 my $charset_mod = lc $charset;
517 my $current_locale = "";
518 if ($charset_mod eq 'l') {
519 $current_locale = POSIX::setlocale(&POSIX::LC_CTYPE,
520 ($charset eq 'L')
521 ? $utf8_locale
522 : 'C');
523 $current_locale = 'C locale' if $current_locale eq 'C';
524 }
5f01d936
KW
525 $okays = 0;
526 $this_iteration = 0;
2f7f8cb1 527
63fb01f9
KW
528 # To cut down somewhat on the enormous quantity of tests this currently
529 # runs, skip some for some of the character sets whose results aren't
530 # likely to differ from others. But run all tests on the code points
531 # that don't fold, plus one other set in each range group.
532 if (! $is_self) {
533
b0d6380c
KW
534 # /aa should only affect things with folds in the ASCII range. But, try
535 # it on one set in the other ranges just to make sure it doesn't break
536 # them.
537 if ($charset eq 'aa') {
a3f3bf18
KW
538
539 # It may be that this $pair of code points to test are both
540 # non-ascii, but if either of them actually fold to ascii, that is
541 # suspect and should be tested. So for /aa, use whether their folds
542 # are ascii or not
543 my $target_has_ascii = $target_has_ascii;
544 my $pattern_has_ascii = $pattern_has_ascii;
545 if (! $target_has_ascii) {
546 foreach my $cp (@target) {
547 if (exists $folds{$cp}
6deb7a5e 548 && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}} )
a3f3bf18
KW
549 {
550 $target_has_ascii = 1;
551 last;
552 }
553 }
554 }
555 if (! $pattern_has_ascii) {
556 foreach my $cp (@pattern) {
557 if (exists $folds{$cp}
6deb7a5e 558 && grep { utf8::native_to_unicode($_) < 128 } @{$folds{$cp}} )
a3f3bf18
KW
559 {
560 $pattern_has_ascii = 1;
561 last;
562 }
563 }
564 }
565
b0d6380c
KW
566 if (! $target_has_ascii && ! $pattern_has_ascii) {
567 if ($target_above_latin1 || $pattern_above_latin1) {
568 next if defined $has_tested_aa_above_latin1
569 && $has_tested_aa_above_latin1 != $test;
570 $has_tested_aa_above_latin1 = $test;
571 }
572 next if defined $has_tested_latin1_aa
573 && $has_tested_latin1_aa != $test;
574 $has_tested_latin1_aa = $test;
2f7f8cb1 575 }
f02497ec
KW
576 elsif ($target_only_ascii && $pattern_only_ascii) {
577
578 # And, except for one set just to make sure, skip tests
579 # where both elements in the pair are ASCII. If one works for
580 # aa, the others are likely too. This skips tests where the
581 # fold is from non-ASCII to ASCII, but this part of the test
582 # is just about the ASCII components.
583 next if defined $has_tested_ascii_l
584 && $has_tested_ascii_l != $test;
585 $has_tested_ascii_l = $test;
586 }
2f7f8cb1 587 }
b0d6380c 588 elsif ($charset eq 'l') {
6ed220eb
KW
589
590 # For l, don't need to test beyond one set those things that are
7b4853d1 591 # all above latin1, because unlikely to have different successes
a3f3bf18
KW
592 # than /u. But, for the same reason as described in the /aa above,
593 # it is suspect and should be tested, if either of the folds are to
594 # latin1.
595 my $target_has_latin1 = $target_has_latin1;
596 my $pattern_has_latin1 = $pattern_has_latin1;
597 if (! $target_has_latin1) {
598 foreach my $cp (@target) {
599 if (exists $folds{$cp}
600 && grep { $_ < 256 } @{$folds{$cp}} )
601 {
602 $target_has_latin1 = 1;
603 last;
604 }
605 }
606 }
607 if (! $pattern_has_latin1) {
608 foreach my $cp (@pattern) {
609 if (exists $folds{$cp}
610 && grep { $_ < 256 } @{$folds{$cp}} )
611 {
612 $pattern_has_latin1 = 1;
613 last;
614 }
615 }
616 }
b0d6380c 617 if (! $target_has_latin1 && ! $pattern_has_latin1) {
ceb92b9b
KW
618 next if defined $has_tested_above_latin1_l
619 && $has_tested_above_latin1_l != $test;
620 $has_tested_above_latin1_l = $test;
b0d6380c 621 }
6eea66eb
KW
622 elsif ($target_only_ascii && $pattern_only_ascii) {
623
624 # And, except for one set just to make sure, skip tests
f02497ec
KW
625 # where both elements in the pair are ASCII. This is
626 # essentially the same reasoning as above for /aa.
6eea66eb
KW
627 next if defined $has_tested_ascii_l
628 && $has_tested_ascii_l != $test;
629 $has_tested_ascii_l = $test;
630 }
17580e7a 631 }
f16e8484
KW
632 elsif ($charset eq 'd') {
633 # Similarly for d. Beyond one test (besides self) each, we don't
634 # test pairs that are both ascii; or both above latin1, or are
635 # combinations of ascii and above latin1.
636 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
637 if ($target_has_ascii && $pattern_has_ascii) {
963bd580
KW
638 next if defined $has_tested_ascii_d
639 && $has_tested_ascii_d != $test;
f16e8484
KW
640 $has_tested_ascii_d = $test
641 }
642 elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
643 next if defined $has_tested_above_latin1_d
644 && $has_tested_above_latin1_d != $test;
645 $has_tested_above_latin1_d = $test;
646 }
647 else {
648 next if defined $has_tested_non_latin1_d
649 && $has_tested_non_latin1_d != $test;
650 $has_tested_non_latin1_d = $test;
651 }
963bd580
KW
652 }
653 }
419d8974
KW
654 elsif ($charset eq 'a') {
655 # Similarly for a. This should match identically to /u, so wasn't
656 # tested at all until a bug was found that was thereby missed.
657 # As a compromise, beyond one test (besides self) each, we don't
658 # test pairs that are both ascii; or both above latin1, or are
659 # combinations of ascii and above latin1.
660 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
661 if ($target_has_ascii && $pattern_has_ascii) {
662 next if defined $has_tested_ascii_a
663 && $has_tested_ascii_a != $test;
664 $has_tested_ascii_a = $test
665 }
666 elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
667 next if defined $has_tested_above_latin1_a
668 && $has_tested_above_latin1_a != $test;
669 $has_tested_above_latin1_a = $test;
670 }
671 else {
672 next if defined $has_tested_non_latin1_a
673 && $has_tested_non_latin1_a != $test;
674 $has_tested_non_latin1_a = $test;
675 }
676 }
677 }
17580e7a 678 }
2f7f8cb1 679
a2d9a01a
KW
680 foreach my $utf8_target (0, 1) { # Both utf8 and not, for
681 # code points < 256
682 my $upgrade_target = "";
683
684 # These must already be in utf8 because the string to match has
685 # something above latin1. So impossible to test if to not to be in
686 # utf8; and otherwise, no upgrade is needed.
687 next if $target_above_latin1 && ! $utf8_target;
d08723ac 688 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
a2d9a01a 689
d08723ac
KW
690 foreach my $utf8_pattern (0, 1) {
691 next if $pattern_above_latin1 && ! $utf8_pattern;
17580e7a
KW
692
693 # Our testing of 'l' uses the POSIX locale, which is ASCII-only
31f05a37 694 my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || $charset eq 'L' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
a2d9a01a 695 my $upgrade_pattern = "";
d08723ac 696 $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
a2d9a01a
KW
697
698 my $lhs = join "", @x_target;
aa3ca102 699 my $lhs_str = eval qq{"$lhs"}; fail($@) if $@;
a2d9a01a 700 my @rhs = @x_pattern;
371a505e 701 my $rhs = join "", @rhs;
2f7f8cb1 702 my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
17580e7a
KW
703 || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
704 || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
371a505e
KW
705
706 # Do simple tests of referencing capture buffers, named and
707 # numbered.
708 my $op = '=~';
709 $op = '!~' if $should_fail;
d2025f57 710
6d7cd591 711 my $todo = 0; # No longer any todo's
31f05a37
KW
712 my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset_mod:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
713 run_test($eval, $todo, ($charset_mod eq 'l'), "");
abf4d645 714
31f05a37
KW
715 $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset_mod:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
716 run_test($eval, $todo, ($charset_mod eq 'l'), "");
abf4d645 717
371a505e 718 if ($lhs ne $rhs) {
31f05a37
KW
719 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset_mod:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
720 run_test($eval, "", ($charset_mod eq 'l'), "");
abf4d645 721
31f05a37
KW
722 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset_mod:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
723 run_test($eval, "", ($charset_mod eq 'l'), "");
371a505e 724 }
371a505e 725
fbebf34e 726 # See if works on what could be a simple trie.
92dd905c
KW
727 my $alternate;
728 {
729 # Keep the alternate | branch the same length as the tested one so
730 # that it's length doesn't influence things
731 my $evaled = eval "\"$rhs\""; # Convert e.g. \x{foo} into its
732 # chr equivalent
733 use bytes;
734 $alternate = 'q' x length $evaled;
735 }
31f05a37
KW
736 $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|$alternate/i$charset_mod;$upgrade_target$upgrade_pattern \$c $op \$p";
737 run_test($eval, "", ($charset_mod eq 'l'), "");
67fc5dca 738
c74f6de9
KW
739 # Check that works when the folded character follows something that
740 # is quantified. This test knows the regex code internals to the
741 # extent that it knows this is a potential problem, and that there
742 # are three different types of quantifiers generated: 1) The thing
743 # being quantified matches a single character; 2) it matches more
744 # than one character, but is fixed width; 3) it can match a variable
745 # number of characters. (It doesn't know that case 3 shouldn't
746 # matter, since it doesn't do anything special for the character
747 # following the quantifier; nor that some of the different
748 # quantifiers execute the same underlying code, as these tests are
749 # quick, and this insulates these tests from changes in the
750 # implementation.)
751 for my $quantifier ('?', '??', '*', '*?', '+', '+?', '{1,2}', '{1,2}?') {
31f05a37
KW
752 $eval = "my \$c = \"_$lhs\"; my \$p = qr/(?$charset_mod:.$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
753 run_test($eval, "", ($charset_mod eq 'l'), "");
754 $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset_mod:(?:..)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
755 run_test($eval, "", ($charset_mod eq 'l'), "");
756 $eval = "my \$c = \"__$lhs\"; my \$p = qr/(?$charset_mod:(?:.|\\R)$quantifier$rhs)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
757 run_test($eval, "", ($charset_mod eq 'l'), "");
c74f6de9
KW
758 }
759
1ef17b72 760 foreach my $bracketed (0, 1) { # Put rhs in [...], or not
eee0f761
KW
761 next if $bracketed && @pattern != 1; # bracketed makes these
762 # or's instead of a sequence
3345a479 763 foreach my $optimize_bracketed (0, 1) {
91638220
KW
764 next if $optimize_bracketed && ! $bracketed;
765 foreach my $inverted (0,1) {
766 next if $inverted && ! $bracketed; # inversion only valid
767 # in [^...]
768 next if $inverted && @target != 1; # [perl #89750] multi-char
769 # not valid in [^...]
770
771 # In some cases, add an extra character that doesn't fold, and
772 # looks ok in the output.
773 my $extra_char = "_";
774 foreach my $prepend ("", $extra_char) {
775 foreach my $append ("", $extra_char) {
776
777 # Assemble the rhs. Put each character in a separate
778 # bracketed if using charclasses. This creates a stress on
779 # the code to span a match across multiple elements
780 my $rhs = "";
781 foreach my $rhs_char (@rhs) {
782 $rhs .= '[' if $bracketed;
783 $rhs .= '^' if $inverted;
784 $rhs .= $rhs_char;
785
786 # Add a character to the class, so class doesn't get
787 # optimized out, unless we are testing that optimization
788 $rhs .= '_' if $optimize_bracketed;
789 $rhs .= ']' if $bracketed;
790 }
85882dd7 791
91638220
KW
792 # Add one of: no capturing parens
793 # a single set
794 # a nested set
795 # Use quantifiers and extra variable width matches inside
796 # them to keep some optimizations from happening
797 foreach my $parend (0, 1, 2) {
798 my $interior = (! $parend)
799 ? $rhs
800 : ($parend == 1)
801 ? "(${rhs},?)"
802 : "((${rhs})+,?)";
803 foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
804
805 # Perhaps should be TODOs, as are unimplemented, but
806 # maybe will never be implemented
807 next if @pattern != 1 && $quantifier;
808
809 # A ? or * quantifier normally causes the thing to be
810 # able to match a null string
811 my $quantifier_can_match_null = $quantifier eq '?'
812 || $quantifier eq '*';
813
814 # But since we only quantify the last character in a
815 # multiple fold, the other characters will have width,
816 # except if we are quantifying the whole rhs
817 my $can_match_null = $quantifier_can_match_null
818 && (@rhs == 1 || $parend);
819
820 foreach my $l_anchor ("", '^') { # '\A' didn't change
821 # result)
822 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't
823 # change result)
824 # The folded part can match the null string if it
825 # isn't required to have width, and there's not
826 # something on one or both sides that force it to.
827 my $both_sides = ($l_anchor && $r_anchor)
828 || ($l_anchor && $append)
829 || ($r_anchor && $prepend)
830 || ($prepend && $append);
831 my $must_match = ! $can_match_null || $both_sides;
832 # for performance, but doing this missed many failures
833 #next unless $must_match;
31f05a37 834 my $quantified = "(?$charset_mod:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
91638220
KW
835 my $op;
836 if ($must_match && $should_fail) {
837 $op = 0;
838 } else {
839 $op = 1;
840 }
841 $op = ! $op if $must_match && $inverted;
842
843 if ($inverted && @target > 1) {
844 # When doing an inverted match against a
845 # multi-char target, and there is not something on
846 # the left to anchor the match, if it shouldn't
847 # succeed, skip, as what will happen (when working
848 # correctly) is that it will match the first
849 # position correctly, and then be inverted to not
850 # match; then it will go to the second position
851 # where it won't match, but get inverted to match,
852 # and hence succeeding.
853 next if ! ($l_anchor || $prepend) && ! $op;
854
855 # Can't ever match for latin1 code points non-uni
856 # semantics that have a inverted multi-char fold
857 # when there is something on both sides and the
858 # quantifier isn't such as to span the required
859 # width, which is 2 or 3.
860 $op = 0 if $ord < 255
861 && ! $uni_semantics
862 && $both_sides
863 && ( ! $quantifier || $quantifier eq '?')
864 && $parend < 2;
865
866 # Similarly can't ever match when inverting a
867 # multi-char fold for /aa and the quantifier
868 # isn't sufficient to allow it to span to both
869 # sides.
870 $op = 0 if $target_has_ascii
871 && $charset eq 'aa'
872 && $both_sides
873 && ( ! $quantifier || $quantifier eq '?')
874 && $parend < 2;
875
876 # Or for /l
877 $op = 0 if $target_has_latin1 && $charset eq 'l'
878 && $both_sides
879 && ( ! $quantifier || $quantifier eq '?')
880 && $parend < 2;
881 }
882
883
31f05a37
KW
884 my $desc = "";
885 if ($charset_mod eq 'l') {
886 $desc .= 'setlocale(LC_CTYPE, "'
887 . POSIX::setlocale(&POSIX::LC_CTYPE)
888 . '"); '
889 }
890 $desc .= "my \$c = \"$prepend$lhs$append\"; "
91638220
KW
891 . "my \$p = qr/$quantified/i;"
892 . "$upgrade_target$upgrade_pattern "
893 . "\$c " . ($op ? "=~" : "!~") . " \$p; ";
894 if ($DEBUG) {
895 $desc .= (
896 "; uni_semantics=$uni_semantics, "
897 . "should_fail=$should_fail, "
898 . "bracketed=$bracketed, "
899 . "prepend=$prepend, "
900 . "append=$append, "
901 . "parend=$parend, "
902 . "quantifier=$quantifier, "
903 . "l_anchor=$l_anchor, "
904 . "r_anchor=$r_anchor; "
905 . "pattern_above_latin1=$pattern_above_latin1; "
906 . "utf8_pattern=$utf8_pattern"
907 );
908 }
909
910 my $c = "$prepend$lhs_str$append";
911 my $p = qr/$quantified/i;
912 utf8::upgrade($c) if length($upgrade_target);
913 utf8::upgrade($p) if length($upgrade_pattern);
914 my $res = $op ? ($c =~ $p): ($c !~ $p);
915
916 if (!$res || $list_all_tests) {
917 # Failed or debug; output the result
918 $count++;
919 ok($res, "test $count - $desc");
920 } else {
921 # Just count the test as passed
922 $okays++;
923 }
924 $this_iteration++;
67fc5dca 925 }
a2d9a01a
KW
926 }
927 }
928 }
929 }
930 }
931 }
932 }
933 }
934 }
935 }
7109081a 936 unless($list_all_tests) {
5f01d936
KW
937 $count++;
938 is $okays, $this_iteration, "$okays subtests ok for"
31f05a37
KW
939 . " /$charset_mod"
940 . (($charset_mod eq 'l') ? " ($current_locale)" : "")
941 . ', target="' . join("", @x_target) . '",'
5f01d936
KW
942 . ' pat="' . join("", @x_pattern) . '"';
943 }
a2d9a01a
KW
944 }
945 }
946}
947
abf4d645 948plan($count);
a2d9a01a
KW
949
9501