This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
fold_grind.t: Revamp test case coverage analysis
[perl5.git] / t / re / fold_grind.t
CommitLineData
371a505e 1# Grind out a lot of combinatoric tests for folding.
a2d9a01a 2
a2d9a01a
KW
3binmode STDOUT, ":utf8";
4
5BEGIN {
6 chdir 't' if -d 't';
7 @INC = '../lib';
8 require './test.pl';
a59efd0a 9 skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
a2d9a01a
KW
10}
11
4fd7f5ff
NC
12use charnames ":full";
13
29d01a3e 14my $DEBUG = 0; # Outputs extra information for debugging this .t
abf4d645 15
a2d9a01a
KW
16use strict;
17use warnings;
d08723ac 18use Encode;
a59efd0a 19use POSIX;
a2d9a01a 20
ae937040
KW
21# Special-cased characters in the .c's that we want to make sure get tested.
22my %be_sure_to_test = (
23 "\xDF" => 1, # LATIN_SMALL_LETTER_SHARP_S
24 "\x{1E9E}" => 1, # LATIN_CAPITAL_LETTER_SHARP_S
25 "\x{390}" => 1, # GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
26 "\x{3B0}" => 1, # GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
27 "\x{1FD3}" => 1, # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
28 "\x{1FE3}" => 1, # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
29 );
30
31
a2d9a01a
KW
32# Tests both unicode and not, so make sure not implicitly testing unicode
33no feature 'unicode_strings';
34
35# Case-insensitive matching is a large and complicated issue. Perl does not
36# implement it fully, properly. For example, it doesn't include normalization
37# as part of the equation. To test every conceivable combination is clearly
38# impossible; these tests are mostly drawn from visual inspection of the code
39# and experience, trying to exercise all areas.
40
41# There are three basic ranges of characters that Perl may treat differently:
42# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
43# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants
44# are all controls that fold to themselves.
45my $ASCII = 1;
46
47# 2) Other characters that fit into a byte but are different in utf8 than not;
48# here referred to, taking some liberties, as Latin1.
49my $Latin1 = 2;
50
51# 3) Characters that won't fit in a byte; here referred to as Unicode
52my $Unicode = 3;
53
54# Within these basic groups are equivalence classes that testing any character
55# in is likely to lead to the same results as any other character. This is
56# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
57# set.
58my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
59
67fc5dca
DL
60# Additionally parts of this test run a lot of subtests, outputting the
61# resulting TAP can be expensive so the tests are summarised internally. The
62# PERL_DEBUG_FULL_TEST environment variable can be set to produce the full
63# output for debugging purposes.
64
a2d9a01a 65sub range_type {
ae937040 66 my $ord = ord shift;
a2d9a01a
KW
67
68 return $ASCII if $ord < 128;
69 return $Latin1 if $ord < 256;
70 return $Unicode;
71}
72
73sub numerically {
74 return $a <=> $b
75}
76
5f01d936
KW
77# Significant time is saved by not outputting each test but grouping the
78# output into subtests
79my $okays; # Number of ok's in current subtest
80my $this_iteration; # Number of possible tests in current subtest
81my $count=0; # Number of subtests = number of total tests
82
83sub run_test($$$) {
84 my ($test, $todo, $debug) = @_;
abf4d645
KW
85
86 $debug = "" unless $DEBUG;
5f01d936
KW
87 my $res = eval $test;
88
89 if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) {
90 # Failed or debug; output the result
91 $count++;
92 ok($res, "$test; $debug");
93 } else {
94 # Just count the test as passed
95 $okays++;
96 }
97 $this_iteration++;
abf4d645
KW
98}
99
ae937040
KW
100my %has_test_by_participants; # Makes sure has tests for each range and each
101 # number of characters that fold to the same
102 # thing
103my %has_test_by_byte_count; # Makes sure has tests for each combination of
104 # n bytes folds to m bytes
105
106my %tests; # The set of tests.
107# Each key is a code point that folds to something else.
108# Each value is a list of things that the key folds to. If the 'thing' is a
109# single code point, it is that ordinal. If it is a multi-char fold, it is an
110# ordered list of the code points in that fold. Here's an example for 'S':
111# '83' => [ 115, 383 ]
112#
113# And one for a multi-char fold: \xDF
114# 223 => [
115# [ # 'ss'
116# 83,
117# 83
118# ],
119# [ # 'SS'
120# 115,
121# 115
122# ],
123# [ # LATIN SMALL LETTER LONG S
124# 383,
125# 383
126# ],
127# 7838 # LATIN_CAPITAL_LETTER_SHARP_S
128# ],
129
130my %inverse_folds; # keys are strings of the folded-to;
131 # values are lists of characters that fold to them
132
133sub add_test($@) {
134 my ($to, @from) = @_;
135
136 # Called to cause the input to be tested by adding to %tests. @from is
137 # the list of characters that fold to the string $to. @from should be
138 # sorted so the lowest code point is first....
139 # The input is in string form; %tests uses code points, so have to
140 # convert.
141
142 my $to_chars = length $to;
143 my @test_to; # List of tests for $to
144
145 if ($to_chars == 1) {
146 @test_to = ord $to;
147 }
148 else {
149 push @test_to, [ map { ord $_ } split "", $to ];
150
151 # For multi-char folds, we also test that things that can fold to each
152 # individual character in the fold also work. If we were testing
153 # comprehensively, we would try every combination of upper and lower
154 # case in the fold, but it will have to suffice to avoid running
155 # forever to make sure that each thing that folds to these is tested
156 # at least once. Because of complement matching, we need to do both
157 # the folded, and the folded-from.
158 # We first look at each character in the multi-char fold, and save how
159 # many characters fold to it; and also the maximum number of such
160 # folds
161 my @folds_to_count; # 0th char in fold is index 0 ...
162 my $max_folds_to = 0;
163
164 for (my $i = 0; $i < $to_chars; $i++) {
165 my $to_char = substr($to, $i, 1);
166 if (exists $inverse_folds{$to_char}) {
167 $folds_to_count[$i] = scalar @{$inverse_folds{$to_char}};
168 $max_folds_to = $folds_to_count[$i] if $max_folds_to < $folds_to_count[$i];
169 }
170 else {
171 $folds_to_count[$i] = 0;
172 }
173 }
174
175 # We will need to generate as many tests as the maximum number of
176 # folds, so that each fold will have at least one test.
177 for (my $i = 0; $i < $max_folds_to; $i++) {
178 my @this_test_to; # Assemble a single test
179
180 # For each character in the multi-char fold ...
181 for (my $j = 0; $j < $to_chars; $j++) {
182 my $this_char = substr($to, $j, 1);
183
184 # Use its corresponding inverse fold, if available.
185 if ($i < $folds_to_count[$j]) {
186 push @this_test_to, ord $inverse_folds{$this_char}[$i];
187 }
188 else { # Or else itself.
189 push @this_test_to, ord $this_char;
190 }
191 }
192
193 # Add this test to the list
194 push @test_to, [ @this_test_to ];
195 }
196
197 # Here, have assembled all the tests for the multi-char fold. Sort so
198 # lowest code points are first for consistency and aesthetics in
199 # output. We know there are at least two characters in the fold, but
200 # I haven't bothered to worry about sorting on an optional third
201 # character if the first two are identical.
202 @test_to = sort { ($a->[0] == $b->[0])
203 ? $a->[1] <=> $b->[1]
204 : $a->[0] <=> $b->[0]
205 } @test_to;
206 }
207
208
209 # This test is from n bytes to m bytes. Record that so won't try to add
210 # another test that does the same.
211 use bytes;
212 my $to_bytes = length $to;
213 foreach my $from_map (@from) {
214 $has_test_by_byte_count{length $from_map}{$to_bytes} = $to;
215 }
216 no bytes;
217
218 my $ord_smallest_from = ord shift @from;
219 if (exists $tests{$ord_smallest_from}) {
220 die "There are already tests for $ord_smallest_from"
221 };
222
223 # Add in the fold tests,
224 push @{$tests{$ord_smallest_from}}, @test_to;
225
226 # Then any remaining froms in the equivalence class.
227 push @{$tests{$ord_smallest_from}}, map { ord $_ } @from;
228}
229
230# Read the Unicode rules file and construct inverse mappings from it
a2d9a01a 231
a2d9a01a
KW
232my $file="../lib/unicore/CaseFolding.txt";
233open my $fh, "<", $file or die "Failed to read '$file': $!";
ae937040 234
a2d9a01a
KW
235while (<$fh>) {
236 chomp;
237
238 # Lines look like (though without the initial '#')
239 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
240
241 my ($line, $comment) = split / \s+ \# \s+ /x, $_;
ae937040
KW
242 next if $line eq "" || $line =~ /^#/;
243 my ($hex_from, $fold_type, @hex_folded) = split /[\s;]+/, $line;
a2d9a01a 244
ae937040
KW
245 next if $fold_type eq 'T'; # Perl doesn't do Turkish folding
246 next if $fold_type eq 'S'; # If Unicode's tables are correct, the F
247 # should be a superset of S
a2d9a01a 248
ae937040
KW
249 my $folded_str = pack ("U0U*", map { hex $_ } @hex_folded);
250 push @{$inverse_folds{$folded_str}}, chr hex $hex_from;
a2d9a01a
KW
251}
252
ae937040
KW
253# Analyze the data and generate tests to get adequate test coverage. We sort
254# things so that smallest code points are done first.
255TO:
256foreach my $to (sort { (length $a == length $b)
257 ? $a cmp $b
258 : length $a <=> length $b
259 } keys %inverse_folds) {
260
261 # Within each fold, sort so that the smallest code points are done first
262 @{$inverse_folds{$to}} = sort { $a cmp $b } @{$inverse_folds{$to}};
263 my @from = @{$inverse_folds{$to}};
264
265 # Just add it to the tests if doing complete coverage
266 if (! $skip_apparently_redundant) {
267 add_test($to, @from);
268 next TO;
a2d9a01a 269 }
ae937040
KW
270
271 my $to_chars = length $to;
272 my $to_range_type = range_type(substr($to, 0, 1));
273
274 # If this is required to be tested, do so. We check for these first, as
275 # they will take up slots of byte-to-byte combinations that we otherwise
276 # would have to have other tests to get.
277 foreach my $from_map (@from) {
278 if (exists $be_sure_to_test{$from_map}) {
279 add_test($to, @from);
280 next TO;
281 }
a7caa9e8 282 }
a2d9a01a 283
ae937040
KW
284 # If the fold contains heterogeneous range types, is suspect and should be
285 # tested.
286 if ($to_chars > 1) {
287 foreach my $char (split "", $to) {
288 if (range_type($char) != $to_range_type) {
289 add_test($to, @from);
290 next TO;
291 }
292 }
293 }
294
295 # If the mapping crosses range types, is suspect and should be tested
296 foreach my $from_map (@from) {
297 if (range_type($from_map) != $to_range_type) {
298 add_test($to, @from);
299 next TO;
a2d9a01a
KW
300 }
301 }
a2d9a01a 302
ae937040
KW
303 # Here, all components of the mapping are in the same range type. For
304 # single character folds, we test one case in each range type that has 2
305 # particpants, 3 particpants, etc.
306 if ($to_chars == 1) {
307 if (! exists $has_test_by_participants{scalar @from}{$to_range_type}) {
308 add_test($to, @from);
309 $has_test_by_participants{scalar @from}{$to_range_type} = $to;
310 next TO;
311 }
312 }
313
314 # We also test all combinations of mappings from m to n bytes. This is
315 # because the regex optimizer cares. (Don't bother worrying about that
316 # Latin1 chars will occupy a different number of bytes under utf8, as
317 # there are plenty of other cases that catch these byte numbers.)
318 use bytes;
319 my $to_bytes = length $to;
320 foreach my $from_map (@from) {
321 if (! exists $has_test_by_byte_count{length $from_map}{$to_bytes}) {
322 add_test($to, @from);
323 next TO;
a2d9a01a
KW
324 }
325 }
326}
327
328# For each range type, test additionally a character that folds to itself
ae937040
KW
329add_test(chr 0x3A, chr 0x3A);
330add_test(chr 0xF7, chr 0xF7);
331add_test(chr 0x2C7, chr 0x2C7);
a2d9a01a 332
2f7f8cb1
KW
333# To cut down on the number of tests
334my $has_tested_aa_above_latin1;
335my $has_tested_latin1_aa;
6eea66eb 336my $has_tested_ascii_aa;
17580e7a 337my $has_tested_l_above_latin1;
ceb92b9b 338my $has_tested_above_latin1_l;
6eea66eb 339my $has_tested_ascii_l;
963bd580
KW
340my $has_tested_above_latin1_d;
341my $has_tested_ascii_d;
f16e8484 342my $has_tested_non_latin1_d;
419d8974
KW
343my $has_tested_above_latin1_a;
344my $has_tested_ascii_a;
345my $has_tested_non_latin1_a;
2f7f8cb1 346
a2d9a01a
KW
347# For use by pairs() in generating combinations
348sub prefix {
349 my $p = shift;
a7caa9e8 350 map [ $p, $_ ], @_
a2d9a01a
KW
351}
352
353# Returns all ordered combinations of pairs of elements from the input array.
354# It doesn't return pairs like (a, a), (b, b). Change the slice to an array
355# to do that. This was just to have fewer tests.
a7caa9e8 356sub pairs (@) {
a2d9a01a 357 #print __LINE__, ": ", join(" XXX ", @_), "\n";
a7caa9e8 358 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
a2d9a01a
KW
359}
360
419d8974 361my @charsets = qw(d u a aa);
a59efd0a 362my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
24282c4e
KW
363if ($current_locale eq 'C') {
364 use locale;
365
6cc59f4c 366 # Some locale implementations don't have the range 128-255 characters all
24282c4e
KW
367 # mean nothing. Skip the locale tests in that situation.
368 for my $i (128 .. 255) {
369 my $char = chr($i);
370 goto bad_locale if uc($char) ne $char || lc($char) ne $char;
371 }
372 push @charsets, 'l';
373bad_locale:
374}
a2d9a01a
KW
375
376# Finally ready to do the tests
a2d9a01a
KW
377foreach my $test (sort { numerically } keys %tests) {
378
379 my $previous_target;
380 my $previous_pattern;
381 my @pairs = pairs(sort numerically $test, @{$tests{$test}});
382
383 # Each fold can be viewed as a closure of all the characters that
384 # participate in it. Look at each possible pairing from a closure, with the
385 # first member of the pair the target string to match against, and the
386 # second member forming the pattern. Thus each fold member gets tested as
387 # the string, and the pattern with every other member in the opposite role.
388 while (my $pair = shift @pairs) {
389 my ($target, $pattern) = @$pair;
390
391 # When testing a char that doesn't fold, we can get the same
392 # permutation twice; so skip all but the first.
393 next if $previous_target
394 && $previous_target == $target
395 && $previous_pattern == $pattern;
396 ($previous_target, $previous_pattern) = ($target, $pattern);
397
398 # Each side may be either a single char or a string. Extract each into an
399 # array (perhaps of length 1)
400 my @target, my @pattern;
401 @target = (ref $target) ? @$target : $target;
402 @pattern = (ref $pattern) ? @$pattern : $pattern;
403
6cc59f4c
KW
404 # We are testing just folds to/from a single character. If our pairs
405 # happens to generate multi/multi, skip.
406 next if @target > 1 && @pattern > 1;
407
a2d9a01a
KW
408 # Have to convert non-utf8 chars to native char set
409 @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
410 @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
411
412 # Get in hex form.
413 my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
414 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
415
416 my $target_above_latin1 = grep { $_ > 255 } @target;
417 my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
2f7f8cb1
KW
418 my $target_has_ascii = grep { $_ < 128 } @target;
419 my $pattern_has_ascii = grep { $_ < 128 } @pattern;
6eea66eb
KW
420 my $target_only_ascii = ! grep { $_ > 127 } @target;
421 my $pattern_only_ascii = ! grep { $_ > 127 } @pattern;
17580e7a 422 my $target_has_latin1 = grep { $_ < 256 } @target;
f16e8484
KW
423 my $target_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @target;
424 my $pattern_has_upper_latin1 = grep { $_ < 256 && $_ > 127 } @pattern;
17580e7a 425 my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
a2d9a01a
KW
426 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
427
428 # We don't test multi-char folding into other multi-chars. We are testing
429 # a code point that folds to or from other characters. Find the single
430 # code point for diagnostic purposes. (If both are single, choose the
431 # target string)
432 my $ord = @target == 1 ? $target[0] : $pattern[0];
7fea222d
KW
433 my $progress = sprintf "%04X: \"%s\" and /%s/",
434 $test,
a2d9a01a
KW
435 join("", @x_target),
436 join("", @x_pattern);
5473c576 437 #note $progress;
a2d9a01a
KW
438
439 # Now grind out tests, using various combinations.
a59efd0a 440 foreach my $charset (@charsets) {
5f01d936
KW
441 $okays = 0;
442 $this_iteration = 0;
2f7f8cb1 443
63fb01f9
KW
444 # To cut down somewhat on the enormous quantity of tests this currently
445 # runs, skip some for some of the character sets whose results aren't
446 # likely to differ from others. But run all tests on the code points
447 # that don't fold, plus one other set in each range group.
448 if (! $is_self) {
449
b0d6380c
KW
450 # /aa should only affect things with folds in the ASCII range. But, try
451 # it on one set in the other ranges just to make sure it doesn't break
452 # them.
453 if ($charset eq 'aa') {
454 if (! $target_has_ascii && ! $pattern_has_ascii) {
455 if ($target_above_latin1 || $pattern_above_latin1) {
456 next if defined $has_tested_aa_above_latin1
457 && $has_tested_aa_above_latin1 != $test;
458 $has_tested_aa_above_latin1 = $test;
459 }
460 next if defined $has_tested_latin1_aa
461 && $has_tested_latin1_aa != $test;
462 $has_tested_latin1_aa = $test;
2f7f8cb1 463 }
f02497ec
KW
464 elsif ($target_only_ascii && $pattern_only_ascii) {
465
466 # And, except for one set just to make sure, skip tests
467 # where both elements in the pair are ASCII. If one works for
468 # aa, the others are likely too. This skips tests where the
469 # fold is from non-ASCII to ASCII, but this part of the test
470 # is just about the ASCII components.
471 next if defined $has_tested_ascii_l
472 && $has_tested_ascii_l != $test;
473 $has_tested_ascii_l = $test;
474 }
2f7f8cb1 475 }
b0d6380c 476 elsif ($charset eq 'l') {
6ed220eb
KW
477
478 # For l, don't need to test beyond one set those things that are
7b4853d1
KW
479 # all above latin1, because unlikely to have different successes
480 # than /u
b0d6380c 481 if (! $target_has_latin1 && ! $pattern_has_latin1) {
ceb92b9b
KW
482 next if defined $has_tested_above_latin1_l
483 && $has_tested_above_latin1_l != $test;
484 $has_tested_above_latin1_l = $test;
b0d6380c 485 }
6eea66eb
KW
486 elsif ($target_only_ascii && $pattern_only_ascii) {
487
488 # And, except for one set just to make sure, skip tests
f02497ec
KW
489 # where both elements in the pair are ASCII. This is
490 # essentially the same reasoning as above for /aa.
6eea66eb
KW
491 next if defined $has_tested_ascii_l
492 && $has_tested_ascii_l != $test;
493 $has_tested_ascii_l = $test;
494 }
17580e7a 495 }
f16e8484
KW
496 elsif ($charset eq 'd') {
497 # Similarly for d. Beyond one test (besides self) each, we don't
498 # test pairs that are both ascii; or both above latin1, or are
499 # combinations of ascii and above latin1.
500 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
501 if ($target_has_ascii && $pattern_has_ascii) {
963bd580
KW
502 next if defined $has_tested_ascii_d
503 && $has_tested_ascii_d != $test;
f16e8484
KW
504 $has_tested_ascii_d = $test
505 }
506 elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
507 next if defined $has_tested_above_latin1_d
508 && $has_tested_above_latin1_d != $test;
509 $has_tested_above_latin1_d = $test;
510 }
511 else {
512 next if defined $has_tested_non_latin1_d
513 && $has_tested_non_latin1_d != $test;
514 $has_tested_non_latin1_d = $test;
515 }
963bd580
KW
516 }
517 }
419d8974
KW
518 elsif ($charset eq 'a') {
519 # Similarly for a. This should match identically to /u, so wasn't
520 # tested at all until a bug was found that was thereby missed.
521 # As a compromise, beyond one test (besides self) each, we don't
522 # test pairs that are both ascii; or both above latin1, or are
523 # combinations of ascii and above latin1.
524 if (! $target_has_upper_latin1 && ! $pattern_has_upper_latin1) {
525 if ($target_has_ascii && $pattern_has_ascii) {
526 next if defined $has_tested_ascii_a
527 && $has_tested_ascii_a != $test;
528 $has_tested_ascii_a = $test
529 }
530 elsif (! $target_has_latin1 && ! $pattern_has_latin1) {
531 next if defined $has_tested_above_latin1_a
532 && $has_tested_above_latin1_a != $test;
533 $has_tested_above_latin1_a = $test;
534 }
535 else {
536 next if defined $has_tested_non_latin1_a
537 && $has_tested_non_latin1_a != $test;
538 $has_tested_non_latin1_a = $test;
539 }
540 }
541 }
17580e7a 542 }
2f7f8cb1 543
a2d9a01a
KW
544 foreach my $utf8_target (0, 1) { # Both utf8 and not, for
545 # code points < 256
546 my $upgrade_target = "";
547
548 # These must already be in utf8 because the string to match has
549 # something above latin1. So impossible to test if to not to be in
550 # utf8; and otherwise, no upgrade is needed.
551 next if $target_above_latin1 && ! $utf8_target;
d08723ac 552 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
a2d9a01a 553
d08723ac
KW
554 foreach my $utf8_pattern (0, 1) {
555 next if $pattern_above_latin1 && ! $utf8_pattern;
17580e7a
KW
556
557 # Our testing of 'l' uses the POSIX locale, which is ASCII-only
558 my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
a2d9a01a 559 my $upgrade_pattern = "";
d08723ac 560 $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
a2d9a01a
KW
561
562 my $lhs = join "", @x_target;
aa3ca102 563 my $lhs_str = eval qq{"$lhs"}; fail($@) if $@;
a2d9a01a 564 my @rhs = @x_pattern;
371a505e 565 my $rhs = join "", @rhs;
2f7f8cb1 566 my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
17580e7a
KW
567 || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
568 || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
371a505e
KW
569
570 # Do simple tests of referencing capture buffers, named and
571 # numbered.
572 my $op = '=~';
573 $op = '!~' if $should_fail;
d2025f57 574
6d7cd591 575 my $todo = 0; # No longer any todo's
371a505e 576 my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
5f01d936 577 run_test($eval, $todo, "");
abf4d645 578
371a505e 579 $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
5f01d936 580 run_test($eval, $todo, "");
abf4d645 581
371a505e
KW
582 if ($lhs ne $rhs) {
583 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
5f01d936 584 run_test($eval, "", "");
abf4d645 585
371a505e 586 $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
5f01d936 587 run_test($eval, "", "");
371a505e 588 }
371a505e 589
fbebf34e
KW
590 # See if works on what could be a simple trie.
591 $eval = "my \$c = \"$lhs\"; my \$p = qr/$rhs|xyz/i$charset;$upgrade_target$upgrade_pattern \$c $op \$p";
5f01d936 592 run_test($eval, "", "");
67fc5dca 593
1ef17b72 594 foreach my $bracketed (0, 1) { # Put rhs in [...], or not
eee0f761
KW
595 next if $bracketed && @pattern != 1; # bracketed makes these
596 # or's instead of a sequence
a2d9a01a 597 foreach my $inverted (0,1) {
d2025f57 598 next if $inverted && ! $bracketed; # inversion only valid in [^...]
827f5bb8
KW
599 next if $inverted && @target != 1; # [perl #89750] multi-char
600 # not valid in [^...]
a2d9a01a
KW
601
602 # In some cases, add an extra character that doesn't fold, and
603 # looks ok in the output.
604 my $extra_char = "_";
605 foreach my $prepend ("", $extra_char) {
606 foreach my $append ("", $extra_char) {
a2d9a01a
KW
607
608 # Assemble the rhs. Put each character in a separate
609 # bracketed if using charclasses. This creates a stress on
610 # the code to span a match across multiple elements
611 my $rhs = "";
612 foreach my $rhs_char (@rhs) {
613 $rhs .= '[' if $bracketed;
614 $rhs .= '^' if $inverted;
615 $rhs .= $rhs_char;
616
617 # Add a character to the class, so class doesn't get
618 # optimized out
619 $rhs .= '_]' if $bracketed;
620 }
621
622 # Add one of: no capturing parens
623 # a single set
624 # a nested set
625 # Use quantifiers and extra variable width matches inside
626 # them to keep some optimizations from happening
627 foreach my $parend (0, 1, 2) {
628 my $interior = (! $parend)
629 ? $rhs
630 : ($parend == 1)
631 ? "(${rhs},?)"
632 : "((${rhs})+,?)";
633 foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
634
eee0f761
KW
635 # Perhaps should be TODOs, as are unimplemented, but
636 # maybe will never be implemented
637 next if @pattern != 1 && $quantifier;
638
a2d9a01a
KW
639 # A ? or * quantifier normally causes the thing to be
640 # able to match a null string
641 my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';
642
643 # But since we only quantify the last character in a
644 # multiple fold, the other characters will have width,
645 # except if we are quantifying the whole rhs
646 my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);
647
648 foreach my $l_anchor ("", '^') { # '\A' didn't change result)
649 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)
650
651 # The folded part can match the null string if it
652 # isn't required to have width, and there's not
653 # something on one or both sides that force it to.
2f7f8cb1
KW
654 my $both_sides = ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
655 my $must_match = ! $can_match_null || $both_sides;
656 # for performance, but doing this missed many failures
a2d9a01a 657 #next unless $must_match;
d08723ac 658 my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
a2d9a01a 659 my $op;
d08723ac 660 if ($must_match && $should_fail) {
a2d9a01a
KW
661 $op = 0;
662 } else {
663 $op = 1;
664 }
665 $op = ! $op if $must_match && $inverted;
27f6057f
KW
666
667 if ($inverted && @target > 1) {
668 # When doing an inverted match against a
669 # multi-char target, and there is not something on
670 # the left to anchor the match, if it shouldn't
671 # succeed, skip, as what will happen (when working
672 # correctly) is that it will match the first
673 # position correctly, and then be inverted to not
674 # match; then it will go to the second position
675 # where it won't match, but get inverted to match,
676 # and hence succeeding.
677 next if ! ($l_anchor || $prepend) && ! $op;
678
679 # Can't ever match for latin1 code points non-uni
680 # semantics that have a inverted multi-char fold
681 # when there is something on both sides and the
682 # quantifier isn't such as to span the required
683 # width, which is 2 or 3.
684 $op = 0 if $ord < 255
685 && ! $uni_semantics
686 && $both_sides
687 && ( ! $quantifier || $quantifier eq '?')
688 && $parend < 2;
689
690 # Similarly can't ever match when inverting a multi-char
691 # fold for /aa and the quantifier isn't sufficient
692 # to allow it to span to both sides.
693 $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
694
695 # Or for /l
696 $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
697 }
698
85882dd7
DM
699
700 my $desc = "my \$c = \"$prepend$lhs$append\"; "
701 . "my \$p = qr/$quantified/i;"
702 . "$upgrade_target$upgrade_pattern "
703 . "\$c " . ($op ? "=~" : "!~") . " \$p; ";
704 if ($DEBUG) {
705 $desc .= (
706 "; uni_semantics=$uni_semantics, "
707 . "should_fail=$should_fail, "
708 . "bracketed=$bracketed, "
709 . "prepend=$prepend, "
710 . "append=$append, "
711 . "parend=$parend, "
712 . "quantifier=$quantifier, "
713 . "l_anchor=$l_anchor, "
714 . "r_anchor=$r_anchor; "
715 . "pattern_above_latin1=$pattern_above_latin1; "
716 . "utf8_pattern=$utf8_pattern"
717 );
718 }
719
aa3ca102
DM
720 my $c = "$prepend$lhs_str$append";
721 my $p = qr/$quantified/i;
722 utf8::upgrade($c) if length($upgrade_target);
723 utf8::upgrade($p) if length($upgrade_pattern);
85882dd7 724 my $res = $op ? ($c =~ $p): ($c !~ $p);
aa3ca102 725
67fc5dca
DL
726 if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) {
727 # Failed or debug; output the result
728 $count++;
729 ok($res, $desc);
730 } else {
731 # Just count the test as passed
732 $okays++;
733 }
734 $this_iteration++;
a2d9a01a
KW
735 }
736 }
737 }
738 }
739 }
740 }
741 }
742 }
743 }
744 }
5f01d936
KW
745 unless($ENV{PERL_DEBUG_FULL_TEST}) {
746 $count++;
747 is $okays, $this_iteration, "$okays subtests ok for"
748 . " /$charset,"
749 . ' target="' . join("", @x_target) . '",'
750 . ' pat="' . join("", @x_pattern) . '"';
751 }
a2d9a01a
KW
752 }
753 }
754}
755
abf4d645 756plan($count);
a2d9a01a
KW
757
7581