This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add fold_grind.t
[perl5.git] / t / re / fold_grind.t
CommitLineData
a2d9a01a
KW
1# Grind out a lot of combinatoric tests for folding. Still missing are
2# testing backreferences and tries.
3
4use charnames ":full";
5
6binmode STDOUT, ":utf8";
7
8BEGIN {
9 chdir 't' if -d 't';
10 @INC = '../lib';
11 require './test.pl';
12}
13
14use strict;
15use warnings;
16
17# Tests both unicode and not, so make sure not implicitly testing unicode
18no feature 'unicode_strings';
19
20# Case-insensitive matching is a large and complicated issue. Perl does not
21# implement it fully, properly. For example, it doesn't include normalization
22# as part of the equation. To test every conceivable combination is clearly
23# impossible; these tests are mostly drawn from visual inspection of the code
24# and experience, trying to exercise all areas.
25
26# There are three basic ranges of characters that Perl may treat differently:
27# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
28# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants
29# are all controls that fold to themselves.
30my $ASCII = 1;
31
32# 2) Other characters that fit into a byte but are different in utf8 than not;
33# here referred to, taking some liberties, as Latin1.
34my $Latin1 = 2;
35
36# 3) Characters that won't fit in a byte; here referred to as Unicode
37my $Unicode = 3;
38
39# Within these basic groups are equivalence classes that testing any character
40# in is likely to lead to the same results as any other character. This is
41# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
42# set.
43my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
44
45sub range_type {
46 my $ord = shift;
47
48 return $ASCII if $ord < 128;
49 return $Latin1 if $ord < 256;
50 return $Unicode;
51}
52
53sub numerically {
54 return $a <=> $b
55}
56
57my %tests;
58my %simple_folds;
59my %multi_folds;
60
61# First, analyze the current Unicode's folding rules
62my %folded_from;
63my $file="../lib/unicore/CaseFolding.txt";
64open my $fh, "<", $file or die "Failed to read '$file': $!";
65while (<$fh>) {
66 chomp;
67
68 # Lines look like (though without the initial '#')
69 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
70
71 my ($line, $comment) = split / \s+ \# \s+ /x, $_;
72 next if $line eq "" || substr($line, 0, 1) eq '#';
73 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
74
75 my $from = hex $hex_from;
76
77 if ($fold_type eq 'F') {
78 next; # XXX TODO multi-char folds
79 my $from_range_type = range_type($from);
80 @folded = map { hex $_ } @folded;
81
82 # Include three code points that are handled internally by the regex
83 # engine specially, plus all non-Unicode multi folds (which actually
84 # the only one is already included in the three, but this makes sure)
85 # And if any member of the fold is not the same range type as the
86 # source, add it directly to the tests. It needs to be an array of an
87 # array, so that it is distinguished from multiple single folds
88 if ($from == 0xDF || $from == 0x390 || $from == 0x3B0
89 || $from_range_type != $Unicode
90 || grep { range_type($_) != $from_range_type } @folded)
91 {
92 $tests{$from} = [ [ @folded ] ];
93 }
94 else {
95
96 # Must be Unicode here, so chr is automatically utf8. Get the
97 # number of bytes in each. This is because the optimizer cares
98 # about length differences.
99 my $from_length = length encode('utf-8', chr($from));
100 my $to_length = length encode('utf-8', pack 'U*', @folded);
101 push @{$multi_folds{$from_length}{$to_length}}, { $from => [ @folded ] };
102 }
103 }
104
105 # Perl only deals with C and F folds
106 next if $fold_type ne 'C';
107
108 # C folds are single-char $from to single-char $folded
109 push @{$folded_from{hex $folded[0]}}, $from;
110}
111
112# Now try to sort the single char folds into equivalence classes of that are
113# likely to have identical successes and failures. Any fold that crosses
114# range types is suspect, and is automatically tested. Otherwise, store by
115# the number of characters that participate in a fold. Likely all folds in a
116# range type that fold to each other like B->b->B will have identical success
117# and failure; similarly all folds that have three characters participating
118# are likely to have the same successes and failures, etc.
119foreach my $folded (sort numerically keys %folded_from) {
120 my $target_range_type = range_type($folded);
121 my $count = @{$folded_from{$folded}};
122
123 # Automatically test any fold that crosses range types
124 if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}})
125 {
126 $tests{$folded} = $folded_from{$folded};
127 }
128 else {
129 push @{$simple_folds{$target_range_type}{$count}},
130 { $folded => $folded_from{$folded} };
131 }
132}
133
134foreach my $from_length (keys %multi_folds) {
135 foreach my $fold_length (keys %{$multi_folds{$from_length}}) {
136 #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length};
137 foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) {
138 #print __LINE__, ": $from_length, $fold_length, $test:\n";
139 my ($target, $pattern) = each %$test;
140 #print __LINE__, ": $target: $pattern\n";
141 $tests{$target} = $pattern;
142 last if $skip_apparently_redundant;
143 }
144 }
145}
146
147# Add in tests for single character folds. Add tests for each range type,
148# and within those tests for each number of characters participating in a
149# fold. Thus B->b has two characters participating. But K->k and Kelvin
150# Sign->k has three characters participating. So we would make sure that
151# there is a test for 3 chars, 4 chars, ... . (Note that the 'k' example is a
152# bad one because it crosses range types, so is automatically tested. In the
153# Unicode range there are various of these 3 and 4 char classes, but aren't as
154# easily described as the 'k' one.)
155foreach my $type (keys %simple_folds) {
156 foreach my $count (keys %{$simple_folds{$type}}) {
157 foreach my $test (@{$simple_folds{$type}{$count}}) {
158 my ($target, $pattern) = each %$test;
159 $tests{$target} = $pattern;
160 last if $skip_apparently_redundant;
161 }
162 }
163}
164
165# For each range type, test additionally a character that folds to itself
166$tests{0x3A} = [ 0x3A ];
167$tests{0xF7} = [ 0xF7 ];
168$tests{0x2C7} = [ 0x2C7 ];
169
170my $clump_execs = 10000; # Speed up by building an 'exec' of many tests
171my @eval_tests;
172
173# For use by pairs() in generating combinations
174sub prefix {
175 my $p = shift;
176 map [ $p, $_ ], @_
177}
178
179# Returns all ordered combinations of pairs of elements from the input array.
180# It doesn't return pairs like (a, a), (b, b). Change the slice to an array
181# to do that. This was just to have fewer tests.
182sub pairs (@) {
183 #print __LINE__, ": ", join(" XXX ", @_), "\n";
184 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
185}
186
187
188# Finally ready to do the tests
189my $count=1;
190foreach my $test (sort { numerically } keys %tests) {
191
192 my $previous_target;
193 my $previous_pattern;
194 my @pairs = pairs(sort numerically $test, @{$tests{$test}});
195
196 # Each fold can be viewed as a closure of all the characters that
197 # participate in it. Look at each possible pairing from a closure, with the
198 # first member of the pair the target string to match against, and the
199 # second member forming the pattern. Thus each fold member gets tested as
200 # the string, and the pattern with every other member in the opposite role.
201 while (my $pair = shift @pairs) {
202 my ($target, $pattern) = @$pair;
203
204 # When testing a char that doesn't fold, we can get the same
205 # permutation twice; so skip all but the first.
206 next if $previous_target
207 && $previous_target == $target
208 && $previous_pattern == $pattern;
209 ($previous_target, $previous_pattern) = ($target, $pattern);
210
211 # Each side may be either a single char or a string. Extract each into an
212 # array (perhaps of length 1)
213 my @target, my @pattern;
214 @target = (ref $target) ? @$target : $target;
215 @pattern = (ref $pattern) ? @$pattern : $pattern;
216
217 # Have to convert non-utf8 chars to native char set
218 @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
219 @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
220
221 # Get in hex form.
222 my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
223 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
224
225 my $target_above_latin1 = grep { $_ > 255 } @target;
226 my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
227 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
228
229 # We don't test multi-char folding into other multi-chars. We are testing
230 # a code point that folds to or from other characters. Find the single
231 # code point for diagnostic purposes. (If both are single, choose the
232 # target string)
233 my $ord = @target == 1 ? $target[0] : $pattern[0];
234 my $progress = sprintf "\"%s\" and /%s/",
235 join("", @x_target),
236 join("", @x_pattern);
237 #print $progress, "\n";
238 #diag $progress;
239
240 # Now grind out tests, using various combinations.
241 foreach my $uni_semantics ("", 'u') { # Both non- and uni semantics
242 foreach my $utf8_target (0, 1) { # Both utf8 and not, for
243 # code points < 256
244 my $upgrade_target = "";
245
246 # These must already be in utf8 because the string to match has
247 # something above latin1. So impossible to test if to not to be in
248 # utf8; and otherwise, no upgrade is needed.
249 next if $target_above_latin1 && ! $utf8_target;
250 $upgrade_target = '; utf8::upgrade($c)' if ! $target_above_latin1 && $utf8_target;
251
252 foreach my $utf8_pattern (0, 1) {
253 next if $pattern_above_latin1 && ! $utf8_pattern;
254 my $upgrade_pattern = "";
255 $upgrade_pattern = '; utf8::upgrade($p)' if ! $pattern_above_latin1 && $utf8_pattern;
256
257 my $lhs = join "", @x_target;
258 my @rhs = @x_pattern;
259 #print "$lhs: ", "/@rhs/\n";
260
261 foreach my $bracketed (0, 1) { # Put rhs in [...], or not
262 foreach my $inverted (0,1) {
263 next if $inverted && ! $bracketed;
264
265 # In some cases, add an extra character that doesn't fold, and
266 # looks ok in the output.
267 my $extra_char = "_";
268 foreach my $prepend ("", $extra_char) {
269 foreach my $append ("", $extra_char) {
270 # Append a char for after quantifier, as results vary if no
271 # char appended.
272
273 # Assemble the rhs. Put each character in a separate
274 # bracketed if using charclasses. This creates a stress on
275 # the code to span a match across multiple elements
276 my $rhs = "";
277 foreach my $rhs_char (@rhs) {
278 $rhs .= '[' if $bracketed;
279 $rhs .= '^' if $inverted;
280 $rhs .= $rhs_char;
281
282 # Add a character to the class, so class doesn't get
283 # optimized out
284 $rhs .= '_]' if $bracketed;
285 }
286
287 # Add one of: no capturing parens
288 # a single set
289 # a nested set
290 # Use quantifiers and extra variable width matches inside
291 # them to keep some optimizations from happening
292 foreach my $parend (0, 1, 2) {
293 my $interior = (! $parend)
294 ? $rhs
295 : ($parend == 1)
296 ? "(${rhs},?)"
297 : "((${rhs})+,?)";
298 foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
299
300 # A ? or * quantifier normally causes the thing to be
301 # able to match a null string
302 my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';
303
304 # But since we only quantify the last character in a
305 # multiple fold, the other characters will have width,
306 # except if we are quantifying the whole rhs
307 my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);
308
309 foreach my $l_anchor ("", '^') { # '\A' didn't change result)
310 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)
311
312 # The folded part can match the null string if it
313 # isn't required to have width, and there's not
314 # something on one or both sides that force it to.
315 my $must_match = ! $can_match_null || ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
316 #next unless $must_match;
317 my $quantified = "(?$uni_semantics:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
318 my $op;
319 if ($must_match && ! $utf8_target && ! $utf8_pattern && ! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self) {
320 $op = 0;
321 } else {
322 $op = 1;
323 }
324 $op = ! $op if $must_match && $inverted;
325 $op = ($op) ? '=~' : '!~';
326
327 my $stuff .= " utf8_target=$utf8_target, uni_semantics=$uni_semantics, utf8_pattern=$utf8_pattern, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";
328 my $eval = "my \$c = \"$prepend$lhs$append\"$upgrade_target; my \$p = qr/$quantified/i$upgrade_pattern; \$c $op \$p";
329
330 # XXX Doesn't currently test multi-char folds
331 next if @pattern != 1;
332 #next if ! $must_match;
333 push @eval_tests, qq[ok(eval '$eval', '$eval')];
334 $count++;
335
336 # Group tests
337 if (@eval_tests >= $clump_execs) {
338 eval join ";\n", @eval_tests;
339 undef @eval_tests;
340 }
341 }
342 }
343 }
344 }
345 }
346 }
347 }
348 }
349 }
350 }
351 }
352 }
353}
354
355# Finish up any tests not already done
356eval join ";\n", @eval_tests;
357
358plan($count-1);
359
3601