This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Generate REFFU and NREFFU
[perl5.git] / t / re / fold_grind.t
CommitLineData
a2d9a01a
KW
1# Grind out a lot of combinatoric tests for folding. Still missing are
2# testing backreferences and tries.
3
4use charnames ":full";
5
6binmode STDOUT, ":utf8";
7
8BEGIN {
9 chdir 't' if -d 't';
10 @INC = '../lib';
11 require './test.pl';
12}
13
14use strict;
15use warnings;
d08723ac 16use Encode;
a2d9a01a
KW
17
18# Tests both unicode and not, so make sure not implicitly testing unicode
19no feature 'unicode_strings';
20
21# Case-insensitive matching is a large and complicated issue. Perl does not
22# implement it fully, properly. For example, it doesn't include normalization
23# as part of the equation. To test every conceivable combination is clearly
24# impossible; these tests are mostly drawn from visual inspection of the code
25# and experience, trying to exercise all areas.
26
27# There are three basic ranges of characters that Perl may treat differently:
28# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
29# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants
30# are all controls that fold to themselves.
31my $ASCII = 1;
32
33# 2) Other characters that fit into a byte but are different in utf8 than not;
34# here referred to, taking some liberties, as Latin1.
35my $Latin1 = 2;
36
37# 3) Characters that won't fit in a byte; here referred to as Unicode
38my $Unicode = 3;
39
40# Within these basic groups are equivalence classes that testing any character
41# in is likely to lead to the same results as any other character. This is
42# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
43# set.
44my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
45
46sub range_type {
47 my $ord = shift;
48
49 return $ASCII if $ord < 128;
50 return $Latin1 if $ord < 256;
51 return $Unicode;
52}
53
54sub numerically {
55 return $a <=> $b
56}
57
58my %tests;
59my %simple_folds;
60my %multi_folds;
61
62# First, analyze the current Unicode's folding rules
63my %folded_from;
64my $file="../lib/unicore/CaseFolding.txt";
65open my $fh, "<", $file or die "Failed to read '$file': $!";
66while (<$fh>) {
67 chomp;
68
69 # Lines look like (though without the initial '#')
70 #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
71
72 my ($line, $comment) = split / \s+ \# \s+ /x, $_;
73 next if $line eq "" || substr($line, 0, 1) eq '#';
74 my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
75
76 my $from = hex $hex_from;
77
78 if ($fold_type eq 'F') {
79 next; # XXX TODO multi-char folds
80 my $from_range_type = range_type($from);
81 @folded = map { hex $_ } @folded;
82
83 # Include three code points that are handled internally by the regex
84 # engine specially, plus all non-Unicode multi folds (which actually
85 # the only one is already included in the three, but this makes sure)
86 # And if any member of the fold is not the same range type as the
87 # source, add it directly to the tests. It needs to be an array of an
88 # array, so that it is distinguished from multiple single folds
89 if ($from == 0xDF || $from == 0x390 || $from == 0x3B0
90 || $from_range_type != $Unicode
91 || grep { range_type($_) != $from_range_type } @folded)
92 {
93 $tests{$from} = [ [ @folded ] ];
94 }
95 else {
96
97 # Must be Unicode here, so chr is automatically utf8. Get the
98 # number of bytes in each. This is because the optimizer cares
99 # about length differences.
100 my $from_length = length encode('utf-8', chr($from));
101 my $to_length = length encode('utf-8', pack 'U*', @folded);
102 push @{$multi_folds{$from_length}{$to_length}}, { $from => [ @folded ] };
103 }
104 }
105
106 # Perl only deals with C and F folds
107 next if $fold_type ne 'C';
108
109 # C folds are single-char $from to single-char $folded
110 push @{$folded_from{hex $folded[0]}}, $from;
111}
112
113# Now try to sort the single char folds into equivalence classes of that are
114# likely to have identical successes and failures. Any fold that crosses
115# range types is suspect, and is automatically tested. Otherwise, store by
116# the number of characters that participate in a fold. Likely all folds in a
117# range type that fold to each other like B->b->B will have identical success
118# and failure; similarly all folds that have three characters participating
119# are likely to have the same successes and failures, etc.
120foreach my $folded (sort numerically keys %folded_from) {
121 my $target_range_type = range_type($folded);
122 my $count = @{$folded_from{$folded}};
123
124 # Automatically test any fold that crosses range types
125 if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}})
126 {
127 $tests{$folded} = $folded_from{$folded};
128 }
129 else {
130 push @{$simple_folds{$target_range_type}{$count}},
131 { $folded => $folded_from{$folded} };
132 }
133}
134
135foreach my $from_length (keys %multi_folds) {
136 foreach my $fold_length (keys %{$multi_folds{$from_length}}) {
137 #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length};
138 foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) {
139 #print __LINE__, ": $from_length, $fold_length, $test:\n";
140 my ($target, $pattern) = each %$test;
141 #print __LINE__, ": $target: $pattern\n";
142 $tests{$target} = $pattern;
143 last if $skip_apparently_redundant;
144 }
145 }
146}
147
148# Add in tests for single character folds. Add tests for each range type,
149# and within those tests for each number of characters participating in a
150# fold. Thus B->b has two characters participating. But K->k and Kelvin
151# Sign->k has three characters participating. So we would make sure that
152# there is a test for 3 chars, 4 chars, ... . (Note that the 'k' example is a
153# bad one because it crosses range types, so is automatically tested. In the
154# Unicode range there are various of these 3 and 4 char classes, but aren't as
155# easily described as the 'k' one.)
156foreach my $type (keys %simple_folds) {
157 foreach my $count (keys %{$simple_folds{$type}}) {
158 foreach my $test (@{$simple_folds{$type}{$count}}) {
159 my ($target, $pattern) = each %$test;
160 $tests{$target} = $pattern;
161 last if $skip_apparently_redundant;
162 }
163 }
164}
165
166# For each range type, test additionally a character that folds to itself
167$tests{0x3A} = [ 0x3A ];
168$tests{0xF7} = [ 0xF7 ];
169$tests{0x2C7} = [ 0x2C7 ];
170
171my $clump_execs = 10000; # Speed up by building an 'exec' of many tests
172my @eval_tests;
173
174# For use by pairs() in generating combinations
175sub prefix {
176 my $p = shift;
177 map [ $p, $_ ], @_
178}
179
180# Returns all ordered combinations of pairs of elements from the input array.
181# It doesn't return pairs like (a, a), (b, b). Change the slice to an array
182# to do that. This was just to have fewer tests.
183sub pairs (@) {
184 #print __LINE__, ": ", join(" XXX ", @_), "\n";
185 map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
186}
187
188
189# Finally ready to do the tests
190my $count=1;
191foreach my $test (sort { numerically } keys %tests) {
192
193 my $previous_target;
194 my $previous_pattern;
195 my @pairs = pairs(sort numerically $test, @{$tests{$test}});
196
197 # Each fold can be viewed as a closure of all the characters that
198 # participate in it. Look at each possible pairing from a closure, with the
199 # first member of the pair the target string to match against, and the
200 # second member forming the pattern. Thus each fold member gets tested as
201 # the string, and the pattern with every other member in the opposite role.
202 while (my $pair = shift @pairs) {
203 my ($target, $pattern) = @$pair;
204
205 # When testing a char that doesn't fold, we can get the same
206 # permutation twice; so skip all but the first.
207 next if $previous_target
208 && $previous_target == $target
209 && $previous_pattern == $pattern;
210 ($previous_target, $previous_pattern) = ($target, $pattern);
211
212 # Each side may be either a single char or a string. Extract each into an
213 # array (perhaps of length 1)
214 my @target, my @pattern;
215 @target = (ref $target) ? @$target : $target;
216 @pattern = (ref $pattern) ? @$pattern : $pattern;
217
218 # Have to convert non-utf8 chars to native char set
219 @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
220 @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
221
222 # Get in hex form.
223 my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
224 my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
225
226 my $target_above_latin1 = grep { $_ > 255 } @target;
227 my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
228 my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
229
230 # We don't test multi-char folding into other multi-chars. We are testing
231 # a code point that folds to or from other characters. Find the single
232 # code point for diagnostic purposes. (If both are single, choose the
233 # target string)
234 my $ord = @target == 1 ? $target[0] : $pattern[0];
235 my $progress = sprintf "\"%s\" and /%s/",
236 join("", @x_target),
237 join("", @x_pattern);
238 #print $progress, "\n";
239 #diag $progress;
240
241 # Now grind out tests, using various combinations.
d08723ac
KW
242 # XXX foreach my $charset ('d', 'u', 'l') {
243 foreach my $charset ('d', 'u') {
a2d9a01a
KW
244 foreach my $utf8_target (0, 1) { # Both utf8 and not, for
245 # code points < 256
246 my $upgrade_target = "";
247
248 # These must already be in utf8 because the string to match has
249 # something above latin1. So impossible to test if to not to be in
250 # utf8; and otherwise, no upgrade is needed.
251 next if $target_above_latin1 && ! $utf8_target;
d08723ac 252 $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
a2d9a01a 253
d08723ac
KW
254 foreach my $utf8_pattern (0, 1) {
255 next if $pattern_above_latin1 && ! $utf8_pattern;
256 my $uni_semantics = $utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern);
a2d9a01a 257 my $upgrade_pattern = "";
d08723ac 258 $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
a2d9a01a
KW
259
260 my $lhs = join "", @x_target;
261 my @rhs = @x_pattern;
d08723ac 262 my $should_fail = ! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self;
1ef17b72 263 foreach my $bracketed (0, 1) { # Put rhs in [...], or not
a2d9a01a
KW
264 foreach my $inverted (0,1) {
265 next if $inverted && ! $bracketed;
266
267 # In some cases, add an extra character that doesn't fold, and
268 # looks ok in the output.
269 my $extra_char = "_";
270 foreach my $prepend ("", $extra_char) {
271 foreach my $append ("", $extra_char) {
272 # Append a char for after quantifier, as results vary if no
273 # char appended.
274
275 # Assemble the rhs. Put each character in a separate
276 # bracketed if using charclasses. This creates a stress on
277 # the code to span a match across multiple elements
278 my $rhs = "";
279 foreach my $rhs_char (@rhs) {
280 $rhs .= '[' if $bracketed;
281 $rhs .= '^' if $inverted;
282 $rhs .= $rhs_char;
283
284 # Add a character to the class, so class doesn't get
285 # optimized out
286 $rhs .= '_]' if $bracketed;
287 }
288
289 # Add one of: no capturing parens
290 # a single set
291 # a nested set
292 # Use quantifiers and extra variable width matches inside
293 # them to keep some optimizations from happening
294 foreach my $parend (0, 1, 2) {
295 my $interior = (! $parend)
296 ? $rhs
297 : ($parend == 1)
298 ? "(${rhs},?)"
299 : "((${rhs})+,?)";
300 foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
301
302 # A ? or * quantifier normally causes the thing to be
303 # able to match a null string
304 my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';
305
306 # But since we only quantify the last character in a
307 # multiple fold, the other characters will have width,
308 # except if we are quantifying the whole rhs
309 my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);
310
311 foreach my $l_anchor ("", '^') { # '\A' didn't change result)
312 foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)
313
314 # The folded part can match the null string if it
315 # isn't required to have width, and there's not
316 # something on one or both sides that force it to.
317 my $must_match = ! $can_match_null || ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
318 #next unless $must_match;
d08723ac 319 my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
a2d9a01a 320 my $op;
d08723ac 321 if ($must_match && $should_fail) {
a2d9a01a
KW
322 $op = 0;
323 } else {
324 $op = 1;
325 }
326 $op = ! $op if $must_match && $inverted;
327 $op = ($op) ? '=~' : '!~';
328
d08723ac
KW
329 my $stuff .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";
330 $stuff .= "; pattern_above_latin1=$pattern_above_latin1; utf8_pattern=$utf8_pattern";
331 my $eval = "my \$c = \"$prepend$lhs$append\"; my \$p = qr/$quantified/i;$upgrade_target$upgrade_pattern \$c $op \$p;";
a2d9a01a
KW
332
333 # XXX Doesn't currently test multi-char folds
334 next if @pattern != 1;
335 #next if ! $must_match;
336 push @eval_tests, qq[ok(eval '$eval', '$eval')];
337 $count++;
338
339 # Group tests
340 if (@eval_tests >= $clump_execs) {
341 eval join ";\n", @eval_tests;
342 undef @eval_tests;
343 }
344 }
345 }
346 }
347 }
348 }
349 }
350 }
351 }
352 }
353 }
354 }
355 }
356}
357
358# Finish up any tests not already done
359eval join ";\n", @eval_tests;
360
361plan($count-1);
362
3631