[perl5.git] / t / re / fold_grind.t

# Grind out a lot of combinatoric tests for folding.

use charnames ":full";

binmode STDOUT, ":utf8";

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
    require './test.pl';
    skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
}

my $DEBUG = 0;  # Outputs extra information for debugging this .t

use strict;
use warnings;
use Encode;
use POSIX;

# Tests both unicode and not, so make sure not implicitly testing unicode
no feature 'unicode_strings';

# Case-insensitive matching is a large and complicated issue.  Perl does not
# implement it fully, properly.  For example, it doesn't include normalization
# as part of the equation.  To test every conceivable combination is clearly
# impossible; these tests are mostly drawn from visual inspection of the code
# and experience, trying to exercise all areas.

# There are three basic ranges of characters that Perl may treat differently:
# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
#    referred to here as ASCII.  On EBCDIC machines, the non-ASCII invariants
#    are all controls that fold to themselves.
my $ASCII = 1;

# 2) Other characters that fit into a byte but are different in utf8 than not;
#    here referred to, taking some liberties, as Latin1.
my $Latin1 = 2;

# 3) Characters that won't fit in a byte; here referred to as Unicode
my $Unicode = 3;

# Within these basic groups are equivalence classes that testing any character
# in is likely to lead to the same results as any other character.  This is
# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
# set.
my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};

sub range_type {
    my $ord = shift;

    return $ASCII if $ord < 128;
    return $Latin1 if $ord < 256;
    return $Unicode;
}

sub numerically {
    return $a <=> $b
}

sub format_test($$$$) {
    my ($test, $count, $todo, $debug) = @_;

    # Create a test entry, with TODO set if it is one of the known problem
    # code points

    $debug = "" unless $DEBUG;
    $todo = "Known problem" if $todo;

    return qq[TODO: { local \$::TODO = "$todo"; ok(eval '$test', '$test; $debug'); }];
}

my %tests;          # The final set of tests. keys are the code points to test
my %simple_folds;
my %multi_folds;

# First, analyze the current Unicode's folding rules
my %folded_from;
my $file="../lib/unicore/CaseFolding.txt";
open my $fh, "<", $file or die "Failed to read '$file': $!";
while (<$fh>) {
    chomp;

    # Lines look like (though without the initial '#')
    #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

    my ($line, $comment) = split / \s+ \# \s+ /x, $_;
    next if $line eq "" || substr($line, 0, 1) eq '#';
    my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;

    my $from = hex $hex_from;

    if ($fold_type eq 'F') {
         my $from_range_type = range_type($from);

        # If we were testing comprehensively, we would try every combination
        # of upper and lower case in the fold, but it is quite likely that if
        # the code can handle all combinations if it can handle the cases
        # where everything is upper and when everything is lower.  Because of
        # complement matching, we need to do both.  And we use the
        # reverse-fold instead of uppercase.
        @folded = map { hex $_ } @folded;
        # XXX better to use reverse fold of these instead of uc
        my @uc_folded = map { ord uc chr $_ } @folded;

        # Include three code points that are handled internally by the regex
        # engine specially, plus all non-above-255 multi folds (which actually
        # the only one is already included in the three, but this makes sure)
        # And if any member of the fold is not the same range type as the
        # source, add it directly to the tests.  It needs to be an array of an
        # array, so that it is distinguished from multiple single folds
        if ($from == 0xDF || $from == 0x390 || $from == 0x3B0
            || $from_range_type != $Unicode
            || grep { range_type($_) != $from_range_type } @folded)
        {
            $tests{$from} = [ [ @folded ], [ @uc_folded ] ];
        }
        else {

            # The only multi-char non-utf8 fold is DF, which is handled above,
            # so here chr() must be utf8.  Get the number of bytes in each.
            # This is because the optimizer cares about length differences.
            my $from_length = length encode('UTF-8', chr($from));
            my $to_length = length encode('UTF-8', pack 'U*', @folded);
            push @{$multi_folds{$from_length}{$to_length}}, { $from => [ [ @folded ], [ @uc_folded ] ] };
        }
    }

    # Perl only deals with C and F folds
    next if $fold_type ne 'C';

    # C folds are single-char $from to single-char $folded, in chr terms
    # folded_from{'s'} = [ 'S', \N{LATIN SMALL LETTER LONG S} ]
    push @{$folded_from{hex $folded[0]}}, $from;
}

# Now try to sort the single char folds into equivalence classes that are
# likely to have identical successes and failures.  Any fold that crosses
# range types is suspect, and is automatically tested.  Otherwise, store by
# the number of characters that participate in a fold.  Likely all folds in a
# range type that fold to each other like B->b->B will have identical success
# and failure; similarly all folds that have three characters participating
# are likely to have the same successes and failures, etc.
foreach my $folded (sort numerically keys %folded_from) {
    my $target_range_type  = range_type($folded);
    my $count = @{$folded_from{$folded}};

    # Automatically test any fold that crosses range types
    if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}})
    {
        $tests{$folded} = $folded_from{$folded};
    }
    else {
        push @{$simple_folds{$target_range_type}{$count}},
               { $folded => $folded_from{$folded} };
    }
}

foreach my $from_length (keys %multi_folds) {
    foreach my $fold_length (keys %{$multi_folds{$from_length}}) {
        #print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length};
        foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) {
            #print __LINE__, ": $from_length, $fold_length, $test:\n";
            my ($target, $pattern) = each %$test;
            #print __LINE__, ": $target: $pattern\n";
            $tests{$target} = $pattern;
            last if $skip_apparently_redundant;
        }
    }
}

# Add in tests for single character folds.  Add tests for each range type,
# and within those tests for each number of characters participating in a
# fold.  Thus B->b has two characters participating.  But K->k and Kelvin
# Sign->k has three characters participating.  So we would make sure that
# there is a test for 3 chars, 4 chars, ... .  (Note that the 'k' example is a
# bad one because it crosses range types, so is automatically tested.  In the
# Unicode range there are various of these 3 and 4 char classes, but aren't as
# easily described as the 'k' one.)
foreach my $type (keys %simple_folds) {
    foreach my $count (keys %{$simple_folds{$type}}) {
        foreach my $test (@{$simple_folds{$type}{$count}}) {
            my ($target, $pattern) = each %$test;
            $tests{$target} = $pattern;
            last if $skip_apparently_redundant;
        }
    }
}

# For each range type, test additionally a character that folds to itself
$tests{0x3A} = [ 0x3A ];
$tests{0xF7} = [ 0xF7 ];
$tests{0x2C7} = [ 0x2C7 ];

my $clump_execs = 1000;    # Speed up by building an 'exec' of many tests
my @eval_tests;

# To cut down on the number of tests
my $has_tested_aa_above_latin1;
my $has_tested_latin1_aa;
my $has_tested_l_above_latin1;
my $has_tested_latin1_l;

# For use by pairs() in generating combinations
sub prefix {
    my $p = shift;
    map [ $p, $_ ], @_
}

# Returns all ordered combinations of pairs of elements from the input array.
# It doesn't return pairs like (a, a), (b, b).  Change the slice to an array
# to do that.  This was just to have fewer tests.
sub pairs (@) {
    #print __LINE__, ": ", join(" XXX ", @_), "\n";
    map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
}

my @charsets = qw(d u aa);
my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
push @charsets, 'l' if $current_locale eq 'C';

# Finally ready to do the tests
my $count=0;
foreach my $test (sort { numerically } keys %tests) {

  my $previous_target;
  my $previous_pattern;
  my @pairs = pairs(sort numerically $test, @{$tests{$test}});

  # Each fold can be viewed as a closure of all the characters that
  # participate in it.  Look at each possible pairing from a closure, with the
  # first member of the pair the target string to match against, and the
  # second member forming the pattern.  Thus each fold member gets tested as
  # the string, and the pattern with every other member in the opposite role.
  while (my $pair = shift @pairs) {
    my ($target, $pattern) = @$pair;

    # When testing a char that doesn't fold, we can get the same
    # permutation twice; so skip all but the first.
    next if $previous_target
            && $previous_target == $target
            && $previous_pattern == $pattern;
    ($previous_target, $previous_pattern) = ($target, $pattern);

    # Each side may be either a single char or a string.  Extract each into an
    # array (perhaps of length 1)
    my @target, my @pattern;
    @target = (ref $target) ? @$target : $target;
    @pattern = (ref $pattern) ? @$pattern : $pattern;

    # Have to convert non-utf8 chars to native char set
    @target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
    @pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;

    # Get in hex form.
    my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
    my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;

    my $target_above_latin1 = grep { $_ > 255 } @target;
    my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
    my $target_has_ascii = grep { $_ < 128 } @target;
    my $pattern_has_ascii = grep { $_ < 128 } @pattern;
    my $target_has_latin1 = grep { $_ < 256 } @target;
    my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
    my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];

    # We don't test multi-char folding into other multi-chars.  We are testing
    # a code point that folds to or from other characters.  Find the single
    # code point for diagnostic purposes.  (If both are single, choose the
    # target string)
    my $ord = @target == 1 ? $target[0] : $pattern[0];
    my $progress = sprintf "%04X: \"%s\" and /%s/",
                            $test,
                            join("", @x_target),
                            join("", @x_pattern);
    #print $progress, "\n";
    #diag $progress;

    # Now grind out tests, using various combinations.
    foreach my $charset (@charsets) {

      # /aa should only affect things with folds in the ASCII range.  But, try
      # it on one pair in the other ranges just to make sure it doesn't break
      # them.  Set these flags.  They are set to the ord of the character
      # tested so that all pairs of that ord get tested.
      if ($charset eq 'aa') {
        if (! $target_has_ascii && ! $pattern_has_ascii) {
          if ($target_above_latin1 || $pattern_above_latin1) {
            next if defined $has_tested_aa_above_latin1
                    && $has_tested_aa_above_latin1 != $test;
            $has_tested_aa_above_latin1 = $test;
          }
          next if defined $has_tested_latin1_aa && $has_tested_latin1_aa != $test;
          $has_tested_latin1_aa = $test;
        }
      }
      elsif ($charset eq 'l') {
        if (! $target_has_latin1 && ! $pattern_has_latin1) {
          next if defined $has_tested_latin1_l && $has_tested_latin1_l != $test;
          $has_tested_latin1_l = $test;
        }
      }

      foreach my $utf8_target (0, 1) {    # Both utf8 and not, for
                                          # code points < 256
        my $upgrade_target = "";

        # These must already be in utf8 because the string to match has
        # something above latin1.  So impossible to test if to not to be in
        # utf8; and otherwise, no upgrade is needed.
        next if $target_above_latin1 && ! $utf8_target;
        $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;

        foreach my $utf8_pattern (0, 1) {
          next if $pattern_above_latin1 && ! $utf8_pattern;

          # Our testing of 'l' uses the POSIX locale, which is ASCII-only
          my $uni_semantics = $charset ne 'l' && ($utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern) || $charset =~ /a/);
          my $upgrade_pattern = "";
          $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;

          my $lhs = join "", @x_target;
          my @rhs = @x_pattern;
          my $rhs = join "", @rhs;
          my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
                            || ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
                            || ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);

          # Do simple tests of referencing capture buffers, named and
          # numbered.
          my $op = '=~';
          $op = '!~' if $should_fail;

          # I'm afraid this was derived from trial and error.
          my $todo = ($test == 0xdf
                      && $lhs =~ /DF/
                      && $uni_semantics
                      && ($charset eq 'u' || $charset eq 'd')
                      && ! ($charset eq 'u' && (($upgrade_target eq "") != ($upgrade_pattern eq "")))
                      && ! ($charset eq 'd' && (! $upgrade_target || ! $upgrade_pattern))
                      );
          my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
          push @eval_tests, format_test($eval, ++$count, $todo, "");

          $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
          push @eval_tests, format_test($eval, ++$count, $todo, "");

          if ($lhs ne $rhs) {
            $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
            push @eval_tests, format_test($eval, ++$count, "", "");

            $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
            push @eval_tests, format_test($eval, ++$count, "", "");
          }

          foreach my $bracketed (0, 1) {   # Put rhs in [...], or not
            foreach my $inverted (0,1) {
                next if $inverted && ! $bracketed;  # inversion only valid in [^...]

              # In some cases, add an extra character that doesn't fold, and
              # looks ok in the output.
              my $extra_char = "_";
              foreach my $prepend ("", $extra_char) {
                foreach my $append ("", $extra_char) {

                  # Assemble the rhs.  Put each character in a separate
                  # bracketed if using charclasses.  This creates a stress on
                  # the code to span a match across multiple elements
                  my $rhs = "";
                  foreach my $rhs_char (@rhs) {
                      $rhs .= '[' if $bracketed;
                      $rhs .= '^' if $inverted;
                      $rhs .=  $rhs_char;

                      # Add a character to the class, so class doesn't get
                      # optimized out
                      $rhs .= '_]' if $bracketed;
                  }

                  # Add one of: no capturing parens
                  #             a single set
                  #             a nested set
                  # Use quantifiers and extra variable width matches inside
                  # them to keep some optimizations from happening
                  foreach my $parend (0, 1, 2) {
                    my $interior = (! $parend)
                                    ? $rhs
                                    : ($parend == 1)
                                        ? "(${rhs},?)"
                                        : "((${rhs})+,?)";
                    foreach my $quantifier ("", '?', '*', '+', '{1,3}') {

                      # A ? or * quantifier normally causes the thing to be
                      # able to match a null string
                      my $quantifier_can_match_null = $quantifier eq '?' || $quantifier eq '*';

                      # But since we only quantify the last character in a
                      # multiple fold, the other characters will have width,
                      # except if we are quantifying the whole rhs
                      my $can_match_null = $quantifier_can_match_null && (@rhs == 1 || $parend);

                      foreach my $l_anchor ("", '^') { # '\A' didn't change result)
                        foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)

                          # The folded part can match the null string if it
                          # isn't required to have width, and there's not
                          # something on one or both sides that force it to.
                          my $both_sides = ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append);
                          my $must_match = ! $can_match_null || $both_sides;
                          # for performance, but doing this missed many failures
                          #next unless $must_match;
                          my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
                          my $op;
                          if ($must_match && $should_fail)  {
                              $op = 0;
                          } else {
                              $op = 1;
                          }
                          $op = ! $op if $must_match && $inverted;

                          if ($inverted && @target > 1) {
                            # When doing an inverted match against a
                            # multi-char target, and there is not something on
                            # the left to anchor the match, if it shouldn't
                            # succeed, skip, as what will happen (when working
                            # correctly) is that it will match the first
                            # position correctly, and then be inverted to not
                            # match; then it will go to the second position
                            # where it won't match, but get inverted to match,
                            # and hence succeeding.
                            next if ! ($l_anchor || $prepend) && ! $op;

                            # Can't ever match for latin1 code points non-uni
                            # semantics that have a inverted multi-char fold
                            # when there is something on both sides and the
                            # quantifier isn't such as to span the required
                            # width, which is 2 or 3.
                            $op = 0 if $ord < 255
                                       && ! $uni_semantics
                                       && $both_sides
                                       && ( ! $quantifier || $quantifier eq '?')
                                       && $parend < 2;

                            # Similarly can't ever match when inverting a multi-char
                            # fold for /aa and the quantifier isn't sufficient
                            # to allow it to span to both sides.
                            $op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;

                            # Or for /l
                            $op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier || $quantifier eq '?') && $parend < 2;
                          }

                          $op = ($op) ? '=~' : '!~';

                          my $debug .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";
                          $debug .= "; pattern_above_latin1=$pattern_above_latin1; utf8_pattern=$utf8_pattern";
                          my $eval = "my \$c = \"$prepend$lhs$append\"; my \$p = qr/$quantified/i;$upgrade_target$upgrade_pattern \$c $op \$p";

                          # XXX Doesn't currently test multi-char folds in pattern
                          next if @pattern != 1;
                          push @eval_tests, format_test($eval, ++$count, "", $debug);

                          # Group tests
                          if (@eval_tests >= $clump_execs) {
                              #eval "use re qw(Debug COMPILE EXECUTE);" . join ";\n", @eval_tests;
                              eval join ";\n", @eval_tests;
                              if ($@) {
                                fail($@);
                                exit 1;
                              }
                              undef @eval_tests;
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

# Finish up any tests not already done
eval join ";\n", @eval_tests;
if ($@) {
  fail($@);
  exit 1;
}

plan($count);

1
Commit	Line	Data
371a505e	1	# Grind out a lot of combinatoric tests for folding.
a2d9a01a KW	2
	3	use charnames ":full";
	4
	5	binmode STDOUT, ":utf8";
	6
	7	BEGIN {
	8	chdir 't' if -d 't';
	9	@INC = '../lib';
	10	require './test.pl';
a59efd0a	11	skip_all_if_miniperl("no dynamic loading on miniperl, no Encode nor POSIX");
a2d9a01a KW	12	}
a2d9a01a KW	13
29d01a3e	14	my $DEBUG = 0; # Outputs extra information for debugging this .t
abf4d645	15
a2d9a01a KW	16	use strict;
a2d9a01a KW	17	use warnings;
d08723ac	18	use Encode;
a59efd0a	19	use POSIX;
a2d9a01a KW	20
	21	# Tests both unicode and not, so make sure not implicitly testing unicode
	22	no feature 'unicode_strings';
	23
	24	# Case-insensitive matching is a large and complicated issue. Perl does not
	25	# implement it fully, properly. For example, it doesn't include normalization
	26	# as part of the equation. To test every conceivable combination is clearly
	27	# impossible; these tests are mostly drawn from visual inspection of the code
	28	# and experience, trying to exercise all areas.
	29
	30	# There are three basic ranges of characters that Perl may treat differently:
	31	# 1) Invariants under utf8 which on ASCII-ish machines are ASCII, and are
	32	# referred to here as ASCII. On EBCDIC machines, the non-ASCII invariants
	33	# are all controls that fold to themselves.
	34	my $ASCII = 1;
	35
	36	# 2) Other characters that fit into a byte but are different in utf8 than not;
	37	# here referred to, taking some liberties, as Latin1.
	38	my $Latin1 = 2;
	39
	40	# 3) Characters that won't fit in a byte; here referred to as Unicode
	41	my $Unicode = 3;
	42
	43	# Within these basic groups are equivalence classes that testing any character
	44	# in is likely to lead to the same results as any other character. This is
	45	# used to cut down the number of tests needed, unless PERL_RUN_SLOW_TESTS is
	46	# set.
	47	my $skip_apparently_redundant = ! $ENV{PERL_RUN_SLOW_TESTS};
	48
	49	sub range_type {
	50	my $ord = shift;
	51
	52	return $ASCII if $ord < 128;
	53	return $Latin1 if $ord < 256;
	54	return $Unicode;
	55	}
	56
	57	sub numerically {
	58	return $a <=> $b
	59	}
	60
7a0a13a6 KW	61	sub format_test($$$$) {
7a0a13a6 KW	62	my ($test, $count, $todo, $debug) = @_;
abf4d645	63
29d01a3e KW	64	# Create a test entry, with TODO set if it is one of the known problem
	65	# code points
	66
abf4d645	67	$debug = "" unless $DEBUG;
7a0a13a6	68	$todo = "Known problem" if $todo;
abf4d645 KW	69
	70	return qq[TODO: { local \$::TODO = "$todo"; ok(eval '$test', '$test; $debug'); }];
	71	}
	72
29d01a3e	73	my %tests; # The final set of tests. keys are the code points to test
a2d9a01a KW	74	my %simple_folds;
	75	my %multi_folds;
	76
	77	# First, analyze the current Unicode's folding rules
	78	my %folded_from;
	79	my $file="../lib/unicore/CaseFolding.txt";
	80	open my $fh, "<", $file or die "Failed to read '$file': $!";
	81	while (<$fh>) {
	82	chomp;
	83
	84	# Lines look like (though without the initial '#')
	85	#0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	86
	87	my ($line, $comment) = split / \s+ \# \s+ /x, $_;
	88	next if $line eq "" \|\| substr($line, 0, 1) eq '#';
	89	my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
	90
	91	my $from = hex $hex_from;
	92
	93	if ($fold_type eq 'F') {
27f6057f KW	94	my $from_range_type = range_type($from);
	95
	96	# If we were testing comprehensively, we would try every combination
	97	# of upper and lower case in the fold, but it is quite likely that if
	98	# the code can handle all combinations if it can handle the cases
	99	# where everything is upper and when everything is lower. Because of
	100	# complement matching, we need to do both. And we use the
	101	# reverse-fold instead of uppercase.
a2d9a01a	102	@folded = map { hex $_ } @folded;
27f6057f KW	103	# XXX better to use reverse fold of these instead of uc
27f6057f KW	104	my @uc_folded = map { ord uc chr $_ } @folded;
a2d9a01a KW	105
a2d9a01a KW	106	# Include three code points that are handled internally by the regex
29d01a3e	107	# engine specially, plus all non-above-255 multi folds (which actually
a2d9a01a KW	108	# the only one is already included in the three, but this makes sure)
	109	# And if any member of the fold is not the same range type as the
	110	# source, add it directly to the tests. It needs to be an array of an
	111	# array, so that it is distinguished from multiple single folds
	112	if ($from == 0xDF \|\| $from == 0x390 \|\| $from == 0x3B0
	113	\|\| $from_range_type != $Unicode
	114	\|\| grep { range_type($_) != $from_range_type } @folded)
	115	{
27f6057f	116	$tests{$from} = [ [ @folded ], [ @uc_folded ] ];
a2d9a01a KW	117	}
	118	else {
	119
27f6057f KW	120	# The only multi-char non-utf8 fold is DF, which is handled above,
	121	# so here chr() must be utf8. Get the number of bytes in each.
	122	# This is because the optimizer cares about length differences.
	123	my $from_length = length encode('UTF-8', chr($from));
	124	my $to_length = length encode('UTF-8', pack 'U*', @folded);
	125	push @{$multi_folds{$from_length}{$to_length}}, { $from => [ [ @folded ], [ @uc_folded ] ] };
a2d9a01a KW	126	}
	127	}
	128
	129	# Perl only deals with C and F folds
	130	next if $fold_type ne 'C';
	131
d2025f57 KW	132	# C folds are single-char $from to single-char $folded, in chr terms
d2025f57 KW	133	# folded_from{'s'} = [ 'S', \N{LATIN SMALL LETTER LONG S} ]
a2d9a01a KW	134	push @{$folded_from{hex $folded[0]}}, $from;
	135	}
	136
d2025f57	137	# Now try to sort the single char folds into equivalence classes that are
a2d9a01a KW	138	# likely to have identical successes and failures. Any fold that crosses
	139	# range types is suspect, and is automatically tested. Otherwise, store by
	140	# the number of characters that participate in a fold. Likely all folds in a
	141	# range type that fold to each other like B->b->B will have identical success
	142	# and failure; similarly all folds that have three characters participating
	143	# are likely to have the same successes and failures, etc.
	144	foreach my $folded (sort numerically keys %folded_from) {
	145	my $target_range_type = range_type($folded);
	146	my $count = @{$folded_from{$folded}};
	147
	148	# Automatically test any fold that crosses range types
	149	if (grep { range_type($_) != $target_range_type } @{$folded_from{$folded}})
	150	{
	151	$tests{$folded} = $folded_from{$folded};
	152	}
	153	else {
	154	push @{$simple_folds{$target_range_type}{$count}},
	155	{ $folded => $folded_from{$folded} };
a7caa9e8	156	}
a2d9a01a KW	157	}
	158
	159	foreach my $from_length (keys %multi_folds) {
	160	foreach my $fold_length (keys %{$multi_folds{$from_length}}) {
	161	#print __LINE__, ref $multi_folds{$from_length}{$fold_length}, Dumper $multi_folds{$from_length}{$fold_length};
	162	foreach my $test (@{$multi_folds{$from_length}{$fold_length}}) {
	163	#print __LINE__, ": $from_length, $fold_length, $test:\n";
	164	my ($target, $pattern) = each %$test;
	165	#print __LINE__, ": $target: $pattern\n";
	166	$tests{$target} = $pattern;
	167	last if $skip_apparently_redundant;
	168	}
	169	}
	170	}
	171
	172	# Add in tests for single character folds. Add tests for each range type,
	173	# and within those tests for each number of characters participating in a
	174	# fold. Thus B->b has two characters participating. But K->k and Kelvin
	175	# Sign->k has three characters participating. So we would make sure that
	176	# there is a test for 3 chars, 4 chars, ... . (Note that the 'k' example is a
	177	# bad one because it crosses range types, so is automatically tested. In the
	178	# Unicode range there are various of these 3 and 4 char classes, but aren't as
	179	# easily described as the 'k' one.)
	180	foreach my $type (keys %simple_folds) {
	181	foreach my $count (keys %{$simple_folds{$type}}) {
	182	foreach my $test (@{$simple_folds{$type}{$count}}) {
	183	my ($target, $pattern) = each %$test;
	184	$tests{$target} = $pattern;
	185	last if $skip_apparently_redundant;
	186	}
	187	}
	188	}
	189
	190	# For each range type, test additionally a character that folds to itself
	191	$tests{0x3A} = [ 0x3A ];
	192	$tests{0xF7} = [ 0xF7 ];
	193	$tests{0x2C7} = [ 0x2C7 ];
	194
29069c2e	195	my $clump_execs = 1000; # Speed up by building an 'exec' of many tests
a2d9a01a KW	196	my @eval_tests;
a2d9a01a KW	197
2f7f8cb1 KW	198	# To cut down on the number of tests
	199	my $has_tested_aa_above_latin1;
	200	my $has_tested_latin1_aa;
17580e7a KW	201	my $has_tested_l_above_latin1;
17580e7a KW	202	my $has_tested_latin1_l;
2f7f8cb1	203
a2d9a01a KW	204	# For use by pairs() in generating combinations
	205	sub prefix {
	206	my $p = shift;
a7caa9e8	207	map [ $p, $_ ], @_
a2d9a01a KW	208	}
	209
	210	# Returns all ordered combinations of pairs of elements from the input array.
	211	# It doesn't return pairs like (a, a), (b, b). Change the slice to an array
	212	# to do that. This was just to have fewer tests.
a7caa9e8	213	sub pairs (@) {
a2d9a01a	214	#print __LINE__, ": ", join(" XXX ", @_), "\n";
a7caa9e8	215	map { prefix $_[$_], @_[0..$_-1, $_+1..$#_] } 0..$#_
a2d9a01a KW	216	}
a2d9a01a KW	217
a59efd0a KW	218	my @charsets = qw(d u aa);
	219	my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
	220	push @charsets, 'l' if $current_locale eq 'C';
a2d9a01a KW	221
a2d9a01a KW	222	# Finally ready to do the tests
abf4d645	223	my $count=0;
a2d9a01a KW	224	foreach my $test (sort { numerically } keys %tests) {
	225
	226	my $previous_target;
	227	my $previous_pattern;
	228	my @pairs = pairs(sort numerically $test, @{$tests{$test}});
	229
	230	# Each fold can be viewed as a closure of all the characters that
	231	# participate in it. Look at each possible pairing from a closure, with the
	232	# first member of the pair the target string to match against, and the
	233	# second member forming the pattern. Thus each fold member gets tested as
	234	# the string, and the pattern with every other member in the opposite role.
	235	while (my $pair = shift @pairs) {
	236	my ($target, $pattern) = @$pair;
	237
	238	# When testing a char that doesn't fold, we can get the same
	239	# permutation twice; so skip all but the first.
	240	next if $previous_target
	241	&& $previous_target == $target
	242	&& $previous_pattern == $pattern;
	243	($previous_target, $previous_pattern) = ($target, $pattern);
	244
	245	# Each side may be either a single char or a string. Extract each into an
	246	# array (perhaps of length 1)
	247	my @target, my @pattern;
	248	@target = (ref $target) ? @$target : $target;
	249	@pattern = (ref $pattern) ? @$pattern : $pattern;
	250
	251	# Have to convert non-utf8 chars to native char set
	252	@target = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @target;
	253	@pattern = map { $_ > 255 ? $_ : ord latin1_to_native(chr($_)) } @pattern;
	254
	255	# Get in hex form.
	256	my @x_target = map { sprintf "\\x{%04X}", $_ } @target;
	257	my @x_pattern = map { sprintf "\\x{%04X}", $_ } @pattern;
	258
	259	my $target_above_latin1 = grep { $_ > 255 } @target;
	260	my $pattern_above_latin1 = grep { $_ > 255 } @pattern;
2f7f8cb1 KW	261	my $target_has_ascii = grep { $_ < 128 } @target;
2f7f8cb1 KW	262	my $pattern_has_ascii = grep { $_ < 128 } @pattern;
17580e7a KW	263	my $target_has_latin1 = grep { $_ < 256 } @target;
17580e7a KW	264	my $pattern_has_latin1 = grep { $_ < 256 } @pattern;
a2d9a01a KW	265	my $is_self = @target == 1 && @pattern == 1 && $target[0] == $pattern[0];
	266
	267	# We don't test multi-char folding into other multi-chars. We are testing
	268	# a code point that folds to or from other characters. Find the single
	269	# code point for diagnostic purposes. (If both are single, choose the
	270	# target string)
	271	my $ord = @target == 1 ? $target[0] : $pattern[0];
7fea222d KW	272	my $progress = sprintf "%04X: \"%s\" and /%s/",
7fea222d KW	273	$test,
a2d9a01a KW	274	join("", @x_target),
	275	join("", @x_pattern);
	276	#print $progress, "\n";
	277	#diag $progress;
	278
	279	# Now grind out tests, using various combinations.
a59efd0a	280	foreach my $charset (@charsets) {
2f7f8cb1 KW	281
	282	# /aa should only affect things with folds in the ASCII range. But, try
	283	# it on one pair in the other ranges just to make sure it doesn't break
	284	# them. Set these flags. They are set to the ord of the character
	285	# tested so that all pairs of that ord get tested.
	286	if ($charset eq 'aa') {
	287	if (! $target_has_ascii && ! $pattern_has_ascii) {
	288	if ($target_above_latin1 \|\| $pattern_above_latin1) {
	289	next if defined $has_tested_aa_above_latin1
41ce0a5e KW	290	&& $has_tested_aa_above_latin1 != $test;
41ce0a5e KW	291	$has_tested_aa_above_latin1 = $test;
2f7f8cb1	292	}
41ce0a5e KW	293	next if defined $has_tested_latin1_aa && $has_tested_latin1_aa != $test;
41ce0a5e KW	294	$has_tested_latin1_aa = $test;
2f7f8cb1 KW	295	}
2f7f8cb1 KW	296	}
17580e7a KW	297	elsif ($charset eq 'l') {
	298	if (! $target_has_latin1 && ! $pattern_has_latin1) {
	299	next if defined $has_tested_latin1_l && $has_tested_latin1_l != $test;
	300	$has_tested_latin1_l = $test;
	301	}
	302	}
2f7f8cb1	303
a2d9a01a KW	304	foreach my $utf8_target (0, 1) { # Both utf8 and not, for
	305	# code points < 256
	306	my $upgrade_target = "";
	307
	308	# These must already be in utf8 because the string to match has
	309	# something above latin1. So impossible to test if to not to be in
	310	# utf8; and otherwise, no upgrade is needed.
	311	next if $target_above_latin1 && ! $utf8_target;
d08723ac	312	$upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target;
a2d9a01a	313
d08723ac KW	314	foreach my $utf8_pattern (0, 1) {
d08723ac KW	315	next if $pattern_above_latin1 && ! $utf8_pattern;
17580e7a KW	316
	317	# Our testing of 'l' uses the POSIX locale, which is ASCII-only
	318	my $uni_semantics = $charset ne 'l' && ($utf8_target \|\| $charset eq 'u' \|\| ($charset eq 'd' && $utf8_pattern) \|\| $charset =~ /a/);
a2d9a01a	319	my $upgrade_pattern = "";
d08723ac	320	$upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern;
a2d9a01a KW	321
	322	my $lhs = join "", @x_target;
	323	my @rhs = @x_pattern;
371a505e	324	my $rhs = join "", @rhs;
2f7f8cb1	325	my $should_fail = (! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self)
17580e7a KW	326	\|\| ($charset eq 'aa' && $target_has_ascii != $pattern_has_ascii)
17580e7a KW	327	\|\| ($charset eq 'l' && $target_has_latin1 != $pattern_has_latin1);
371a505e KW	328
	329	# Do simple tests of referencing capture buffers, named and
	330	# numbered.
	331	my $op = '=~';
	332	$op = '!~' if $should_fail;
d2025f57	333
7a0a13a6 KW	334	# I'm afraid this was derived from trial and error.
	335	my $todo = ($test == 0xdf
	336	&& $lhs =~ /DF/
	337	&& $uni_semantics
	338	&& ($charset eq 'u' \|\| $charset eq 'd')
	339	&& ! ($charset eq 'u' && (($upgrade_target eq "") != ($upgrade_pattern eq "")))
	340	&& ! ($charset eq 'd' && (! $upgrade_target \|\| ! $upgrade_pattern))
	341	);
371a505e	342	my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
7a0a13a6	343	push @eval_tests, format_test($eval, ++$count, $todo, "");
abf4d645	344
371a505e	345	$eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
7a0a13a6	346	push @eval_tests, format_test($eval, ++$count, $todo, "");
abf4d645	347
371a505e KW	348	if ($lhs ne $rhs) {
371a505e KW	349	$eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
7a0a13a6	350	push @eval_tests, format_test($eval, ++$count, "", "");
abf4d645	351
371a505e	352	$eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p";
7a0a13a6	353	push @eval_tests, format_test($eval, ++$count, "", "");
371a505e	354	}
371a505e	355
1ef17b72	356	foreach my $bracketed (0, 1) { # Put rhs in [...], or not
a2d9a01a	357	foreach my $inverted (0,1) {
d2025f57	358	next if $inverted && ! $bracketed; # inversion only valid in [^...]
a2d9a01a KW	359
	360	# In some cases, add an extra character that doesn't fold, and
	361	# looks ok in the output.
	362	my $extra_char = "_";
	363	foreach my $prepend ("", $extra_char) {
	364	foreach my $append ("", $extra_char) {
a2d9a01a KW	365
	366	# Assemble the rhs. Put each character in a separate
	367	# bracketed if using charclasses. This creates a stress on
	368	# the code to span a match across multiple elements
	369	my $rhs = "";
	370	foreach my $rhs_char (@rhs) {
	371	$rhs .= '[' if $bracketed;
	372	$rhs .= '^' if $inverted;
	373	$rhs .= $rhs_char;
	374
	375	# Add a character to the class, so class doesn't get
	376	# optimized out
	377	$rhs .= '_]' if $bracketed;
	378	}
	379
	380	# Add one of: no capturing parens
	381	# a single set
	382	# a nested set
	383	# Use quantifiers and extra variable width matches inside
	384	# them to keep some optimizations from happening
	385	foreach my $parend (0, 1, 2) {
	386	my $interior = (! $parend)
	387	? $rhs
	388	: ($parend == 1)
	389	? "(${rhs},?)"
	390	: "((${rhs})+,?)";
	391	foreach my $quantifier ("", '?', '*', '+', '{1,3}') {
	392
	393	# A ? or * quantifier normally causes the thing to be
	394	# able to match a null string
	395	my $quantifier_can_match_null = $quantifier eq '?' \|\| $quantifier eq '*';
	396
	397	# But since we only quantify the last character in a
	398	# multiple fold, the other characters will have width,
	399	# except if we are quantifying the whole rhs
	400	my $can_match_null = $quantifier_can_match_null && (@rhs == 1 \|\| $parend);
	401
	402	foreach my $l_anchor ("", '^') { # '\A' didn't change result)
	403	foreach my $r_anchor ("", '$') { # '\Z', '\z' didn't change result)
	404
	405	# The folded part can match the null string if it
	406	# isn't required to have width, and there's not
	407	# something on one or both sides that force it to.
2f7f8cb1 KW	408	my $both_sides = ($l_anchor && $r_anchor) \|\| ($l_anchor && $append) \|\| ($r_anchor && $prepend) \|\| ($prepend && $append);
	409	my $must_match = ! $can_match_null \|\| $both_sides;
	410	# for performance, but doing this missed many failures
a2d9a01a	411	#next unless $must_match;
d08723ac	412	my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)";
a2d9a01a	413	my $op;
d08723ac	414	if ($must_match && $should_fail) {
a2d9a01a KW	415	$op = 0;
	416	} else {
	417	$op = 1;
	418	}
	419	$op = ! $op if $must_match && $inverted;
27f6057f KW	420
	421	if ($inverted && @target > 1) {
	422	# When doing an inverted match against a
	423	# multi-char target, and there is not something on
	424	# the left to anchor the match, if it shouldn't
	425	# succeed, skip, as what will happen (when working
	426	# correctly) is that it will match the first
	427	# position correctly, and then be inverted to not
	428	# match; then it will go to the second position
	429	# where it won't match, but get inverted to match,
	430	# and hence succeeding.
	431	next if ! ($l_anchor \|\| $prepend) && ! $op;
	432
	433	# Can't ever match for latin1 code points non-uni
	434	# semantics that have a inverted multi-char fold
	435	# when there is something on both sides and the
	436	# quantifier isn't such as to span the required
	437	# width, which is 2 or 3.
	438	$op = 0 if $ord < 255
	439	&& ! $uni_semantics
	440	&& $both_sides
	441	&& ( ! $quantifier \|\| $quantifier eq '?')
	442	&& $parend < 2;
	443
	444	# Similarly can't ever match when inverting a multi-char
	445	# fold for /aa and the quantifier isn't sufficient
	446	# to allow it to span to both sides.
	447	$op = 0 if $target_has_ascii && $charset eq 'aa' && $both_sides && ( ! $quantifier \|\| $quantifier eq '?') && $parend < 2;
	448
	449	# Or for /l
	450	$op = 0 if $target_has_latin1 && $charset eq 'l' && $both_sides && ( ! $quantifier \|\| $quantifier eq '?') && $parend < 2;
	451	}
	452
a2d9a01a KW	453	$op = ($op) ? '=~' : '!~';
a2d9a01a KW	454
abf4d645 KW	455	my $debug .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anchor=$l_anchor, r_anchor=$r_anchor";
	456	$debug .= "; pattern_above_latin1=$pattern_above_latin1; utf8_pattern=$utf8_pattern";
	457	my $eval = "my \$c = \"$prepend$lhs$append\"; my \$p = qr/$quantified/i;$upgrade_target$upgrade_pattern \$c $op \$p";
a2d9a01a	458
abf4d645	459	# XXX Doesn't currently test multi-char folds in pattern
a2d9a01a	460	next if @pattern != 1;
7a0a13a6	461	push @eval_tests, format_test($eval, ++$count, "", $debug);
a2d9a01a KW	462
	463	# Group tests
	464	if (@eval_tests >= $clump_execs) {
abf4d645	465	#eval "use re qw(Debug COMPILE EXECUTE);" . join ";\n", @eval_tests;
a2d9a01a	466	eval join ";\n", @eval_tests;
abf4d645 KW	467	if ($@) {
	468	fail($@);
	469	exit 1;
	470	}
a2d9a01a KW	471	undef @eval_tests;
	472	}
	473	}
	474	}
	475	}
	476	}
	477	}
	478	}
	479	}
	480	}
	481	}
	482	}
	483	}
	484	}
	485	}
	486
	487	# Finish up any tests not already done
	488	eval join ";\n", @eval_tests;
abf4d645 KW	489	if ($@) {
	490	fail($@);
	491	exit 1;
	492	}
a2d9a01a	493
abf4d645	494	plan($count);
a2d9a01a KW	495
a2d9a01a KW	496	1