+my %has_test_by_participants; # Ensures there are tests for each range and
+ # each number of characters that fold to the
+ # same thing
+my %has_test_by_byte_count; # Ensures there are tests for each combination
+ # of n bytes folding to m bytes
+
+my %tests; # The set of tests.
+# Each key is a code point that folds to something else.
+# Each value is a list of things that the key folds to. If the 'thing' is a
+# single code point, it is that ordinal. If it is a multi-char fold, it is an
+# ordered list of the code points in that fold. Here's an example for 'S':
+# '83' => [ 115, 383 ]
+#
+# And one for a multi-char fold: \xDF
+# 223 => [
+# [ # 'SS'
+# 83,
+# 83
+# ],
+# [ # 'ss'
+# 115,
+# 115
+# ],
+# [ # LATIN SMALL LETTER LONG S
+# 383,
+# 383
+# ],
+# 7838 # LATIN_CAPITAL_LETTER_SHARP_S
+# ],
+
+my %inverse_folds; # keys are strings of the folded-to;
+ # values are lists of characters that fold to them
+
+sub add_test($@) {
+    my ($to, @from) = @_;
+
+    # Called to cause the input to be tested by adding to %tests.
+    #
+    #   $to    The folded-to string (one or more characters).
+    #   @from  The characters that fold to $to.  It should be sorted so that
+    #          the lowest code point is first; that code point becomes the
+    #          key used in %tests.
+    #
+    # The input is in string form; %tests uses code points, so everything
+    # stored there is converted with ord().  As a side effect,
+    # %has_test_by_byte_count is updated for every (bytes in a from-string,
+    # bytes in $to) pair handled here.
+    #
+    # Dies if @from is empty, or if %tests already has an entry for the
+    # lowest code point in @from.
+
+    # Without this check, the 'ord shift @from' further down would quietly
+    # turn the missing character into code point 0 (with only an
+    # "uninitialized" warning) and file the tests under NUL.
+    if (! @from) {
+        die "add_test() was given no characters that fold to "
+          . join(" ", map { sprintf "U+%04X", ord $_ } split //, $to);
+    }
+
+    my @to_chars = split //, $to;   # The individual characters of $to
+    my @test_to;                    # List of tests for $to
+
+    if (@to_chars == 1) {
+        @test_to = ord $to;
+    }
+    else {
+
+        # The fold itself, as an ordered list of code points.
+        push @test_to, [ map { ord $_ } @to_chars ];
+
+        # For multi-char folds, we also test that things that can fold to
+        # each individual character in the fold also work.  If we were
+        # testing comprehensively, we would try every combination of upper
+        # and lower case in the fold, but it will have to suffice to avoid
+        # running forever to make sure that each thing that folds to these
+        # is tested at least once.  Because of complement matching, we need
+        # to do both the folded, and the folded-from.
+        #
+        # We first look at each character in the multi-char fold, and save
+        # how many characters fold to it; and also the maximum number of
+        # such folds.
+        my @folds_to_count;     # 0th char in fold is index 0 ...
+        my $max_folds_to = 0;
+
+        for my $to_char (@to_chars) {
+
+            # Guard with exists() so that merely looking does not
+            # autovivify an entry in %inverse_folds.
+            my $count = exists $inverse_folds{$to_char}
+                        ? scalar @{ $inverse_folds{$to_char} }
+                        : 0;
+            push @folds_to_count, $count;
+            $max_folds_to = $count if $max_folds_to < $count;
+        }
+
+        # We will need to generate as many tests as the maximum number of
+        # folds, so that each fold will have at least one test.
+        for my $i (0 .. $max_folds_to - 1) {
+            my @this_test_to;   # Assemble a single test
+
+            # For each character in the multi-char fold ...
+            for my $j (0 .. $#to_chars) {
+                my $this_char = $to_chars[$j];
+
+                # Use its corresponding inverse fold, if available; or else
+                # itself.
+                push @this_test_to, ($i < $folds_to_count[$j])
+                                    ? ord($inverse_folds{$this_char}[$i])
+                                    : ord($this_char);
+            }
+
+            # Add this test to the list.  @this_test_to is a fresh array on
+            # every iteration, so storing a reference to it is safe.
+            push @test_to, \@this_test_to;
+        }
+
+        # Here, have assembled all the tests for the multi-char fold.  Sort
+        # so lowest code points are first for consistency and aesthetics in
+        # output.  We know there are at least two characters in the fold,
+        # but I haven't bothered to worry about sorting on an optional third
+        # character if the first two are identical.
+        @test_to = sort {    $a->[0] <=> $b->[0]
+                          || $a->[1] <=> $b->[1]
+                        } @test_to;
+    }
+
+    # This test is from n bytes to m bytes.  Record that so won't try to add
+    # another test that does the same.  The bare block confines 'use bytes'
+    # (which makes length() count bytes rather than characters) to just
+    # these statements.
+    {
+        use bytes;
+        my $to_bytes = length $to;
+        for my $from_map (@from) {
+            $has_test_by_byte_count{length $from_map}{$to_bytes} = $to;
+        }
+    }
+
+    # The lowest code point of the equivalence class is the key in %tests.
+    my $ord_smallest_from = ord shift @from;
+    if (exists $tests{$ord_smallest_from}) {
+        die "There are already tests for $ord_smallest_from";
+    }
+
+    # Add in the fold tests, then any remaining froms in the equivalence
+    # class.
+    push @{ $tests{$ord_smallest_from} }, @test_to, map { ord $_ } @from;
+}
+
+# Read the Unicode rules file and construct inverse mappings from it