X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1675ea0da1956ed6edbfd08e4b0421ce28567b74..2e2778b218a143b698b7844d136cb7df300c9ab3:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 0ee1b2f..58f3678 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -608,6 +608,7 @@ sub uniques { $0 = File::Spec->canonpath($0); my $make_test_script = 0; # ? Should we output a test script +my $make_norm_test_script = 0; # ? Should we output a normalization test script my $write_unchanged_files = 0; # ? Should we update the output files even if # we don't think they have changed my $use_directory = ""; # ? Should we chdir somewhere. @@ -668,6 +669,10 @@ while (@ARGV) { { $make_test_script = 1; } + elsif ($arg eq '-makenormtest') + { + $make_norm_test_script = 1; + } elsif ($arg eq '-makelist') { $make_list = 1; } @@ -833,6 +838,7 @@ if ($v_version ge v5.2.0) { # Enum values for to_output_map() method in the Map_Table package. my $EXTERNAL_MAP = 1; my $INTERNAL_MAP = 2; +my $OUTPUT_ADJUSTED = 3; # To override computed values for writing the map tables for these properties. # The default for enum map tables is to write them out, so that the Unicode @@ -1244,6 +1250,7 @@ my $INTEGER_FORMAT = 'i'; my $HEX_FORMAT = 'x'; my $RATIONAL_FORMAT = 'r'; my $STRING_FORMAT = 's'; +my $ADJUST_FORMAT = 'a'; my $DECOMP_STRING_FORMAT = 'c'; my $STRING_WHITE_SPACE_LIST = 'sw'; @@ -1255,6 +1262,7 @@ my %map_table_formats = ( $HEX_FORMAT => 'non-negative hex whole number; a code point', $RATIONAL_FORMAT => 'rational: an integer or a fraction', $STRING_FORMAT => 'string', + $ADJUST_FORMAT => 'some entries need adjustment', $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' ); @@ -1342,6 +1350,9 @@ my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes # anonymous hash. my @code_points_ending_in_code_point; +# To hold Unicode's normalization test suite +my @normalization_tests; + # Boolean: does this Unicode version have the hangul syllables, and are we # writing out a table for them? my $has_hangul_syllables = 0; @@ -2171,6 +2182,7 @@ sub trace { return main::trace(@_); } fallback => 0, qw("") => "_operator_stringify", "." => \&main::_operator_dot, + ".=" => \&main::_operator_dot_equal, ; sub _operator_stringify { @@ -2848,6 +2860,7 @@ sub trace { return main::trace(@_); } fallback => 0, qw("") => "_operator_stringify", "." => \&main::_operator_dot, + ".=" => \&main::_operator_dot_equal, ; sub _operator_stringify { @@ -2998,6 +3011,7 @@ sub trace { return main::trace(@_); } fallback => 0, qw("") => "_operator_stringify", "." => \&main::_operator_dot, + ".=" => \&main::_operator_dot_equal, ; sub _operator_stringify { @@ -3014,7 +3028,7 @@ sub trace { return main::trace(@_); } # either a constructor or a method. If called as a method, the result # will be a new() instance of the calling object, containing the union # of that object with the other parameter's code points; if called as - # a constructor, the first parameter gives the class the new object + # a constructor, the first parameter gives the class that the new object # should be, and the second parameter gives the code points to go into # it. # In either case, there are two parameters looked at by this routine; @@ -3026,14 +3040,20 @@ sub trace { return main::trace(@_); } # just a single code point. # # If they are ranges, this routine doesn't make any effort to preserve - # the range values of one input over the other. Therefore this base - # class should not allow _union to be called from other than + # the range values and types of one input over the other. Therefore + # this base class should not allow _union to be called from other than # initialization code, so as to prevent two tables from being added # together where the range values matter. The general form of this # routine therefore belongs in a derived class, but it was moved here # to avoid duplication of code. The failure to overload this in this # class keeps it safe. # + # It does make the effort during initialization to accept tables with + # multiple values for the same code point, and to preserve the order + # of these. If there is only one input range or range set, it doesn't + # sort (as it should already be sorted to the desired order), and will + # accept multiple values per code point. Otherwise it will merge + # multiple values into a single one. my $self; my @args; # Arguments to pass to the constructor @@ -3054,6 +3074,7 @@ sub trace { return main::trace(@_); } # Accumulate all records from both lists. my @records; + my $input_count = 0; for my $arg (@args) { #local $to_trace = 0 if main::DEBUG; trace "argument = $arg" if main::DEBUG && $to_trace; @@ -3066,18 +3087,22 @@ sub trace { return main::trace(@_); } Carp::my_carp_bug($message .= "Undefined argument to _union. No union done."); return; } + $arg = [ $arg ] if ! ref $arg; my $type = ref $arg; if ($type eq 'ARRAY') { foreach my $element (@$arg) { push @records, Range->new($element, $element); + $input_count++; } } elsif ($arg->isa('Range')) { push @records, $arg; + $input_count++; } elsif ($arg->can('ranges')) { push @records, $arg->ranges; + $input_count++; } else { my $message = ""; @@ -3093,13 +3118,15 @@ sub trace { return main::trace(@_); } # Sort with the range containing the lowest ordinal first, but if # two ranges start at the same code point, sort with the bigger range # of the two first, because it takes fewer cycles. - @records = sort { ($a->start <=> $b->start) + if ($input_count > 1) { + @records = sort { ($a->start <=> $b->start) or # if b is shorter than a, b->end will be # less than a->end, and we want to select # a, so want to return -1 ($b->end <=> $a->end) } @records; + } my $new = $class->new(@_); @@ -3107,12 +3134,20 @@ sub trace { return main::trace(@_); } for my $set (@records) { my $start = $set->start; my $end = $set->end; - my $value = $set->value; + my $value = $set->value; + my $type = $set->type; if ($start > $new->max) { - $new->_add_delete('+', $start, $end, $value); + $new->_add_delete('+', $start, $end, $value, Type => $type); } elsif ($end > $new->max) { - $new->_add_delete('+', $new->max +1, $end, $value); + $new->_add_delete('+', $new->max +1, $end, $value, + Type => $type); + } + elsif ($input_count == 1) { + # Here, overlaps existing range, but is from a single input, + # so preserve the multiple values from that input. + $new->_add_delete('+', $start, $end, $value, Type => $type, + Replace => $MULTIPLE_AFTER); } } @@ -3349,7 +3384,7 @@ sub trace { return main::trace(@_); } # new and old values are identical, the # replacement is skipped to save cycles # => $IF_NOT_EQUIVALENT means to replace the existing values - # with this one if they are not equivalent. + # (the default) with this one if they are not equivalent. # Ranges are equivalent if their types are the # same, and they are the same string; or if # both are type 0 ranges, if their Unicode @@ -3370,9 +3405,17 @@ sub trace { return main::trace(@_); } # multiple times. They are stored LIFO, so # that the final one inserted is the first one # returned in an ordered search of the table. + # If this is an exact duplicate, including the + # value, the original will be moved to be + # first, before any other duplicate ranges + # with different values. # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored # FIFO, so that this one is inserted after all - # others that currently exist. + # others that currently exist. If this is an + # exact duplicate, including value, of an + # existing range, this one is discarded + # (leaving the existing one in its original, + # higher priority position # => anything else is the same as => $IF_NOT_EQUIVALENT # # "same value" means identical for non-type-0 ranges, and it means @@ -3412,6 +3455,9 @@ sub trace { return main::trace(@_); } Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); return; } + if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') { + Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway"); + } #local $to_trace = 1 if main::DEBUG; if ($operation eq '-') { @@ -3653,12 +3699,34 @@ sub trace { return main::trace(@_); } } # If to place this new record after, move to beyond all existing - # ones. + # ones; but don't add this one if identical to any of them, as it + # isn't really a multiple. This leaves the original order, so + # that the current request is ignored. The reasoning is that the + # previous request that wanted this record to have high priority + # should have precedence. if ($replace == $MULTIPLE_AFTER) { while ($i < @$r && $r->[$i]->start == $start) { + return if $value eq $r->[$i]->value + && $type eq $r->[$i]->type; $i++; } } + else { + # If instead we are to place this new record before any + # existing ones, remove any identical ones that come after it. + # This changes the existing order so that the new one is + # first, as is being requested. + for (my $j = $i + 1; + $j < @$r && $r->[$j]->start == $start; + $j++) + { + if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) { + splice @$r, $j, 1; + last; # There should only be one instance, so no + # need to keep looking + } + } + } trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace; my @return = splice @$r, @@ -4120,6 +4188,21 @@ sub trace { return main::trace(@_); } return $self->_union($other) }, + '+=' => sub { my $self = shift; + my $other = shift; + my $reversed = shift; + + if ($reversed) { + Carp::my_carp_bug("Bad news. Can't cope with '" + . ref($other) + . ' += ' + . ref($self) + . "'. undef returned."); + return; + } + + return $self->_union($other) + }, '&' => sub { my $self = shift; my $other = shift; @@ -4806,6 +4889,7 @@ END use overload fallback => 0, "." => \&main::_operator_dot, + ".=" => \&main::_operator_dot_equal, '!=' => \&main::_operator_not_equal, '==' => \&main::_operator_equal, ; @@ -4840,7 +4924,9 @@ END my $status = delete $args{'Status'}; $status = $NORMAL unless defined $status; - my $ucd = delete $args{'UCD'} // 1; + # An internal name does not get documented, unless overridden by the + # input. + my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1); Carp::carp_extra_args(\%args) if main::DEBUG && %args; @@ -5132,9 +5218,11 @@ END # Write a representation of the table to its file. It calls several # functions furnished by sub-classes of this abstract base class to # handle non-normal ranges, to add stuff before the table, and at its - # end. + # end. If the table is to be written so that adjustments are + # required, this does that conversion. my $self = shift; + my $use_adjustments = shift; # ? output in adjusted format or not my $tab_stops = shift; # The number of tab stops over to put any # comment. my $suppress_value = shift; # Optional, if the value associated with @@ -5203,7 +5291,7 @@ END if ($annotate) { - # if annotating each code point, must print 1 per line. + # If annotating each code point, must print 1 per line. # The variable could point to a subroutine, and we don't want # to lose that fact, so only set if not set already $range_size_1 = 1 if ! $range_size_1; @@ -5221,6 +5309,21 @@ END ); } + # Values for previous time through the loop. Initialize to + # something that won't be adjacent to the first iteration; + # only $previous_end matters for that. + my $previous_start; + my $previous_end = -2; + my $previous_value; + + # Values for next time through the portion of the loop that splits + # the range. 0 in $next_start means there is no remaining portion + # to deal with. + my $next_start = 0; + my $next_end; + my $next_value; + my $offset = 0; + # Output each range as part of the here document. RANGE: for my $set ($range_list{$addr}->ranges) { @@ -5236,189 +5339,274 @@ END next RANGE if defined $suppress_value && $value eq $suppress_value; - # If there is a range and doesn't need a single point range - # output - if ($start != $end && ! $range_size_1) { - push @OUT, sprintf "%04X\t%04X", $start, $end; - $OUT[-1] .= "\t$value" if $value ne ""; - - # Add a comment with the size of the range, if requested. - # Expand Tabs to make sure they all start in the same - # column, and then unexpand to use mostly tabs. - if (! $output_range_counts{$addr}) { - $OUT[-1] .= "\n"; + { # This bare block encloses the scope where we may need to + # split a range (when outputting adjusteds), and each time + # through we handle the next portion of the original by + # ending the block with a 'redo'. The values to use for + # that next time through are set up just below in the + # scalars whose names begin with '$next_'. + + if ($use_adjustments) { + + # When converting to use adjustments, we can handle + # only single element ranges. Set up so that this + # time through the loop, we look at the first element, + # and the next time through, we start off with the + # remainder. Thus each time through we look at the + # first element of the range + if ($end != $start) { + $next_start = $start + 1; + $next_end = $end; + $next_value = $value; + $end = $start; + } + + # The values for some of these tables are stored as + # hex strings. Convert those to decimal + $value = hex($value) + if $self->default_map eq $CODE_POINT + && $value =~ / ^ [A-Fa-f0-9]+ $ /x; + + # If this range is adjacent to the previous one, and + # the values in each are integers that are also + # adjacent (differ by 1), then this range really + # extends the previous one that is already in element + # $OUT[-1]. So we pop that element, and pretend that + # the range starts with whatever it started with. + # $offset is incremented by 1 each time so that it + # gives the current offset from the first element in + # the accumulating range, and we keep in $value the + # value of that first element. + if ($start == $previous_end + 1 + && $value =~ /^ -? \d+ $/xa + && $previous_value =~ /^ -? \d+ $/xa + && ($value == ($previous_value + ++$offset))) + { + pop @OUT; + $start = $previous_start; + $value = $previous_value; + } + else { + $offset = 0; + } + + # Save the current values for the next time through + # the loop. + $previous_start = $start; + $previous_end = $end; + $previous_value = $value; } - else { - $OUT[-1] = Text::Tabs::expand($OUT[-1]); - my $count = main::clarify_number($end - $start + 1); - use integer; - - my $width = $tab_stops * 8 - 1; - $OUT[-1] = sprintf("%-*s # [%s]\n", - $width, - $OUT[-1], - $count); - $OUT[-1] = Text::Tabs::unexpand($OUT[-1]); + + # If there is a range and doesn't need a single point range + # output + if ($start != $end && ! $range_size_1) { + push @OUT, sprintf "%04X\t%04X", $start, $end; + $OUT[-1] .= "\t$value" if $value ne ""; + + # Add a comment with the size of the range, if + # requested. Expand Tabs to make sure they all start + # in the same column, and then unexpand to use mostly + # tabs. + if (! $output_range_counts{$addr}) { + $OUT[-1] .= "\n"; + } + else { + $OUT[-1] = Text::Tabs::expand($OUT[-1]); + my $count = main::clarify_number($end - $start + 1); + use integer; + + my $width = $tab_stops * 8 - 1; + $OUT[-1] = sprintf("%-*s # [%s]\n", + $width, + $OUT[-1], + $count); + $OUT[-1] = Text::Tabs::unexpand($OUT[-1]); + } } - next RANGE; - } - # Here to output a single code point per line + # Here to output a single code point per line. + # If not to annotate, use the simple formats + elsif (! $annotate) { - # If not to annotate, use the simple formats - if (! $annotate) { + # Use any passed in subroutine to output. + if (ref $range_size_1 eq 'CODE') { + for my $i ($start .. $end) { + push @OUT, &{$range_size_1}($i, $value); + } + } + else { - # Use any passed in subroutine to output. - if (ref $range_size_1 eq 'CODE') { - for my $i ($start .. $end) { - push @OUT, &{$range_size_1}($i, $value); + # Here, caller is ok with default output. + for (my $i = $start; $i <= $end; $i++) { + push @OUT, sprintf "%04X\t\t%s\n", $i, $value; + } } } else { - # Here, caller is ok with default output. + # Here, wants annotation. for (my $i = $start; $i <= $end; $i++) { - push @OUT, sprintf "%04X\t\t%s\n", $i, $value; - } - } - next RANGE; - } - # Here, wants annotation. - for (my $i = $start; $i <= $end; $i++) { - - # Get character information if don't have it already - main::populate_char_info($i) - if ! defined $viacode[$i]; - my $type = $annotate_char_type[$i]; - - # Figure out if should output the next code points as part - # of a range or not. If this is not in an annotation - # range, then won't output as a range, so returns $i. - # Otherwise use the end of the annotation range, but no - # further than the maximum possible end point of the loop. - my $range_end = main::min($annotate_ranges->value_of($i) - || $i, - $end); - - # Use a range if it is a range, and either is one of the - # special annotation ranges, or the range is at most 3 - # long. This last case causes the algorithmically named - # code points to be output individually in spans of at - # most 3, as they are the ones whose $type is > 0. - if ($range_end != $i - && ( $type < 0 || $range_end - $i > 2)) - { - # Here is to output a range. We don't allow a - # caller-specified output format--just use the - # standard one. - push @OUT, sprintf "%04X\t%04X\t%s\t#", $i, + # Get character information if don't have it already + main::populate_char_info($i) + if ! defined $viacode[$i]; + my $type = $annotate_char_type[$i]; + + # Figure out if should output the next code points + # as part of a range or not. If this is not in an + # annotation range, then won't output as a range, + # so returns $i. Otherwise use the end of the + # annotation range, but no further than the + # maximum possible end point of the loop. + my $range_end = main::min( + $annotate_ranges->value_of($i) || $i, + $end); + + # Use a range if it is a range, and either is one + # of the special annotation ranges, or the range + # is at most 3 long. This last case causes the + # algorithmically named code points to be output + # individually in spans of at most 3, as they are + # the ones whose $type is > 0. + if ($range_end != $i + && ( $type < 0 || $range_end - $i > 2)) + { + # Here is to output a range. We don't allow a + # caller-specified output format--just use the + # standard one. + push @OUT, sprintf "%04X\t%04X\t%s\t#", $i, $range_end, $value; - my $range_name = $viacode[$i]; - - # For the code points which end in their hex value, we - # eliminate that from the output annotation, and - # capitalize only the first letter of each word. - if ($type == $CP_IN_NAME) { - my $hex = sprintf "%04X", $i; - $range_name =~ s/-$hex$//; - my @words = split " ", $range_name; - for my $word (@words) { - $word = ucfirst(lc($word)) if $word ne 'CJK'; - } - $range_name = join " ", @words; - } - elsif ($type == $HANGUL_SYLLABLE) { - $range_name = "Hangul Syllable"; - } + my $range_name = $viacode[$i]; + + # For the code points which end in their hex + # value, we eliminate that from the output + # annotation, and capitalize only the first + # letter of each word. + if ($type == $CP_IN_NAME) { + my $hex = sprintf "%04X", $i; + $range_name =~ s/-$hex$//; + my @words = split " ", $range_name; + for my $word (@words) { + $word = + ucfirst(lc($word)) if $word ne 'CJK'; + } + $range_name = join " ", @words; + } + elsif ($type == $HANGUL_SYLLABLE) { + $range_name = "Hangul Syllable"; + } - $OUT[-1] .= " $range_name" if $range_name; + $OUT[-1] .= " $range_name" if $range_name; - # Include the number of code points in the range - my $count = main::clarify_number($range_end - $i + 1); - $OUT[-1] .= " [$count]\n"; + # Include the number of code points in the + # range + my $count = + main::clarify_number($range_end - $i + 1); + $OUT[-1] .= " [$count]\n"; - # Skip to the end of the range - $i = $range_end; - } - else { # Not in a range. - my $comment = ""; - - # When outputting the names of each character, use - # the character itself if printable - $comment .= "'" . chr($i) . "' " if $printable[$i]; - - # To make it more readable, use a minimum indentation - my $comment_indent; - - # Determine the annotation - if ($format eq $DECOMP_STRING_FORMAT) { - - # This is very specialized, with the type of - # decomposition beginning the line enclosed in - # <...>, and the code points that the code point - # decomposes to separated by blanks. Create two - # strings, one of the printable characters, and - # one of their official names. - (my $map = $value) =~ s/ \ * < .*? > \ +//x; - my $tostr = ""; - my $to_name = ""; - my $to_chr = ""; - foreach my $to (split " ", $map) { - $to = CORE::hex $to; - $to_name .= " + " if $to_name; - $to_chr .= chr($to); - main::populate_char_info($to) - if ! defined $viacode[$to]; - $to_name .= $viacode[$to]; + # Skip to the end of the range + $i = $range_end; } + else { # Not in a range. + my $comment = ""; + + # When outputting the names of each character, + # use the character itself if printable + $comment .= "'" . chr($i) . "' " + if $printable[$i]; + + # To make it more readable, use a minimum + # indentation + my $comment_indent; + + # Determine the annotation + if ($format eq $DECOMP_STRING_FORMAT) { + + # This is very specialized, with the type + # of decomposition beginning the line + # enclosed in <...>, and the code points + # that the code point decomposes to + # separated by blanks. Create two + # strings, one of the printable + # characters, and one of their official + # names. + (my $map = $value) =~ s/ \ * < .*? > \ +//x; + my $tostr = ""; + my $to_name = ""; + my $to_chr = ""; + foreach my $to (split " ", $map) { + $to = CORE::hex $to; + $to_name .= " + " if $to_name; + $to_chr .= chr($to); + main::populate_char_info($to) + if ! defined $viacode[$to]; + $to_name .= $viacode[$to]; + } - $comment .= + $comment .= "=> '$to_chr'; $viacode[$i] => $to_name"; - $comment_indent = 25; # Determined by experiment - } - else { - - # Assume that any table that has hex format is a - # mapping of one code point to another. - if ($format eq $HEX_FORMAT) { - my $decimal_value = CORE::hex $value; - main::populate_char_info($decimal_value) + $comment_indent = 25; # Determined by + # experiment + } + else { + + # Assume that any table that has hex + # format is a mapping of one code point to + # another. + if ($format eq $HEX_FORMAT) { + my $decimal_value = CORE::hex $value; + main::populate_char_info($decimal_value) if ! defined $viacode[$decimal_value]; - $comment .= "=> '" - . chr($decimal_value) - . "'; " if $printable[$decimal_value]; - } - $comment .= $viacode[$i] if $include_name - && $viacode[$i]; - if ($format eq $HEX_FORMAT) { - my $decimal_value = CORE::hex $value; - $comment .= " => $viacode[$decimal_value]" - if $viacode[$decimal_value]; - } + $comment .= "=> '" + . chr($decimal_value) + . "'; " if $printable[$decimal_value]; + } + $comment .= $viacode[$i] if $include_name + && $viacode[$i]; + if ($format eq $HEX_FORMAT) { + my $decimal_value = CORE::hex $value; + $comment .= + " => $viacode[$decimal_value]" + if $viacode[$decimal_value]; + } - # If including the name, no need to indent, as the - # name will already be way across the line. - $comment_indent = ($include_name) ? 0 : 60; - } + # If including the name, no need to + # indent, as the name will already be way + # across the line. + $comment_indent = ($include_name) ? 0 : 60; + } - # Use any passed in routine to output the base part of - # the line. - if (ref $range_size_1 eq 'CODE') { - my $base_part = &{$range_size_1}($i, $value); - chomp $base_part; - push @OUT, $base_part; - } - else { - push @OUT, sprintf "%04X\t\t%s", $i, $value; + # Use any passed in routine to output the base + # part of the line. + if (ref $range_size_1 eq 'CODE') { + my $base_part=&{$range_size_1}($i, $value); + chomp $base_part; + push @OUT, $base_part; + } + else { + push @OUT, sprintf "%04X\t\t%s", $i, $value; + } + + # And add the annotation. + $OUT[-1] = sprintf "%-*s\t# %s", + $comment_indent, + $OUT[-1], + $comment + if $comment; + $OUT[-1] .= "\n"; + } } + } - # And add the annotation. - $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent, - $OUT[-1], - $comment if $comment; - $OUT[-1] .= "\n"; + # If we split the range, set up so the next time through + # we get the remainder, and redo. + if ($next_start) { + $start = $next_start; + $end = $next_end; + $value = $next_value; + $next_start = 0; + redo; } } } # End of loop through all the table's ranges @@ -5642,7 +5830,7 @@ sub trace { return main::trace(@_); } 'readable_array'); my %to_output_map; - # Enum as to whether or not to write out this map table: + # Enum as to whether or not to write out this map table, and how: # 0 don't output # $EXTERNAL_MAP means its existence is noted in the documentation, and # it should not be removed nor its format changed. This @@ -5650,9 +5838,16 @@ sub trace { return main::trace(@_); } # output. # $INTERNAL_MAP means Perl reserves the right to do anything it wants # with this file + # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of + # outputting the actual mappings as-is, we adjust things + # to create a much more compact table. Only those few + # tables where the mapping is convertible at least to an + # integer and compacting makes a big difference should + # have this. Hence, the default is to not do this + # unless the table's default mapping is to $CODE_POINT, + # and the range size is not 1. main::set_access('to_output_map', \%to_output_map, 's'); - sub new { my $class = shift; my $name = shift; @@ -5665,6 +5860,7 @@ sub trace { return main::trace(@_); } my $default_map = delete $args{'Default_Map'}; my $property = delete $args{'_Property'}; my $full_name = delete $args{'Full_Name'}; + my $to_output_map = delete $args{'To_Output_Map'}; # Rest of parameters passed on @@ -5682,6 +5878,7 @@ sub trace { return main::trace(@_); } $anomalous_entries{$addr} = []; $default_map{$addr} = $default_map; + $to_output_map{$addr} = $to_output_map; $self->initialize($initialize) if defined $initialize; @@ -5865,8 +6062,16 @@ sub trace { return main::trace(@_); } # Don't want to output binary map tables even for debugging. return 0 if $type == $BINARY; - # But do want to output string ones. - return $EXTERNAL_MAP if $type == $STRING; + # But do want to output string ones. All the ones that remain to + # be dealt with (i.e. which haven't explicitly been set to external) + # are for internal Perl use only. The default for those that map to + # $CODE_POINT and haven't been restricted to a single element range + # is to use the adjusted form. + if ($type == $STRING) { + return $INTERNAL_MAP if $self->range_size_1 + || $default_map{$addr} ne $CODE_POINT; + return $OUTPUT_ADJUSTED; + } # Otherwise is an $ENUM, do output it, for Perl's purposes return $INTERNAL_MAP; @@ -5890,11 +6095,11 @@ sub trace { return main::trace(@_); } my $return = $self->SUPER::header(); - if ($self->to_output_map == $INTERNAL_MAP) { + if ($self->to_output_map >= $INTERNAL_MAP) { $return .= $INTERNAL_ONLY_HEADER; } else { - my $property_name = $self->property->full_name; + my $property_name = $self->property->full_name =~ s/Legacy_//r; $return .= <status; - if ($status) { + if ($status && $status ne $PLACEHOLDER) { my $warn = uc $status_past_participles{$status}; $comment .= <property->full_name; for my $i (0 .. @property_aliases - 1) { - $comment .= sprintf("%-8s%s\n", - " ", - $property_aliases[$i]->name . '(cp)' - ); + my $name = $property_aliases[$i]->name; + $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)'); + if ($property_aliases[$i]->ucd) { + if ($name eq $full_name) { + $ucd_accessible_name = $full_name; + } + elsif (! $ucd_accessible_name) { + $ucd_accessible_name = $name; + } + } + } + $comment .= "\nwhere 'cp' is $cp."; + if ($ucd_accessible_name) { + $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD"; } - my $full_name = $self->property->full_name; - $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD"; # And append any commentary already set from the actual property. $comment .= "\n\n" . $self->comment if $self->comment; @@ -6258,19 +6473,46 @@ END my $format = $self->format; - my $return = <to_output_map == $OUTPUT_ADJUSTED); + if ($output_adjusted) { + if ($specials_name) { + $return .= <_set_format($format); + # If the output is to be adjusted, the format of the table that gets + # output is actually 'a' instead of whatever it is stored internally + # as. + my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED); + if ($output_adjusted) { + $format = $ADJUST_FORMAT; + } - # Core Perl has a different definition of mapping ranges than we do, - # that is applicable mainly to mapping code points, so for tables - # where it is possible that core Perl could be used to read it, - # make it range size 1 to prevent possible confusion - $self->set_range_size_1(1) if $format eq $HEX_FORMAT; + $self->_set_format($format); return $self->SUPER::write( + $output_adjusted, ($self->property == $block) ? 7 # block file needs more tab stops : 3, @@ -6577,6 +6827,16 @@ sub trace { return main::trace(@_); } '+=' => sub { my $self = shift; my $other = shift; + my $reversed = shift; + + if ($reversed) { + Carp::my_carp_bug("Bad news. Can't cope with '" + . ref($other) + . ' += ' + . ref($self) + . "'. undef returned."); + return; + } return if $self->carp_if_locked; @@ -6594,6 +6854,14 @@ sub trace { return main::trace(@_); } } return $self; }, + '&=' => sub { + my $self = shift; + my $other = shift; + + return if $self->carp_if_locked; + $self->_set_range_list($self->_range_list & $other); + return $self; + }, '-' => sub { my $self = shift; my $other = shift; my $reversed = shift; @@ -6879,7 +7147,7 @@ sub trace { return main::trace(@_); } my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - return $self->SUPER::write(2); # 2 tab stops + return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops } sub set_final_comment { @@ -6924,6 +7192,9 @@ END # \p{}'s my @global_comments; # List of all the tables' comments that are # there before this routine was called. + my $has_ucd_alias = 0; # If there is an alias that is accessible via + # Unicode::UCD. If not, then don't say it is + # in the comment # Get list of all the parent tables that are equivalent to this one # (including itself). @@ -7018,6 +7289,7 @@ END [$i % @table_aliases]; my $table_alias = $table_alias_object->name; my $loose_match = $table_alias_object->loose_match; + $has_ucd_alias |= $table_alias_object->ucd; if ($table_alias !~ /\D/) { # Clarify large numbers. $table_alias = main::clarify_number($table_alias) @@ -7039,7 +7311,9 @@ END my $flag = $property->status || $table->status || $table_alias_object->status; - $flags{$flag} = $status_past_participles{$flag} if $flag; + if ($flag && $flag ne $PLACEHOLDER) { + $flags{$flag} = $status_past_participles{$flag}; + } $loose_count++; @@ -7128,7 +7402,10 @@ END $any_of_these = 'any of these' } - my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; + my $comment = ""; + if ($has_ucd_alias) { + $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n"; + } if ($has_unrelated) { $comment .= < 0, qw("") => "_operator_stringify", "." => \&main::_operator_dot, + ".=" => \&main::_operator_dot_equal, '==' => \&main::_operator_equal, '!=' => \&main::_operator_not_equal, '=' => sub { return shift }, @@ -8525,6 +8803,24 @@ sub _operator_dot { : "$self$other"; } +sub _operator_dot_equal { + # Overloaded '.=' method that is common to all packages. + + my $self = shift; + my $other = shift; + my $reversed = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + $other = "" unless defined $other; + + if ($reversed) { + return $other .= "$self"; + } + else { + return "$self" . "$other"; + } +} + sub _operator_equal { # Generic overloaded '==' routine. To be equal, they must be the exact # same object @@ -8604,15 +8900,6 @@ sub finish_property_setup { # Perl adds this alias. $gc->add_alias('Category'); - # For backwards compatibility, these property files have particular names. - property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what - # utf8.c calls it - property_ref('Lowercase_Mapping')->set_file('Lower'); - property_ref('Titlecase_Mapping')->set_file('Title'); - - my $fold = property_ref('Case_Folding'); - $fold->set_file('Fold') if defined $fold; - # Unicode::Normalize expects this file with this name and directory. my $ccc = property_ref('Canonical_Combining_Class'); if (defined $ccc) { @@ -8620,19 +8907,6 @@ sub finish_property_setup { $ccc->set_directory(File::Spec->curdir()); } - # utf8.c has a different meaning for non range-size-1 for map properties - # that this program doesn't currently handle; and even if it were changed - # to do so, some other code may be using them expecting range size 1. - foreach my $property (qw { - Case_Folding - Lowercase_Mapping - Titlecase_Mapping - Uppercase_Mapping - }) - { - property_ref($property)->set_range_size_1(1); - } - # These two properties aren't actually used in the core, but unfortunately # the names just above that are in the core interfere with these, so # choose different names. These aren't a problem unless the map tables @@ -8751,6 +9025,22 @@ sub finish_property_setup { $urs->add_alias('kRSUnicode'); } } + + # For backwards compatibility with applications that may read the mapping + # file directly (it was documented in 5.12 and 5.14 as being thusly + # usable), keep it from being adjusted. (range_size_1 is + # used to force the traditional format.) + if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) { + $nfkc_cf->set_to_output_map($EXTERNAL_MAP); + $nfkc_cf->set_range_size_1(1); + } + if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) { + $bmg->set_to_output_map($EXTERNAL_MAP); + $bmg->set_range_size_1(1); + } + + property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED); + return; } @@ -8935,6 +9225,9 @@ sub process_PropValueAliases { # Process each line of the file ... while ($file->next_line) { + # Fix typo in input file + s/CCC133/CCC132/g if $v_version eq v6.1.0; + my ($property, @data) = split /\s*;\s*/; # The ccc property has an extra field at the beginning, which is the @@ -9329,6 +9622,33 @@ END return @return; } +sub process_NormalizationsTest { + + # Each line looks like: + # source code point; NFC; NFD; NFKC; NFKD + # e.g. + # 1E0A;1E0A;0044 0307;1E0A;0044 0307; + + my $file= shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # Process each line of the file ... + while ($file->next_line) { + + next if /^@/; + + my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/; + + foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) { + $$var = pack "U0U*", map { hex } split " ", $$var; + $$var =~ s/(\\)/$1$1/g; + } + + push @normalization_tests, + "Test_N(q$c1, q$c2, q$c3, q$c4, q$c5);\n"; + } # End of looping through the file +} + sub output_perl_charnames_line ($$) { # Output the entries in Perl_charnames specially, using 5 digits instead @@ -9813,6 +10133,7 @@ END my $input_field_count = $i; # This routine in addition outputs these extra fields: + my $DECOMP_TYPE = $i++; # Decomposition type # These fields are modifications of ones above, and are usually @@ -9953,6 +10274,7 @@ END # body of the table Map_Type => $COMPUTE_NO_MULTI_CP, Type => $STRING, + To_Output_Map => $INTERNAL_MAP, ); $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); $Perl_decomp->add_comment(join_lines(<new("Perl_Decimal_Digit", Default_Map => "", Perl_Extension => 1, - File => 'Digit', # Trad. location Directory => $map_directory, Type => $STRING, - Range_Size_1 => 1, + To_Output_Map => $OUTPUT_ADJUSTED, ); $Decimal_Digit->add_comment(join_lines(<carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x; if ($fields[$PERL_DECIMAL_DIGIT] ne "") { $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC]; + $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd"; $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal'; } elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") { @@ -10138,9 +10461,8 @@ END # Some code points in this file have the pseudo-name # '', but the official name for such ones is the null - # string. For charnames.pm, we use the Unicode version 1 name - $fields[$NAME] = ""; - $fields[$CHARNAME] = $fields[$UNICODE_1_NAME]; + # string. + $fields[$NAME] = $fields[$CHARNAME] = ""; # We had better not be in between range lines. if ($in_range) { @@ -10354,7 +10676,7 @@ END Carp::carp_extra_args(\@_) if main::DEBUG && @_; # Flush the buffers. - foreach my $i (1 .. $last_field) { + foreach my $i (0 .. $last_field) { $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]"); } @@ -10536,7 +10858,7 @@ END # http://www.unicode.org/versions/corrigendum8.html $fields[$BIDI] = "AL"; } - elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name + elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name $fields[$CHARNAME] = ""; } @@ -10664,7 +10986,7 @@ sub filter_arabic_shaping_line { # simple ones are in UnicodeData.txt, which should already have been # read in to the full property data structures, so as to initialize # these with the simple ones. Then the SpecialCasing.txt entries - # overwrite the ones which have different full mappings. + # add or overwrite the ones which have different full mappings. # This routine sees if the simple mappings are to be output, and if # so, copies what has already been put into the full mapping tables, @@ -10676,16 +10998,6 @@ sub filter_arabic_shaping_line { # relatively few entries in them that have different full mappings, # and thus skip the simple mapping tables altogether. - # New tables with just the simple mappings that are overridden by the - # full ones are constructed. These are for Unicode::UCD, which - # requires the simple mappings. The Case_Folding table is a combined - # table of both the simple and full mappings, with the full ones being - # in the hash, and the simple ones, even those overridden by the hash, - # being in the base table. That same mechanism could have been - # employed here, except that the docs have said that the generated - # files are usuable directly by programs, so we dare not change the - # format in any way. - my $file= shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -10694,34 +11006,49 @@ sub filter_arabic_shaping_line { $uc = property_ref('uc'); # For each of the case change mappings... - foreach my $case_table ($lc, $tc, $uc) { - my $case = $case_table->name; - my $full = property_ref($case); - unless (defined $full && ! $full->is_empty) { + foreach my $full_table ($lc, $tc, $uc) { + my $full_name = $full_table->name; + unless (defined $full_table && ! $full_table->is_empty) { Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); } - # The simple version's name in each mapping merely has an 's' in - # front of the full one's - my $simple_name = 's' . $case; - my $simple = property_ref($simple_name); - $simple->initialize($full) if $simple->to_output_map(); + # Create a table in the old-style format and with the original + # file name for backwards compatibility with applications that + # read it directly. The new tables contain both the simple and + # full maps, and the old are missing simple maps when there is a + # conflicting full one. Probably it would have been ok to add + # those to the legacy version, as was already done in 5.14 to the + # case folding one, but this was not done, out of an abundance of + # caution. The tables are set up here before we deal with the + # full maps so that as we handle those, we can override the simple + # maps for them in the legacy table, and merely add them in the + # new-style one. + my $legacy = Property->new("Legacy_" . $full_table->full_name, + File => $full_table->full_name =~ + s/case_Mapping//r, + Range_Size_1 => 1, + Format => $HEX_FORMAT, + Default_Map => $CODE_POINT, + UCD => 0, + Initialize => $full_table, + To_Output_Map => $EXTERNAL_MAP, + ); - my $simple_only = Property->new("_s$case", - Type => $STRING, - Default_Map => $CODE_POINT, - Perl_Extension => 1, - Fate => $INTERNAL_ONLY, - Description => "This contains the simple mappings for $case for just the code points that have different full mappings"); - $simple_only->set_to_output_map($INTERNAL_MAP); - $simple_only->add_comment(join_lines( <add_comment(join_lines( <initialize($full_table) if $simple->to_output_map(); + unless ($simple->to_output_map()) { - $simple_only->set_proxy_for($simple_name); + $full_table->set_proxy_for($simple_name); } } @@ -10787,28 +11114,54 @@ END return; } - $_ = "$fields[0]; lc; $fields[1]"; - $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]"); - $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]"); + my $decimal_code_point = hex $fields[0]; - # Copy any simple case change to the special tables constructed if - # being overridden by a multi-character case change. - if ($fields[1] ne $fields[0] - && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT) - { - $file->insert_adjusted_lines("$fields[0]; _slc; $value"); - } - if ($fields[2] ne $fields[0] - && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT) - { - $file->insert_adjusted_lines("$fields[0]; _stc; $value"); - } - if ($fields[3] ne $fields[0] - && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT) - { - $file->insert_adjusted_lines("$fields[0]; _suc; $value"); + # Loop to handle each of the three mappings in the input line, in + # order, with $i indicating the current field number. + my $i = 0; + for my $object ($lc, $tc, $uc) { + $i++; # First time through, $i = 0 ... 3rd time = 3 + + my $value = $object->value_of($decimal_code_point); + $value = ($value eq $CODE_POINT) + ? $decimal_code_point + : hex $value; + + # If this isn't a multi-character mapping, it should already have + # been read in. + if ($fields[$i] !~ / /) { + if ($value != hex $fields[$i]) { + Carp::my_carp("Bad news. UnicodeData.txt thinks " + . $object->name + . "(0x$fields[0]) is $value" + . " and SpecialCasing.txt thinks it is " + . hex($fields[$i]) + . ". Good luck. Retaining UnicodeData value, and proceeding anyway."); + } + } + else { + + # The mapping goes into both the legacy table, in which it + # replaces the simple one... + $file->insert_adjusted_lines("$fields[0]; Legacy_" + . $object->full_name + . "; $fields[$i]"); + + # ... and, the The regular table, in which it is additional, + # beyond the simple mapping. + $file->insert_adjusted_lines("$fields[0]; " + . $object->name + . "; " + . $CMD_DELIM + . "$REPLACE_CMD=$MULTIPLE_BEFORE" + . $CMD_DELIM + . $fields[$i]); + } } + # Everything has been handled by the insert_adjusted_lines() + $_ = ""; + return; } } @@ -10865,7 +11218,7 @@ sub filter_old_style_case_folding { $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds", Perl_Extension => 1, Fate => $INTERNAL_ONLY, - Description => "Code points that particpate in a multi-char fold not in the final position", + Description => "Code points that particpate in a multi-char fold and are not the final character of said fold", ); # If we ever wanted to show that these tables were combined, a new @@ -11354,7 +11707,7 @@ sub setup_script_extensions { my $scx = property_ref("Script_Extensions"); $scx = Property->new("scx", Full_Name => "Script_Extensions") - if ! defined $scx; + if ! defined $scx; $scx->_set_format($STRING_WHITE_SPACE_LIST); $scx->initialize($script); $scx->set_default_map($script->default_map); @@ -11387,6 +11740,10 @@ sub filter_script_extensions_line { # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW my @fields = split /\s*;\s*/; + + # This script was erroneously omitted in this Unicode version. + $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/; + my @full_names; foreach my $short_name (split " ", $fields[1]) { push @full_names, $script->table($short_name)->full_name; @@ -11398,15 +11755,273 @@ sub filter_script_extensions_line { } sub setup_early_name_alias { + my $file= shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # This has the effect of pretending that the Name_Alias property was + # available in all Unicode releases. Strictly speaking, this property + # should not be availabe in early releases, but doing this allows + # charnames.pm to work on older releases without change. Prior to v5.16 + # it had these names hard-coded inside it. Unicode 6.1 came along and + # created these names, and so they were removed from charnames. + my $aliases = property_ref('Name_Alias'); - $aliases = Property->new('Name_Alias') if ! defined $aliases; + if (! defined $aliases) { + $aliases = Property->new('Name_Alias', Default_Map => ""); + } + + $file->insert_lines(get_old_name_aliases()); - # Before 6.0, this wasn't a problem, and after it, this alias is part of - # the Unicode-delivered file. - $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0; return; } +sub get_old_name_aliases () { + + # The Unicode_1_Name field, contains most of these names. One would + # expect, given the field's name, that its values would be fixed across + # versions, giving the true Unicode version 1 name for the character. + # Sadly, this is not the case. Actually Version 1.1.5 had no names for + # any of the controls; Version 2.0 introduced names for the C0 controls, + # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2 + # changed some names: it + # changed to parenthesized versions like "NEXT LINE" to + # "NEXT LINE (NEL)"; + # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD + # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; + # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR + # This list contains all the names that were defined so that + # charnames::vianame(), etc. understand them all EVEN if this version of + # Unicode didn't specify them (this could be construed as a bug). + # mktables elsewhere gives preference to the Unicode_1_Name field over + # these names, so that viacode() will return the correct value for that + # version of Unicode, except when that version doesn't define a name, + # viacode() will return one anyway (this also could be construed as a + # bug). But these potential "bugs" allow for the smooth working of code + # on earlier Unicode releases. + + my @return = split /\n/, <<'END'; +0000;NULL;control +0000;NUL;abbreviation +0001;START OF HEADING;control +0001;SOH;abbreviation +0002;START OF TEXT;control +0002;STX;abbreviation +0003;END OF TEXT;control +0003;ETX;abbreviation +0004;END OF TRANSMISSION;control +0004;EOT;abbreviation +0005;ENQUIRY;control +0005;ENQ;abbreviation +0006;ACKNOWLEDGE;control +0006;ACK;abbreviation +0007;BELL;control +0007;BEL;abbreviation +0008;BACKSPACE;control +0008;BS;abbreviation +0009;CHARACTER TABULATION;control +0009;HORIZONTAL TABULATION;control +0009;HT;abbreviation +0009;TAB;abbreviation +000A;LINE FEED;control +000A;LINE FEED (LF);control +000A;NEW LINE;control +000A;END OF LINE;control +000A;LF;abbreviation +000A;NL;abbreviation +000A;EOL;abbreviation +000B;LINE TABULATION;control +000B;VERTICAL TABULATION;control +000B;VT;abbreviation +000C;FORM FEED;control +000C;FORM FEED (FF);control +000C;FF;abbreviation +000D;CARRIAGE RETURN;control +000D;CARRIAGE RETURN (CR);control +000D;CR;abbreviation +000E;SHIFT OUT;control +000E;LOCKING-SHIFT ONE;control +000E;SO;abbreviation +000F;SHIFT IN;control +000F;LOCKING-SHIFT ZERO;control +000F;SI;abbreviation +0010;DATA LINK ESCAPE;control +0010;DLE;abbreviation +0011;DEVICE CONTROL ONE;control +0011;DC1;abbreviation +0012;DEVICE CONTROL TWO;control +0012;DC2;abbreviation +0013;DEVICE CONTROL THREE;control +0013;DC3;abbreviation +0014;DEVICE CONTROL FOUR;control +0014;DC4;abbreviation +0015;NEGATIVE ACKNOWLEDGE;control +0015;NAK;abbreviation +0016;SYNCHRONOUS IDLE;control +0016;SYN;abbreviation +0017;END OF TRANSMISSION BLOCK;control +0017;ETB;abbreviation +0018;CANCEL;control +0018;CAN;abbreviation +0019;END OF MEDIUM;control +0019;EOM;abbreviation +001A;SUBSTITUTE;control +001A;SUB;abbreviation +001B;ESCAPE;control +001B;ESC;abbreviation +001C;INFORMATION SEPARATOR FOUR;control +001C;FILE SEPARATOR;control +001C;FS;abbreviation +001D;INFORMATION SEPARATOR THREE;control +001D;GROUP SEPARATOR;control +001D;GS;abbreviation +001E;INFORMATION SEPARATOR TWO;control +001E;RECORD SEPARATOR;control +001E;RS;abbreviation +001F;INFORMATION SEPARATOR ONE;control +001F;UNIT SEPARATOR;control +001F;US;abbreviation +0020;SP;abbreviation +007F;DELETE;control +007F;DEL;abbreviation +0080;PADDING CHARACTER;figment +0080;PAD;abbreviation +0081;HIGH OCTET PRESET;figment +0081;HOP;abbreviation +0082;BREAK PERMITTED HERE;control +0082;BPH;abbreviation +0083;NO BREAK HERE;control +0083;NBH;abbreviation +0084;INDEX;control +0084;IND;abbreviation +0085;NEXT LINE;control +0085;NEXT LINE (NEL);control +0085;NEL;abbreviation +0086;START OF SELECTED AREA;control +0086;SSA;abbreviation +0087;END OF SELECTED AREA;control +0087;ESA;abbreviation +0088;CHARACTER TABULATION SET;control +0088;HORIZONTAL TABULATION SET;control +0088;HTS;abbreviation +0089;CHARACTER TABULATION WITH JUSTIFICATION;control +0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control +0089;HTJ;abbreviation +008A;LINE TABULATION SET;control +008A;VERTICAL TABULATION SET;control +008A;VTS;abbreviation +008B;PARTIAL LINE FORWARD;control +008B;PARTIAL LINE DOWN;control +008B;PLD;abbreviation +008C;PARTIAL LINE BACKWARD;control +008C;PARTIAL LINE UP;control +008C;PLU;abbreviation +008D;REVERSE LINE FEED;control +008D;REVERSE INDEX;control +008D;RI;abbreviation +008E;SINGLE SHIFT TWO;control +008E;SINGLE-SHIFT-2;control +008E;SS2;abbreviation +008F;SINGLE SHIFT THREE;control +008F;SINGLE-SHIFT-3;control +008F;SS3;abbreviation +0090;DEVICE CONTROL STRING;control +0090;DCS;abbreviation +0091;PRIVATE USE ONE;control +0091;PRIVATE USE-1;control +0091;PU1;abbreviation +0092;PRIVATE USE TWO;control +0092;PRIVATE USE-2;control +0092;PU2;abbreviation +0093;SET TRANSMIT STATE;control +0093;STS;abbreviation +0094;CANCEL CHARACTER;control +0094;CCH;abbreviation +0095;MESSAGE WAITING;control +0095;MW;abbreviation +0096;START OF GUARDED AREA;control +0096;START OF PROTECTED AREA;control +0096;SPA;abbreviation +0097;END OF GUARDED AREA;control +0097;END OF PROTECTED AREA;control +0097;EPA;abbreviation +0098;START OF STRING;control +0098;SOS;abbreviation +0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment +0099;SGC;abbreviation +009A;SINGLE CHARACTER INTRODUCER;control +009A;SCI;abbreviation +009B;CONTROL SEQUENCE INTRODUCER;control +009B;CSI;abbreviation +009C;STRING TERMINATOR;control +009C;ST;abbreviation +009D;OPERATING SYSTEM COMMAND;control +009D;OSC;abbreviation +009E;PRIVACY MESSAGE;control +009E;PM;abbreviation +009F;APPLICATION PROGRAM COMMAND;control +009F;APC;abbreviation +00A0;NBSP;abbreviation +00AD;SHY;abbreviation +200B;ZWSP;abbreviation +200C;ZWNJ;abbreviation +200D;ZWJ;abbreviation +200E;LRM;abbreviation +200F;RLM;abbreviation +202A;LRE;abbreviation +202B;RLE;abbreviation +202C;PDF;abbreviation +202D;LRO;abbreviation +202E;RLO;abbreviation +FEFF;BYTE ORDER MARK;alternate +FEFF;BOM;abbreviation +FEFF;ZWNBSP;abbreviation +END + + if ($v_version ge v3.0.0) { + push @return, split /\n/, <<'END'; +180B; FVS1; abbreviation +180C; FVS2; abbreviation +180D; FVS3; abbreviation +180E; MVS; abbreviation +202F; NNBSP; abbreviation +END + } + + if ($v_version ge v3.2.0) { + push @return, split /\n/, <<'END'; +034F; CGJ; abbreviation +205F; MMSP; abbreviation +2060; WJ; abbreviation +END + # Add in VS1..VS16 + my $cp = 0xFE00 - 1; + for my $i (1..16) { + push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); + } + } + if ($v_version ge v4.0.0) { # Add in VS17..VS256 + my $cp = 0xE0100 - 17; + for my $i (17..256) { + push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); + } + } + + # ALERT did not come along until 6.0, at which point it became preferred + # over BELL, and was never in the Unicode_1_Name field. For the same + # reasons, that the other names are made known to all releases by this + # function, we make ALERT known too. By inserting it + # last in early releases, BELL is preferred over it; and vice-vers in 6.0 + my $alert = '0007; ALERT; control'; + if ($v_version lt v6.0.0) { + push @return, $alert; + } + else { + unshift @return, $alert; + } + + return @return; +} + sub filter_later_version_name_alias_line { # This file has an extra entry per line for the alias type. This is @@ -11431,8 +12046,10 @@ sub filter_later_version_name_alias_line { sub filter_early_version_name_alias_line { # Early versions did not have the trailing alias type field; implicitly it - # was 'correction' - $_ .= "; correction"; + # was 'correction'. But our synthetic lines we add in this program do + # have it, so test for the type field. + $_ .= "; correction" if $_ !~ /;.*;/; + filter_later_version_name_alias_line; return; } @@ -11440,14 +12057,36 @@ sub filter_early_version_name_alias_line { sub finish_Unicode() { # This routine should be called after all the Unicode files have been read # in. It: - # 1) Adds the mappings for code points missing from the files which have + # 1) Creates properties that are missing from the version of Unicode being + # compiled, and which, for whatever reason, are needed for the Perl + # core to function properly. These are minimally populated as + # necessary. + # 2) Adds the mappings for code points missing from the files which have # defaults specified for them. - # 2) At this this point all mappings are known, so it computes the type of + # 3) At this this point all mappings are known, so it computes the type of # each property whose type hasn't been determined yet. - # 3) Calculates all the regular expression match tables based on the + # 4) Calculates all the regular expression match tables based on the # mappings. - # 3) Calculates and adds the tables which are defined by Unicode, but - # which aren't derived by them + # 5) Calculates and adds the tables which are defined by Unicode, but + # which aren't derived by them, and certain derived tables that Perl + # uses. + + # GCB and hst are not in early Unicode releases; create dummy ones if + # they don't exist, as the core needs tables generated from them. + my $gcb = property_ref('Grapheme_Cluster_Break'); + if (! defined $gcb) { + $gcb = Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break', + Status => $PLACEHOLDER, + Type => $ENUM, + Default_Map => 'Other'); + } + my $hst = property_ref('Hangul_Syllable_Type'); + if (!defined $hst) { + $hst = Property->new('hst', Full_Name => 'Hangul_Syllable_Type', + Status => $PLACEHOLDER, + Type => $ENUM, + Default_Map => 'Not_Applicable'); + } # For each property, fill in any missing mappings, and calculate the re # match tables. If a property has more than one missing mapping, the @@ -11685,7 +12324,8 @@ END Lowercase_Mapping Titlecase_Mapping Case_Folding - } ) { + } ) + { my $full = property_ref($map); if ($full->is_empty) { my $simple = property_ref('Simple_' . $map); @@ -11699,6 +12339,40 @@ END } } + # Create digit and case fold tables with the original file names for + # backwards compatibility with applications that read them directly. + my $Digit = Property->new("Legacy_Perl_Decimal_Digit", + Default_Map => "", + Perl_Extension => 1, + File => 'Digit', # Trad. location + Directory => $map_directory, + UCD => 0, + Type => $STRING, + To_Output_Map => $EXTERNAL_MAP, + Range_Size_1 => 1, + Initialize => property_ref('Perl_Decimal_Digit'), + ); + $Digit->add_comment(join_lines(<new('Legacy_Case_Folding', + File => "Fold", + Directory => $map_directory, + Default_Map => $CODE_POINT, + UCD => 0, + Range_Size_1 => 1, + Type => $STRING, + To_Output_Map => $EXTERNAL_MAP, + Format => $HEX_FORMAT, + Initialize => property_ref('cf'), + ); + # The Script_Extensions property started out as a clone of the Script # property. But processing its data file caused some elements to be # replaced with different data. (These elements were for the Common and @@ -11847,7 +12521,12 @@ sub compile_perl() { # identical code points, but their caseless equivalents are not the same, # one being 'Cased' and the other being 'LC', and so now must be kept as # separate entities. - $Title += $lt if defined $lt; + if (defined $lt) { + $Title += $lt; + } + else { + push @tables_that_may_be_empty, $Title->complete_name; + } # If this Unicode version doesn't have Cased, set up our own. From # Unicode 5.1: Definition D120: A character C is defined to be cased if @@ -12013,11 +12692,12 @@ sub compile_perl() { # Perl's traditional space doesn't include Vertical Tab my $XPerlSpace = $perl->add_match_table('XPerlSpace', Description => '\s, including beyond ASCII', - Initialize => $Space - 0x000B, + #Initialize => $Space - 0x000B, + Initialize => $Space, ); $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym my $PerlSpace = $perl->add_match_table('PerlSpace', - Description => '\s, restricted to ASCII = [ \f\n\r\t]', + Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab', Initialize => $XPerlSpace & $ASCII, ); @@ -12164,9 +12844,9 @@ sub compile_perl() { # The 'extended' grapheme cluster came in 5.1. The non-extended # definition differs too much from the traditional Perl one to use. - if (defined $gcb && defined $gcb->table('SpacingMark')) { + if (defined $gcb->table('SpacingMark')) { - # Note that assumes HST is defined; it came in an earlier release than + # Note that assumes hst is defined; it came in an earlier release than # GCB. In the line below, two negatives means: yes hangul $begin += ~ property_ref('Hangul_Syllable_Type') ->table('Not_Applicable') @@ -12189,16 +12869,7 @@ sub compile_perl() { # We set things up so the Perl core degrades gracefully, possibly with # placeholders that match nothing. - if (! defined $gcb) { - $gcb = Property->new('GCB', Status => $PLACEHOLDER); - } - my $hst = property_ref('HST'); - if (!defined $hst) { - $hst = Property->new('HST', Status => $PLACEHOLDER); - $hst->add_match_table('Not_Applicable', - Initialize => $Any, - Matches_All => 1); - } + my $hst = property_ref('Hangul_Syllable_Type'); # On some releases, here we may not have the needed tables for the # perl core, in some releases we may. @@ -12209,11 +12880,9 @@ sub compile_perl() { push @tables_that_may_be_empty, $table->complete_name; } - # The HST property predates the GCB one, and has identical tables + # The hst property predates the GCB one, and has identical tables # for some of them, so use it if we can. - if ($table->is_empty - && defined $hst - && defined $hst->table($name)) + if ($table->is_empty && defined $hst->table($name)) { $table += $hst->table($name); } @@ -12230,7 +12899,7 @@ sub compile_perl() { push @tables_that_may_be_empty, $lv_lvt_v->complete_name; } else { $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V'); - $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V'); + $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V'); } # Was previously constructed to contain both Name and Unicode_1_Name @@ -12244,72 +12913,93 @@ sub compile_perl() { } my $alias_sentence = ""; + my %abbreviations; my $alias = property_ref('Name_Alias'); - if (defined $alias) { - push @composition, 'Name_Alias'; - $perl_charname->set_proxy_for('Name_Alias'); - my $unicode_1 = property_ref('Unicode_1_Name'); - my %abbreviations; - - # Add each entry in Name_Alias to Perl_Charnames. Where these go with - # respect to any existing entry depends on the entry type. - # Corrections go before said entry, as they should be returned in - # preference over the existing entry. (A correction to a correction - # should be later in the Name_Alias table, so it will correctly - # precede the erroneous correction in Perl_Charnames.) - # - # Abbreviations go after everything else, so they are saved - # temporarily in a hash for later. - # - # Controls are currently added afterwards. This is because Perl has - # previously used the Unicode1 name, and so should still use that. - # (Most of them will be the same anyway, in which case we don't add a - # duplicate) - - $alias->reset_each_range; - while (my ($range) = $alias->each_range) { - next if $range->value eq ""; - my $code_point = $range->start; - if ($code_point != $range->end) { - Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); - } - my ($value, $type) = split ': ', $range->value; - my $replace_type; - if ($type eq 'correction') { - $replace_type = $MULTIPLE_BEFORE; - } - elsif ($type eq 'abbreviation') { - - # Save for later - $abbreviations{$value} = $code_point; - next; - } - elsif ($type eq 'control') { - my $unicode_1_value = $unicode_1->value_of($code_point); - next if $unicode_1_value eq $value; - $replace_type = $MULTIPLE_AFTER; - } - else { - $replace_type = $MULTIPLE_AFTER; - } - - # Actually add; before or after current entry(ies) as determined - # above. - $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); + push @composition, 'Name_Alias'; + $perl_charname->set_proxy_for('Name_Alias'); + + # Add each entry in Name_Alias to Perl_Charnames. Where these go with + # respect to any existing entry depends on the entry type. Corrections go + # before said entry, as they should be returned in preference over the + # existing entry. (A correction to a correction should be later in the + # Name_Alias table, so it will correctly precede the erroneous correction + # in Perl_Charnames.) + # + # Abbreviations go after everything else, so they are saved temporarily in + # a hash for later. + # + # Controls are currently added afterwards. This is because Perl has + # previously used the Unicode1 name, and so should still use that. (Most + # of them will be the same anyway, in which case we don't add a duplicate) + + $alias->reset_each_range; + while (my ($range) = $alias->each_range) { + next if $range->value eq ""; + my $code_point = $range->start; + if ($code_point != $range->end) { + Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); + } + my ($value, $type) = split ': ', $range->value; + my $replace_type; + if ($type eq 'correction') { + $replace_type = $MULTIPLE_BEFORE; + } + elsif ($type eq 'abbreviation') { + + # Save for later + $abbreviations{$value} = $code_point; + next; } - - # Now that have everything added, add in abbreviations after - # everything else. - foreach my $value (keys %abbreviations) { - $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER); + else { + $replace_type = $MULTIPLE_AFTER; } - $alias_sentence = <add_duplicate($code_point, $value, Replace => $replace_type); + } + $alias_sentence = <ranges) { + my $code_point = $range->start; + my $unicode_1_value = $range->value; + next if $unicode_1_value eq ""; # Skip if name doesn't exist. + + if ($code_point != $range->end) { + Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;"); + } + + # To handle EBCDIC, we don't hard code in the code points of the + # controls; instead realizing that all of them are below 256. + last if $code_point > 255; + + # We only add in the controls. + next if $gc->value_of($code_point) ne 'Cc'; + + # This won't add an exact duplicate. + $perl_charname->add_duplicate($code_point, $unicode_1_value, + Replace => $before_or_after); + } + + # Now that have everything added, add in abbreviations after + # everything else. + foreach my $value (keys %abbreviations) { + $perl_charname->add_duplicate($abbreviations{$value}, $value, + Replace => $MULTIPLE_AFTER); } my $comment; @@ -12413,6 +13103,31 @@ END $unassigned->set_equivalent_to($age_default, Related => 1); } + # See L + my $quotemeta = $perl->add_match_table('_Perl_Quotemeta', + Perl_Extension => 1, + Fate => $INTERNAL_ONLY, + + # Initialize to what's common in + # all Unicode releases. + Initialize => + $Space + + $gc->table('Control') + ); + + # In early releases without the proper Unicode properties, just set to \W. + if (! defined (my $patsyn = property_ref('Pattern_Syntax')) + || ! defined (my $patws = property_ref('Pattern_White_Space')) + || ! defined (my $di = property_ref('Default_Ignorable_Code_Point'))) + { + $quotemeta += ~ $Word; + } + else { + $quotemeta += $patsyn->table('Y') + + $patws->table('Y') + + $di->table('Y') + + ((~ $Word) & $ASCII); + } # Finished creating all the perl properties. All non-internal non-string # ones have a synonym of 'Is_' prefixed. (Internal properties begin with @@ -13532,7 +14247,7 @@ sub make_ucd_table_pod_entries { || $ucd_pod{$standard}{'perl_extension'} == $perl_extension || $output_this == $perl_extension) { - Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway."); + Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); } # We modifiy the info column of the one being output to @@ -14137,7 +14852,8 @@ Certain properties are accessible also via core function calls. These are: Uppercase_Mapping uc() Also, Case_Folding is accessible through the C modifier in regular -expressions. +expressions, the C<\\F> transliteration escape, and the C> +operator. And, the Name and Name_Aliases properties are accessible through the C<\\N{}> interpolation in double-quoted strings and regular expressions; and functions @@ -14628,33 +15344,36 @@ sub make_UCD () { # Make a list of all combinations of properties/values that are suppressed. my @suppressed; - foreach my $property_name (keys %why_suppressed) { + if (! $debug_skip) { # This tends to fail in this debug mode + foreach my $property_name (keys %why_suppressed) { - # Just the value - my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; + # Just the value + my $value_name = $1 if $property_name =~ s/ = ( .* ) //x; - # The hash may contain properties not in this release of Unicode - next unless defined (my $property = property_ref($property_name)); + # The hash may contain properties not in this release of Unicode + next unless defined (my $property = property_ref($property_name)); - # Find all combinations - foreach my $prop_alias ($property->aliases) { - my $prop_alias_name = standardize($prop_alias->name); + # Find all combinations + foreach my $prop_alias ($property->aliases) { + my $prop_alias_name = standardize($prop_alias->name); - # If no =value, there's just one combination possibe for this - if (! $value_name) { + # If no =value, there's just one combination possibe for this + if (! $value_name) { - # The property may be suppressed, but there may be a proxy for - # it, so it shouldn't be listed as suppressed - next if $prop_alias->ucd; - push @suppressed, $prop_alias_name; - } - else { # Otherwise - foreach my $value_alias ($property->table($value_name)->aliases) - { - next if $value_alias->ucd; + # The property may be suppressed, but there may be a proxy + # for it, so it shouldn't be listed as suppressed + next if $prop_alias->ucd; + push @suppressed, $prop_alias_name; + } + else { # Otherwise + foreach my $value_alias + ($property->table($value_name)->aliases) + { + next if $value_alias->ucd; - push @suppressed, "$prop_alias_name=" - . standardize($value_alias->name); + push @suppressed, "$prop_alias_name=" + . standardize($value_alias->name); + } } } } @@ -14985,10 +15704,11 @@ sub write_all_tables() { } } } - elsif ($count == $MAX_UNICODE_CODEPOINTS) { - if ($table == $property || $table->leader == $table) { + elsif ($count == $MAX_UNICODE_CODEPOINTS + && ($table == $property || $table->leader == $table) + && $table->property->status != $PLACEHOLDER) + { Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway."); - } } if ($table->fate == $SUPPRESSED) { @@ -15261,6 +15981,7 @@ sub write_all_tables() { make_UCD; make_property_test_script() if $make_test_script; + make_normalization_test_script() if $make_norm_test_script; return; } @@ -15711,6 +16432,82 @@ sub make_property_test_script() { return; } +sub make_normalization_test_script() { + print "Making normalization test script\n" if $verbosity >= $PROGRESS; + + my $n_path = 'TestNorm.pl'; + + unshift @normalization_tests, <<'END'; +use utf8; +use Test::More; + +sub ord_string { # Convert packed ords to printable string + use charnames (); + return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' } + unpack "U*", shift) . "'"; + #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'"; +} + +sub Test_N { + my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_; + my $display_source = ord_string($source); + my $display_nfc = ord_string($nfc); + my $display_nfd = ord_string($nfd); + my $display_nfkc = ord_string($nfkc); + my $display_nfkd = ord_string($nfkd); + + use Unicode::Normalize; + # NFC + # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd) + # nfkc == toNFC(nfkc) == toNFC(nfkd) + # + # NFD + # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd) + # nfkd == toNFD(nfkc) == toNFD(nfkd) + # + # NFKC + # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) == + # toNFKC(nfkc) == toNFKC(nfkd) + # + # NFKD + # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) == + # toNFKD(nfkc) == toNFKD(nfkd) + + is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc"); + is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc"); + is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc"); + is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc"); + is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc"); + + is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd"); + is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd"); + is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd"); + is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd"); + is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd"); + + is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc"); + is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc"); + is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc"); + is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc"); + is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc"); + + is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd"); + is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd"); + is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd"); + is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd"); + is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd"); +} +END + + &write($n_path, + 1, # Is utf8; + [ + @normalization_tests, + 'done_testing();' + ]); + return; +} + # This is a list of the input files and how to handle them. The files are # processed in their order in this list. Some reordering is possible if # desired, but the v0 files should be first, and the extracted before the @@ -15861,8 +16658,9 @@ my @input_file_objects = ( Input_file->new('BidiMirroring.txt', v3.0.1, Property => 'Bidi_Mirroring_Glyph', ), - Input_file->new("NormalizationTest.txt", v3.0.1, - Skip => 'Validation Tests', + Input_file->new("NormTest.txt", v3.0.0, + Handler => \&process_NormalizationsTest, + Skip => ($make_norm_test_script) ? 0 : 'Validation Tests', ), Input_file->new('CaseFolding.txt', v3.0.1, Pre_Handler => \&setup_case_folding, @@ -15920,7 +16718,7 @@ my @input_file_objects = ( Input_file->new('NamedSequences.txt', v4.1.0, Handler => \&process_NamedSequences ), - Input_file->new('NameAliases.txt', v5.0.0, + Input_file->new('NameAliases.txt', v0, Property => 'Name_Alias', Pre_Handler => ($v_version le v6.0.0) ? \&setup_early_name_alias