$0 = File::Spec->canonpath($0);
my $make_test_script = 0; # ? Should we output a test script
+my $make_norm_test_script = 0; # ? Should we output a normalization test script
my $write_unchanged_files = 0; # ? Should we update the output files even if
# we don't think they have changed
my $use_directory = ""; # ? Should we chdir somewhere.
{
$make_test_script = 1;
}
+ elsif ($arg eq '-makenormtest')
+ {
+ $make_norm_test_script = 1;
+ }
elsif ($arg eq '-makelist') {
$make_list = 1;
}
# Enum values for to_output_map() method in the Map_Table package.
my $EXTERNAL_MAP = 1;
my $INTERNAL_MAP = 2;
+my $OUTPUT_ADJUSTED = 3;
# To override computed values for writing the map tables for these properties.
# The default for enum map tables is to write them out, so that the Unicode
my $HEX_FORMAT = 'x';
my $RATIONAL_FORMAT = 'r';
my $STRING_FORMAT = 's';
+my $ADJUST_FORMAT = 'a';
my $DECOMP_STRING_FORMAT = 'c';
my $STRING_WHITE_SPACE_LIST = 'sw';
$HEX_FORMAT => 'non-negative hex whole number; a code point',
$RATIONAL_FORMAT => 'rational: an integer or a fraction',
$STRING_FORMAT => 'string',
+ $ADJUST_FORMAT => 'some entries need adjustment',
$DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
$STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
);
# anonymous hash.
my @code_points_ending_in_code_point;
+# To hold Unicode's normalization test suite
+my @normalization_tests;
+
# Boolean: does this Unicode version have the hangul syllables, and are we
# writing out a table for them?
my $has_hangul_syllables = 0;
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
# either a constructor or a method. If called as a method, the result
# will be a new() instance of the calling object, containing the union
# of that object with the other parameter's code points; if called as
- # a constructor, the first parameter gives the class the new object
+ # a constructor, the first parameter gives the class that the new object
# should be, and the second parameter gives the code points to go into
# it.
# In either case, there are two parameters looked at by this routine;
# just a single code point.
#
# If they are ranges, this routine doesn't make any effort to preserve
- # the range values of one input over the other. Therefore this base
- # class should not allow _union to be called from other than
+ # the range values and types of one input over the other. Therefore
+ # this base class should not allow _union to be called from other than
# initialization code, so as to prevent two tables from being added
# together where the range values matter. The general form of this
# routine therefore belongs in a derived class, but it was moved here
# to avoid duplication of code. The failure to overload this in this
# class keeps it safe.
#
+ # It does make the effort during initialization to accept tables with
+ # multiple values for the same code point, and to preserve the order
+ # of these. If there is only one input range or range set, it doesn't
+ # sort (as it should already be sorted to the desired order), and will
+ # accept multiple values per code point. Otherwise it will merge
+ # multiple values into a single one.
my $self;
my @args; # Arguments to pass to the constructor
# Accumulate all records from both lists.
my @records;
+ my $input_count = 0;
for my $arg (@args) {
#local $to_trace = 0 if main::DEBUG;
trace "argument = $arg" if main::DEBUG && $to_trace;
Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
return;
}
+
$arg = [ $arg ] if ! ref $arg;
my $type = ref $arg;
if ($type eq 'ARRAY') {
foreach my $element (@$arg) {
push @records, Range->new($element, $element);
+ $input_count++;
}
}
elsif ($arg->isa('Range')) {
push @records, $arg;
+ $input_count++;
}
elsif ($arg->can('ranges')) {
push @records, $arg->ranges;
+ $input_count++;
}
else {
my $message = "";
# Sort with the range containing the lowest ordinal first, but if
# two ranges start at the same code point, sort with the bigger range
# of the two first, because it takes fewer cycles.
- @records = sort { ($a->start <=> $b->start)
+ if ($input_count > 1) {
+ @records = sort { ($a->start <=> $b->start)
or
# if b is shorter than a, b->end will be
# less than a->end, and we want to select
# a, so want to return -1
($b->end <=> $a->end)
} @records;
+ }
my $new = $class->new(@_);
for my $set (@records) {
my $start = $set->start;
my $end = $set->end;
- my $value = $set->value;
+ my $value = $set->value;
+ my $type = $set->type;
if ($start > $new->max) {
- $new->_add_delete('+', $start, $end, $value);
+ $new->_add_delete('+', $start, $end, $value, Type => $type);
}
elsif ($end > $new->max) {
- $new->_add_delete('+', $new->max +1, $end, $value);
+ $new->_add_delete('+', $new->max +1, $end, $value,
+ Type => $type);
+ }
+ elsif ($input_count == 1) {
+ # Here, overlaps existing range, but is from a single input,
+ # so preserve the multiple values from that input.
+ $new->_add_delete('+', $start, $end, $value, Type => $type,
+ Replace => $MULTIPLE_AFTER);
}
}
# new and old values are identical, the
# replacement is skipped to save cycles
# => $IF_NOT_EQUIVALENT means to replace the existing values
- # with this one if they are not equivalent.
+ # (the default) with this one if they are not equivalent.
# Ranges are equivalent if their types are the
# same, and they are the same string; or if
# both are type 0 ranges, if their Unicode
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
+ # If this is an exact duplicate, including the
+ # value, the original will be moved to be
+ # first, before any other duplicate ranges
+ # with different values.
# => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
# FIFO, so that this one is inserted after all
- # others that currently exist.
+ # others that currently exist. If this is an
+ # exact duplicate, including value, of an
+ # existing range, this one is discarded
+ # (leaving the existing one in its original,
+ # higher priority position
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
return;
}
+ if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
+ Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
+ }
#local $to_trace = 1 if main::DEBUG;
if ($operation eq '-') {
}
# If to place this new record after, move to beyond all existing
- # ones.
+ # ones; but don't add this one if identical to any of them, as it
+ # isn't really a multiple. This leaves the original order, so
+ # that the current request is ignored. The reasoning is that the
+ # previous request that wanted this record to have high priority
+ # should have precedence.
if ($replace == $MULTIPLE_AFTER) {
while ($i < @$r && $r->[$i]->start == $start) {
+ return if $value eq $r->[$i]->value
+ && $type eq $r->[$i]->type;
$i++;
}
}
+ else {
+ # If instead we are to place this new record before any
+ # existing ones, remove any identical ones that come after it.
+ # This changes the existing order so that the new one is
+ # first, as is being requested.
+ for (my $j = $i + 1;
+ $j < @$r && $r->[$j]->start == $start;
+ $j++)
+ {
+ if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
+ splice @$r, $j, 1;
+ last; # There should only be one instance, so no
+ # need to keep looking
+ }
+ }
+ }
trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
my @return = splice @$r,
return $self->_union($other)
},
+ '+=' => sub { my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' += '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
+
+ return $self->_union($other)
+ },
'&' => sub { my $self = shift;
my $other = shift;
use overload
fallback => 0,
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
'!=' => \&main::_operator_not_equal,
'==' => \&main::_operator_equal,
;
my $status = delete $args{'Status'};
$status = $NORMAL unless defined $status;
- my $ucd = delete $args{'UCD'} // 1;
+ # An internal name does not get documented, unless overridden by the
+ # input.
+ my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
# Write a representation of the table to its file. It calls several
# functions furnished by sub-classes of this abstract base class to
# handle non-normal ranges, to add stuff before the table, and at its
- # end.
+ # end. If the table is to be written so that adjustments are
+ # required, this does that conversion.
my $self = shift;
+ my $use_adjustments = shift; # ? output in adjusted format or not
my $tab_stops = shift; # The number of tab stops over to put any
# comment.
my $suppress_value = shift; # Optional, if the value associated with
if ($annotate) {
- # if annotating each code point, must print 1 per line.
+ # If annotating each code point, must print 1 per line.
# The variable could point to a subroutine, and we don't want
# to lose that fact, so only set if not set already
$range_size_1 = 1 if ! $range_size_1;
);
}
+ # Values for previous time through the loop. Initialize to
+ # something that won't be adjacent to the first iteration;
+ # only $previous_end matters for that.
+ my $previous_start;
+ my $previous_end = -2;
+ my $previous_value;
+
+ # Values for next time through the portion of the loop that splits
+ # the range. 0 in $next_start means there is no remaining portion
+ # to deal with.
+ my $next_start = 0;
+ my $next_end;
+ my $next_value;
+ my $offset = 0;
+
# Output each range as part of the here document.
RANGE:
for my $set ($range_list{$addr}->ranges) {
next RANGE if defined $suppress_value
&& $value eq $suppress_value;
- # If there is a range and doesn't need a single point range
- # output
- if ($start != $end && ! $range_size_1) {
- push @OUT, sprintf "%04X\t%04X", $start, $end;
- $OUT[-1] .= "\t$value" if $value ne "";
-
- # Add a comment with the size of the range, if requested.
- # Expand Tabs to make sure they all start in the same
- # column, and then unexpand to use mostly tabs.
- if (! $output_range_counts{$addr}) {
- $OUT[-1] .= "\n";
+ { # This bare block encloses the scope where we may need to
+ # split a range (when outputting adjusteds), and each time
+ # through we handle the next portion of the original by
+ # ending the block with a 'redo'. The values to use for
+ # that next time through are set up just below in the
+ # scalars whose names begin with '$next_'.
+
+ if ($use_adjustments) {
+
+ # When converting to use adjustments, we can handle
+ # only single element ranges. Set up so that this
+ # time through the loop, we look at the first element,
+ # and the next time through, we start off with the
+ # remainder. Thus each time through we look at the
+ # first element of the range
+ if ($end != $start) {
+ $next_start = $start + 1;
+ $next_end = $end;
+ $next_value = $value;
+ $end = $start;
+ }
+
+ # The values for some of these tables are stored as
+ # hex strings. Convert those to decimal
+ $value = hex($value)
+ if $self->default_map eq $CODE_POINT
+ && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
+
+ # If this range is adjacent to the previous one, and
+ # the values in each are integers that are also
+ # adjacent (differ by 1), then this range really
+ # extends the previous one that is already in element
+ # $OUT[-1]. So we pop that element, and pretend that
+ # the range starts with whatever it started with.
+ # $offset is incremented by 1 each time so that it
+ # gives the current offset from the first element in
+ # the accumulating range, and we keep in $value the
+ # value of that first element.
+ if ($start == $previous_end + 1
+ && $value =~ /^ -? \d+ $/xa
+ && $previous_value =~ /^ -? \d+ $/xa
+ && ($value == ($previous_value + ++$offset)))
+ {
+ pop @OUT;
+ $start = $previous_start;
+ $value = $previous_value;
+ }
+ else {
+ $offset = 0;
+ }
+
+ # Save the current values for the next time through
+ # the loop.
+ $previous_start = $start;
+ $previous_end = $end;
+ $previous_value = $value;
}
- else {
- $OUT[-1] = Text::Tabs::expand($OUT[-1]);
- my $count = main::clarify_number($end - $start + 1);
- use integer;
-
- my $width = $tab_stops * 8 - 1;
- $OUT[-1] = sprintf("%-*s # [%s]\n",
- $width,
- $OUT[-1],
- $count);
- $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
+
+ # If there is a range and doesn't need a single point range
+ # output
+ if ($start != $end && ! $range_size_1) {
+ push @OUT, sprintf "%04X\t%04X", $start, $end;
+ $OUT[-1] .= "\t$value" if $value ne "";
+
+ # Add a comment with the size of the range, if
+ # requested. Expand Tabs to make sure they all start
+ # in the same column, and then unexpand to use mostly
+ # tabs.
+ if (! $output_range_counts{$addr}) {
+ $OUT[-1] .= "\n";
+ }
+ else {
+ $OUT[-1] = Text::Tabs::expand($OUT[-1]);
+ my $count = main::clarify_number($end - $start + 1);
+ use integer;
+
+ my $width = $tab_stops * 8 - 1;
+ $OUT[-1] = sprintf("%-*s # [%s]\n",
+ $width,
+ $OUT[-1],
+ $count);
+ $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
+ }
}
- next RANGE;
- }
- # Here to output a single code point per line
+ # Here to output a single code point per line.
+ # If not to annotate, use the simple formats
+ elsif (! $annotate) {
- # If not to annotate, use the simple formats
- if (! $annotate) {
+ # Use any passed in subroutine to output.
+ if (ref $range_size_1 eq 'CODE') {
+ for my $i ($start .. $end) {
+ push @OUT, &{$range_size_1}($i, $value);
+ }
+ }
+ else {
- # Use any passed in subroutine to output.
- if (ref $range_size_1 eq 'CODE') {
- for my $i ($start .. $end) {
- push @OUT, &{$range_size_1}($i, $value);
+ # Here, caller is ok with default output.
+ for (my $i = $start; $i <= $end; $i++) {
+ push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
+ }
}
}
else {
- # Here, caller is ok with default output.
+ # Here, wants annotation.
for (my $i = $start; $i <= $end; $i++) {
- push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
- }
- }
- next RANGE;
- }
- # Here, wants annotation.
- for (my $i = $start; $i <= $end; $i++) {
-
- # Get character information if don't have it already
- main::populate_char_info($i)
- if ! defined $viacode[$i];
- my $type = $annotate_char_type[$i];
-
- # Figure out if should output the next code points as part
- # of a range or not. If this is not in an annotation
- # range, then won't output as a range, so returns $i.
- # Otherwise use the end of the annotation range, but no
- # further than the maximum possible end point of the loop.
- my $range_end = main::min($annotate_ranges->value_of($i)
- || $i,
- $end);
-
- # Use a range if it is a range, and either is one of the
- # special annotation ranges, or the range is at most 3
- # long. This last case causes the algorithmically named
- # code points to be output individually in spans of at
- # most 3, as they are the ones whose $type is > 0.
- if ($range_end != $i
- && ( $type < 0 || $range_end - $i > 2))
- {
- # Here is to output a range. We don't allow a
- # caller-specified output format--just use the
- # standard one.
- push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
+ # Get character information if don't have it already
+ main::populate_char_info($i)
+ if ! defined $viacode[$i];
+ my $type = $annotate_char_type[$i];
+
+ # Figure out if should output the next code points
+ # as part of a range or not. If this is not in an
+ # annotation range, then won't output as a range,
+ # so returns $i. Otherwise use the end of the
+ # annotation range, but no further than the
+ # maximum possible end point of the loop.
+ my $range_end = main::min(
+ $annotate_ranges->value_of($i) || $i,
+ $end);
+
+ # Use a range if it is a range, and either is one
+ # of the special annotation ranges, or the range
+ # is at most 3 long. This last case causes the
+ # algorithmically named code points to be output
+ # individually in spans of at most 3, as they are
+ # the ones whose $type is > 0.
+ if ($range_end != $i
+ && ( $type < 0 || $range_end - $i > 2))
+ {
+ # Here is to output a range. We don't allow a
+ # caller-specified output format--just use the
+ # standard one.
+ push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
$range_end,
$value;
- my $range_name = $viacode[$i];
-
- # For the code points which end in their hex value, we
- # eliminate that from the output annotation, and
- # capitalize only the first letter of each word.
- if ($type == $CP_IN_NAME) {
- my $hex = sprintf "%04X", $i;
- $range_name =~ s/-$hex$//;
- my @words = split " ", $range_name;
- for my $word (@words) {
- $word = ucfirst(lc($word)) if $word ne 'CJK';
- }
- $range_name = join " ", @words;
- }
- elsif ($type == $HANGUL_SYLLABLE) {
- $range_name = "Hangul Syllable";
- }
+ my $range_name = $viacode[$i];
+
+ # For the code points which end in their hex
+ # value, we eliminate that from the output
+ # annotation, and capitalize only the first
+ # letter of each word.
+ if ($type == $CP_IN_NAME) {
+ my $hex = sprintf "%04X", $i;
+ $range_name =~ s/-$hex$//;
+ my @words = split " ", $range_name;
+ for my $word (@words) {
+ $word =
+ ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $range_name = join " ", @words;
+ }
+ elsif ($type == $HANGUL_SYLLABLE) {
+ $range_name = "Hangul Syllable";
+ }
- $OUT[-1] .= " $range_name" if $range_name;
+ $OUT[-1] .= " $range_name" if $range_name;
- # Include the number of code points in the range
- my $count = main::clarify_number($range_end - $i + 1);
- $OUT[-1] .= " [$count]\n";
+ # Include the number of code points in the
+ # range
+ my $count =
+ main::clarify_number($range_end - $i + 1);
+ $OUT[-1] .= " [$count]\n";
- # Skip to the end of the range
- $i = $range_end;
- }
- else { # Not in a range.
- my $comment = "";
-
- # When outputting the names of each character, use
- # the character itself if printable
- $comment .= "'" . chr($i) . "' " if $printable[$i];
-
- # To make it more readable, use a minimum indentation
- my $comment_indent;
-
- # Determine the annotation
- if ($format eq $DECOMP_STRING_FORMAT) {
-
- # This is very specialized, with the type of
- # decomposition beginning the line enclosed in
- # <...>, and the code points that the code point
- # decomposes to separated by blanks. Create two
- # strings, one of the printable characters, and
- # one of their official names.
- (my $map = $value) =~ s/ \ * < .*? > \ +//x;
- my $tostr = "";
- my $to_name = "";
- my $to_chr = "";
- foreach my $to (split " ", $map) {
- $to = CORE::hex $to;
- $to_name .= " + " if $to_name;
- $to_chr .= chr($to);
- main::populate_char_info($to)
- if ! defined $viacode[$to];
- $to_name .= $viacode[$to];
+ # Skip to the end of the range
+ $i = $range_end;
}
+ else { # Not in a range.
+ my $comment = "";
+
+ # When outputting the names of each character,
+ # use the character itself if printable
+ $comment .= "'" . chr($i) . "' "
+ if $printable[$i];
+
+ # To make it more readable, use a minimum
+ # indentation
+ my $comment_indent;
+
+ # Determine the annotation
+ if ($format eq $DECOMP_STRING_FORMAT) {
+
+ # This is very specialized, with the type
+ # of decomposition beginning the line
+ # enclosed in <...>, and the code points
+ # that the code point decomposes to
+ # separated by blanks. Create two
+ # strings, one of the printable
+ # characters, and one of their official
+ # names.
+ (my $map = $value) =~ s/ \ * < .*? > \ +//x;
+ my $tostr = "";
+ my $to_name = "";
+ my $to_chr = "";
+ foreach my $to (split " ", $map) {
+ $to = CORE::hex $to;
+ $to_name .= " + " if $to_name;
+ $to_chr .= chr($to);
+ main::populate_char_info($to)
+ if ! defined $viacode[$to];
+ $to_name .= $viacode[$to];
+ }
- $comment .=
+ $comment .=
"=> '$to_chr'; $viacode[$i] => $to_name";
- $comment_indent = 25; # Determined by experiment
- }
- else {
-
- # Assume that any table that has hex format is a
- # mapping of one code point to another.
- if ($format eq $HEX_FORMAT) {
- my $decimal_value = CORE::hex $value;
- main::populate_char_info($decimal_value)
+ $comment_indent = 25; # Determined by
+ # experiment
+ }
+ else {
+
+ # Assume that any table that has hex
+ # format is a mapping of one code point to
+ # another.
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ main::populate_char_info($decimal_value)
if ! defined $viacode[$decimal_value];
- $comment .= "=> '"
- . chr($decimal_value)
- . "'; " if $printable[$decimal_value];
- }
- $comment .= $viacode[$i] if $include_name
- && $viacode[$i];
- if ($format eq $HEX_FORMAT) {
- my $decimal_value = CORE::hex $value;
- $comment .= " => $viacode[$decimal_value]"
- if $viacode[$decimal_value];
- }
+ $comment .= "=> '"
+ . chr($decimal_value)
+ . "'; " if $printable[$decimal_value];
+ }
+ $comment .= $viacode[$i] if $include_name
+ && $viacode[$i];
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ $comment .=
+ " => $viacode[$decimal_value]"
+ if $viacode[$decimal_value];
+ }
- # If including the name, no need to indent, as the
- # name will already be way across the line.
- $comment_indent = ($include_name) ? 0 : 60;
- }
+ # If including the name, no need to
+ # indent, as the name will already be way
+ # across the line.
+ $comment_indent = ($include_name) ? 0 : 60;
+ }
- # Use any passed in routine to output the base part of
- # the line.
- if (ref $range_size_1 eq 'CODE') {
- my $base_part = &{$range_size_1}($i, $value);
- chomp $base_part;
- push @OUT, $base_part;
- }
- else {
- push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ # Use any passed in routine to output the base
+ # part of the line.
+ if (ref $range_size_1 eq 'CODE') {
+ my $base_part=&{$range_size_1}($i, $value);
+ chomp $base_part;
+ push @OUT, $base_part;
+ }
+ else {
+ push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ }
+
+ # And add the annotation.
+ $OUT[-1] = sprintf "%-*s\t# %s",
+ $comment_indent,
+ $OUT[-1],
+ $comment
+ if $comment;
+ $OUT[-1] .= "\n";
+ }
}
+ }
- # And add the annotation.
- $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
- $OUT[-1],
- $comment if $comment;
- $OUT[-1] .= "\n";
+ # If we split the range, set up so the next time through
+ # we get the remainder, and redo.
+ if ($next_start) {
+ $start = $next_start;
+ $end = $next_end;
+ $value = $next_value;
+ $next_start = 0;
+ redo;
}
}
} # End of loop through all the table's ranges
'readable_array');
my %to_output_map;
- # Enum as to whether or not to write out this map table:
+ # Enum as to whether or not to write out this map table, and how:
# 0 don't output
# $EXTERNAL_MAP means its existence is noted in the documentation, and
# it should not be removed nor its format changed. This
# output.
# $INTERNAL_MAP means Perl reserves the right to do anything it wants
# with this file
+ # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
+ # outputting the actual mappings as-is, we adjust things
+ # to create a much more compact table. Only those few
+ # tables where the mapping is convertible at least to an
+ # integer and compacting makes a big difference should
+ # have this. Hence, the default is to not do this
+ # unless the table's default mapping is to $CODE_POINT,
+ # and the range size is not 1.
main::set_access('to_output_map', \%to_output_map, 's');
-
sub new {
my $class = shift;
my $name = shift;
my $default_map = delete $args{'Default_Map'};
my $property = delete $args{'_Property'};
my $full_name = delete $args{'Full_Name'};
+ my $to_output_map = delete $args{'To_Output_Map'};
# Rest of parameters passed on
$anomalous_entries{$addr} = [];
$default_map{$addr} = $default_map;
+ $to_output_map{$addr} = $to_output_map;
$self->initialize($initialize) if defined $initialize;
# Don't want to output binary map tables even for debugging.
return 0 if $type == $BINARY;
- # But do want to output string ones.
- return $EXTERNAL_MAP if $type == $STRING;
+ # But do want to output string ones. All the ones that remain to
+ # be dealt with (i.e. which haven't explicitly been set to external)
+ # are for internal Perl use only. The default for those that map to
+ # $CODE_POINT and haven't been restricted to a single element range
+ # is to use the adjusted form.
+ if ($type == $STRING) {
+ return $INTERNAL_MAP if $self->range_size_1
+ || $default_map{$addr} ne $CODE_POINT;
+ return $OUTPUT_ADJUSTED;
+ }
# Otherwise is an $ENUM, do output it, for Perl's purposes
return $INTERNAL_MAP;
my $return = $self->SUPER::header();
- if ($self->to_output_map == $INTERNAL_MAP) {
+ if ($self->to_output_map >= $INTERNAL_MAP) {
$return .= $INTERNAL_ONLY_HEADER;
}
else {
- my $property_name = $self->property->full_name;
+ my $property_name = $self->property->full_name =~ s/Legacy_//r;
$return .= <<END;
# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
my $comment = "";
my $status = $self->status;
- if ($status) {
+ if ($status && $status ne $PLACEHOLDER) {
my $warn = uc $status_past_participles{$status};
$comment .= <<END;
}
$comment .= "This file returns the $mapping:\n";
+ my $ucd_accessible_name = "";
+ my $full_name = $self->property->full_name;
for my $i (0 .. @property_aliases - 1) {
- $comment .= sprintf("%-8s%s\n",
- " ",
- $property_aliases[$i]->name . '(cp)'
- );
+ my $name = $property_aliases[$i]->name;
+ $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
+ if ($property_aliases[$i]->ucd) {
+ if ($name eq $full_name) {
+ $ucd_accessible_name = $full_name;
+ }
+ elsif (! $ucd_accessible_name) {
+ $ucd_accessible_name = $name;
+ }
+ }
+ }
+ $comment .= "\nwhere 'cp' is $cp.";
+ if ($ucd_accessible_name) {
+ $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
}
- my $full_name = $self->property->full_name;
- $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
# And append any commentary already set from the actual property.
$comment .= "\n\n" . $self->comment if $self->comment;
my $format = $self->format;
- my $return = <<END;
+ my $return = "";
+
+ my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
+ if ($output_adjusted) {
+ if ($specials_name) {
+ $return .= <<END;
+# The mappings in the non-hash portion of this file must be modified to get the
+# correct values by adding the code point ordinal number to each one that is
+# numeric.
+END
+ }
+ else {
+ $return .= <<END;
+# The mappings must be modified to get the correct values by adding the code
+# point ordinal number to each one that is numeric.
+END
+ }
+ }
+
+ $return .= <<END;
+
# The name this swash is to be known by, with the format of the mappings in
# the main body of the table, and what all code points missing from this file
# map to.
\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
END
if ($specials_name) {
- $return .= <<END;
+ $return .= <<END;
\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
END
}
my $default_map = $default_map{$addr};
- $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
+
+ # For $CODE_POINT default maps and using adjustments, instead the default
+ # becomes zero.
+ $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
+ . (($output_adjusted && $default_map eq $CODE_POINT)
+ ? "0"
+ : $default_map)
+ . "';";
if ($default_map eq $CODE_POINT) {
$return .= ' # code point maps to itself';
if $format eq $FLOAT_FORMAT
&& $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
$format = $HEX_FORMAT
- if $format eq $RATIONAL_FORMAT
- && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
+ if ($format eq $RATIONAL_FORMAT
+ && $map !~
+ m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
+ # Assume a leading zero means hex,
+ # even if all digits are 0-9
+ || ($format eq $INTEGER_FORMAT
+ && $map =~ /^0[0-9A-F]/);
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
}
Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
}
- $self->_set_format($format);
+ # If the output is to be adjusted, the format of the table that gets
+ # output is actually 'a' instead of whatever it is stored internally
+ # as.
+ my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
+ if ($output_adjusted) {
+ $format = $ADJUST_FORMAT;
+ }
- # Core Perl has a different definition of mapping ranges than we do,
- # that is applicable mainly to mapping code points, so for tables
- # where it is possible that core Perl could be used to read it,
- # make it range size 1 to prevent possible confusion
- $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
+ $self->_set_format($format);
return $self->SUPER::write(
+ $output_adjusted,
($self->property == $block)
? 7 # block file needs more tab stops
: 3,
'+=' => sub {
my $self = shift;
my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' += '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
return if $self->carp_if_locked;
}
return $self;
},
+ '&=' => sub {
+ my $self = shift;
+ my $other = shift;
+
+ return if $self->carp_if_locked;
+ $self->_set_range_list($self->_range_list & $other);
+ return $self;
+ },
'-' => sub { my $self = shift;
my $other = shift;
my $reversed = shift;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- return $self->SUPER::write(2); # 2 tab stops
+ return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
}
sub set_final_comment {
# \p{}'s
my @global_comments; # List of all the tables' comments that are
# there before this routine was called.
+ my $has_ucd_alias = 0; # If there is an alias that is accessible via
+ # Unicode::UCD. If not, then don't say it is
+ # in the comment
# Get list of all the parent tables that are equivalent to this one
# (including itself).
[$i % @table_aliases];
my $table_alias = $table_alias_object->name;
my $loose_match = $table_alias_object->loose_match;
+ $has_ucd_alias |= $table_alias_object->ucd;
if ($table_alias !~ /\D/) { # Clarify large numbers.
$table_alias = main::clarify_number($table_alias)
my $flag = $property->status
|| $table->status
|| $table_alias_object->status;
- $flags{$flag} = $status_past_participles{$flag} if $flag;
+ if ($flag && $flag ne $PLACEHOLDER) {
+ $flags{$flag} = $status_past_participles{$flag};
+ }
$loose_count++;
$any_of_these = 'any of these'
}
- my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
+ my $comment = "";
+ if ($has_ucd_alias) {
+ $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
+ }
if ($has_unrelated) {
$comment .= <<END;
This file is for tables that are not necessarily related: To conserve
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
'==' => \&main::_operator_equal,
'!=' => \&main::_operator_not_equal,
'=' => sub { return shift },
: "$self$other";
}
+sub _operator_dot_equal {
+ # Overloaded '.=' method that is common to all packages.
+
+ my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ $other = "" unless defined $other;
+
+ if ($reversed) {
+ return $other .= "$self";
+ }
+ else {
+ return "$self" . "$other";
+ }
+}
+
sub _operator_equal {
# Generic overloaded '==' routine. To be equal, they must be the exact
# same object
# Perl adds this alias.
$gc->add_alias('Category');
- # For backwards compatibility, these property files have particular names.
- property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
- # utf8.c calls it
- property_ref('Lowercase_Mapping')->set_file('Lower');
- property_ref('Titlecase_Mapping')->set_file('Title');
-
- my $fold = property_ref('Case_Folding');
- $fold->set_file('Fold') if defined $fold;
-
# Unicode::Normalize expects this file with this name and directory.
my $ccc = property_ref('Canonical_Combining_Class');
if (defined $ccc) {
$ccc->set_directory(File::Spec->curdir());
}
- # utf8.c has a different meaning for non range-size-1 for map properties
- # that this program doesn't currently handle; and even if it were changed
- # to do so, some other code may be using them expecting range size 1.
- foreach my $property (qw {
- Case_Folding
- Lowercase_Mapping
- Titlecase_Mapping
- Uppercase_Mapping
- })
- {
- property_ref($property)->set_range_size_1(1);
- }
-
# These two properties aren't actually used in the core, but unfortunately
# the names just above that are in the core interfere with these, so
# choose different names. These aren't a problem unless the map tables
$urs->add_alias('kRSUnicode');
}
}
+
+ # For backwards compatibility with applications that may read the mapping
+ # file directly (it was documented in 5.12 and 5.14 as being thusly
+ # usable), keep it from being adjusted. (range_size_1 is
+ # used to force the traditional format.)
+ if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
+ $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
+ $nfkc_cf->set_range_size_1(1);
+ }
+ if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
+ $bmg->set_to_output_map($EXTERNAL_MAP);
+ $bmg->set_range_size_1(1);
+ }
+
+ property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
+
return;
}
# Process each line of the file ...
while ($file->next_line) {
+ # Fix typo in input file
+ s/CCC133/CCC132/g if $v_version eq v6.1.0;
+
my ($property, @data) = split /\s*;\s*/;
# The ccc property has an extra field at the beginning, which is the
return @return;
}
+sub process_NormalizationsTest {
+
+ # Each line looks like:
+ # source code point; NFC; NFD; NFKC; NFKD
+ # e.g.
+ # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
+
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # Process each line of the file ...
+ while ($file->next_line) {
+
+ next if /^@/;
+
+ my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
+
+ foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
+ $$var = pack "U0U*", map { hex } split " ", $$var;
+ $$var =~ s/(\\)/$1$1/g;
+ }
+
+ push @normalization_tests,
+ "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
+ } # End of looping through the file
+}
+
sub output_perl_charnames_line ($$) {
# Output the entries in Perl_charnames specially, using 5 digits instead
my $input_field_count = $i;
# This routine in addition outputs these extra fields:
+
my $DECOMP_TYPE = $i++; # Decomposition type
# These fields are modifications of ones above, and are usually
# body of the table
Map_Type => $COMPUTE_NO_MULTI_CP,
Type => $STRING,
+ To_Output_Map => $INTERNAL_MAP,
);
$Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
$Perl_decomp->add_comment(join_lines(<<END
my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
Default_Map => "",
Perl_Extension => 1,
- File => 'Digit', # Trad. location
Directory => $map_directory,
Type => $STRING,
- Range_Size_1 => 1,
+ To_Output_Map => $OUTPUT_ADJUSTED,
);
$Decimal_Digit->add_comment(join_lines(<<END
This file gives the mapping of all code points which represent a single
-decimal digit [0-9] to their respective digits. For example, the code point
-U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
-that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
-numerals.
+decimal digit [0-9] to their respective digits, but it has ranges of 10 code
+points, and the mapping of each non-initial element of each range is actually
+not to "0", but to the offset that element has from its corresponding DIGIT 0.
+These code points are those that have Numeric_Type=Decimal; not special
+things, like subscripts nor Roman numerals.
END
));
$file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
$file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
+ $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
$fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
}
elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
# Some code points in this file have the pseudo-name
# '<control>', but the official name for such ones is the null
- # string. For charnames.pm, we use the Unicode version 1 name
- $fields[$NAME] = "";
- $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
+ # string.
+ $fields[$NAME] = $fields[$CHARNAME] = "";
# We had better not be in between range lines.
if ($in_range) {
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Flush the buffers.
- foreach my $i (1 .. $last_field) {
+ foreach my $i (0 .. $last_field) {
$file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
}
# http://www.unicode.org/versions/corrigendum8.html
$fields[$BIDI] = "AL";
}
- elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
+ elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
$fields[$CHARNAME] = "";
}
# simple ones are in UnicodeData.txt, which should already have been
# read in to the full property data structures, so as to initialize
# these with the simple ones. Then the SpecialCasing.txt entries
- # overwrite the ones which have different full mappings.
+ # add or overwrite the ones which have different full mappings.
# This routine sees if the simple mappings are to be output, and if
# so, copies what has already been put into the full mapping tables,
# relatively few entries in them that have different full mappings,
# and thus skip the simple mapping tables altogether.
- # New tables with just the simple mappings that are overridden by the
- # full ones are constructed. These are for Unicode::UCD, which
- # requires the simple mappings. The Case_Folding table is a combined
- # table of both the simple and full mappings, with the full ones being
- # in the hash, and the simple ones, even those overridden by the hash,
- # being in the base table. That same mechanism could have been
- # employed here, except that the docs have said that the generated
- # files are usuable directly by programs, so we dare not change the
- # format in any way.
-
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
$uc = property_ref('uc');
# For each of the case change mappings...
- foreach my $case_table ($lc, $tc, $uc) {
- my $case = $case_table->name;
- my $full = property_ref($case);
- unless (defined $full && ! $full->is_empty) {
+ foreach my $full_table ($lc, $tc, $uc) {
+ my $full_name = $full_table->name;
+ unless (defined $full_table && ! $full_table->is_empty) {
Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
}
- # The simple version's name in each mapping merely has an 's' in
- # front of the full one's
- my $simple_name = 's' . $case;
- my $simple = property_ref($simple_name);
- $simple->initialize($full) if $simple->to_output_map();
+ # Create a table in the old-style format and with the original
+ # file name for backwards compatibility with applications that
+ # read it directly. The new tables contain both the simple and
+ # full maps, and the old are missing simple maps when there is a
+ # conflicting full one. Probably it would have been ok to add
+ # those to the legacy version, as was already done in 5.14 to the
+ # case folding one, but this was not done, out of an abundance of
+ # caution. The tables are set up here before we deal with the
+ # full maps so that as we handle those, we can override the simple
+ # maps for them in the legacy table, and merely add them in the
+ # new-style one.
+ my $legacy = Property->new("Legacy_" . $full_table->full_name,
+ File => $full_table->full_name =~
+ s/case_Mapping//r,
+ Range_Size_1 => 1,
+ Format => $HEX_FORMAT,
+ Default_Map => $CODE_POINT,
+ UCD => 0,
+ Initialize => $full_table,
+ To_Output_Map => $EXTERNAL_MAP,
+ );
- my $simple_only = Property->new("_s$case",
- Type => $STRING,
- Default_Map => $CODE_POINT,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
- $simple_only->set_to_output_map($INTERNAL_MAP);
- $simple_only->add_comment(join_lines( <<END
-This file is for UCD.pm so that it can construct simple mappings that would
-otherwise be lost because they are overridden by full mappings.
+ $full_table->add_comment(join_lines( <<END
+This file includes both the simple and full case changing maps. The simple
+ones are in the main body of the table below, and the full ones adding to or
+overriding them are in the hash.
END
));
+ # The simple version's name in each mapping merely has an 's' in
+ # front of the full one's
+ my $simple_name = 's' . $full_name;
+ my $simple = property_ref($simple_name);
+ $simple->initialize($full_table) if $simple->to_output_map();
+
unless ($simple->to_output_map()) {
- $simple_only->set_proxy_for($simple_name);
+ $full_table->set_proxy_for($simple_name);
}
}
return;
}
- $_ = "$fields[0]; lc; $fields[1]";
- $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
- $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
+ my $decimal_code_point = hex $fields[0];
- # Copy any simple case change to the special tables constructed if
- # being overridden by a multi-character case change.
- if ($fields[1] ne $fields[0]
- && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _slc; $value");
- }
- if ($fields[2] ne $fields[0]
- && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _stc; $value");
- }
- if ($fields[3] ne $fields[0]
- && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _suc; $value");
+ # Loop to handle each of the three mappings in the input line, in
+ # order, with $i indicating the current field number.
+ my $i = 0;
+ for my $object ($lc, $tc, $uc) {
+ $i++; # First time through, $i = 0 ... 3rd time = 3
+
+ my $value = $object->value_of($decimal_code_point);
+ $value = ($value eq $CODE_POINT)
+ ? $decimal_code_point
+ : hex $value;
+
+ # If this isn't a multi-character mapping, it should already have
+ # been read in.
+ if ($fields[$i] !~ / /) {
+ if ($value != hex $fields[$i]) {
+ Carp::my_carp("Bad news. UnicodeData.txt thinks "
+ . $object->name
+ . "(0x$fields[0]) is $value"
+ . " and SpecialCasing.txt thinks it is "
+ . hex($fields[$i])
+ . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
+ }
+ }
+ else {
+
+ # The mapping goes into both the legacy table, in which it
+ # replaces the simple one...
+ $file->insert_adjusted_lines("$fields[0]; Legacy_"
+ . $object->full_name
+ . "; $fields[$i]");
+
+ # ... and, the The regular table, in which it is additional,
+ # beyond the simple mapping.
+ $file->insert_adjusted_lines("$fields[0]; "
+ . $object->name
+ . "; "
+ . $CMD_DELIM
+ . "$REPLACE_CMD=$MULTIPLE_BEFORE"
+ . $CMD_DELIM
+ . $fields[$i]);
+ }
}
+ # Everything has been handled by the insert_adjusted_lines()
+ $_ = "";
+
return;
}
}
$non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
Perl_Extension => 1,
Fate => $INTERNAL_ONLY,
- Description => "Code points that particpate in a multi-char fold not in the final position",
+ Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
);
# If we ever wanted to show that these tables were combined, a new
my $scx = property_ref("Script_Extensions");
$scx = Property->new("scx", Full_Name => "Script_Extensions")
- if ! defined $scx;
+ if ! defined $scx;
$scx->_set_format($STRING_WHITE_SPACE_LIST);
$scx->initialize($script);
$scx->set_default_map($script->default_map);
# 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
my @fields = split /\s*;\s*/;
+
+ # This script was erroneously omitted in this Unicode version.
+ $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
+
my @full_names;
foreach my $short_name (split " ", $fields[1]) {
push @full_names, $script->table($short_name)->full_name;
}
sub setup_early_name_alias {
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # This has the effect of pretending that the Name_Alias property was
+ # available in all Unicode releases. Strictly speaking, this property
+ # should not be availabe in early releases, but doing this allows
+ # charnames.pm to work on older releases without change. Prior to v5.16
+ # it had these names hard-coded inside it. Unicode 6.1 came along and
+ # created these names, and so they were removed from charnames.
+
my $aliases = property_ref('Name_Alias');
- $aliases = Property->new('Name_Alias') if ! defined $aliases;
+ if (! defined $aliases) {
+ $aliases = Property->new('Name_Alias', Default_Map => "");
+ }
+
+ $file->insert_lines(get_old_name_aliases());
- # Before 6.0, this wasn't a problem, and after it, this alias is part of
- # the Unicode-delivered file.
- $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
return;
}
+sub get_old_name_aliases () {
+
+ # The Unicode_1_Name field, contains most of these names. One would
+ # expect, given the field's name, that its values would be fixed across
+ # versions, giving the true Unicode version 1 name for the character.
+ # Sadly, this is not the case. Actually Version 1.1.5 had no names for
+ # any of the controls; Version 2.0 introduced names for the C0 controls,
+ # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
+ # changed some names: it
+ # changed to parenthesized versions like "NEXT LINE" to
+ # "NEXT LINE (NEL)";
+ # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
+ # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
+ # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
+ # This list contains all the names that were defined so that
+ # charnames::vianame(), etc. understand them all EVEN if this version of
+ # Unicode didn't specify them (this could be construed as a bug).
+ # mktables elsewhere gives preference to the Unicode_1_Name field over
+ # these names, so that viacode() will return the correct value for that
+ # version of Unicode, except when that version doesn't define a name,
+ # viacode() will return one anyway (this also could be construed as a
+ # bug). But these potential "bugs" allow for the smooth working of code
+ # on earlier Unicode releases.
+
+ my @return = split /\n/, <<'END';
+0000;NULL;control
+0000;NUL;abbreviation
+0001;START OF HEADING;control
+0001;SOH;abbreviation
+0002;START OF TEXT;control
+0002;STX;abbreviation
+0003;END OF TEXT;control
+0003;ETX;abbreviation
+0004;END OF TRANSMISSION;control
+0004;EOT;abbreviation
+0005;ENQUIRY;control
+0005;ENQ;abbreviation
+0006;ACKNOWLEDGE;control
+0006;ACK;abbreviation
+0007;BELL;control
+0007;BEL;abbreviation
+0008;BACKSPACE;control
+0008;BS;abbreviation
+0009;CHARACTER TABULATION;control
+0009;HORIZONTAL TABULATION;control
+0009;HT;abbreviation
+0009;TAB;abbreviation
+000A;LINE FEED;control
+000A;LINE FEED (LF);control
+000A;NEW LINE;control
+000A;END OF LINE;control
+000A;LF;abbreviation
+000A;NL;abbreviation
+000A;EOL;abbreviation
+000B;LINE TABULATION;control
+000B;VERTICAL TABULATION;control
+000B;VT;abbreviation
+000C;FORM FEED;control
+000C;FORM FEED (FF);control
+000C;FF;abbreviation
+000D;CARRIAGE RETURN;control
+000D;CARRIAGE RETURN (CR);control
+000D;CR;abbreviation
+000E;SHIFT OUT;control
+000E;LOCKING-SHIFT ONE;control
+000E;SO;abbreviation
+000F;SHIFT IN;control
+000F;LOCKING-SHIFT ZERO;control
+000F;SI;abbreviation
+0010;DATA LINK ESCAPE;control
+0010;DLE;abbreviation
+0011;DEVICE CONTROL ONE;control
+0011;DC1;abbreviation
+0012;DEVICE CONTROL TWO;control
+0012;DC2;abbreviation
+0013;DEVICE CONTROL THREE;control
+0013;DC3;abbreviation
+0014;DEVICE CONTROL FOUR;control
+0014;DC4;abbreviation
+0015;NEGATIVE ACKNOWLEDGE;control
+0015;NAK;abbreviation
+0016;SYNCHRONOUS IDLE;control
+0016;SYN;abbreviation
+0017;END OF TRANSMISSION BLOCK;control
+0017;ETB;abbreviation
+0018;CANCEL;control
+0018;CAN;abbreviation
+0019;END OF MEDIUM;control
+0019;EOM;abbreviation
+001A;SUBSTITUTE;control
+001A;SUB;abbreviation
+001B;ESCAPE;control
+001B;ESC;abbreviation
+001C;INFORMATION SEPARATOR FOUR;control
+001C;FILE SEPARATOR;control
+001C;FS;abbreviation
+001D;INFORMATION SEPARATOR THREE;control
+001D;GROUP SEPARATOR;control
+001D;GS;abbreviation
+001E;INFORMATION SEPARATOR TWO;control
+001E;RECORD SEPARATOR;control
+001E;RS;abbreviation
+001F;INFORMATION SEPARATOR ONE;control
+001F;UNIT SEPARATOR;control
+001F;US;abbreviation
+0020;SP;abbreviation
+007F;DELETE;control
+007F;DEL;abbreviation
+0080;PADDING CHARACTER;figment
+0080;PAD;abbreviation
+0081;HIGH OCTET PRESET;figment
+0081;HOP;abbreviation
+0082;BREAK PERMITTED HERE;control
+0082;BPH;abbreviation
+0083;NO BREAK HERE;control
+0083;NBH;abbreviation
+0084;INDEX;control
+0084;IND;abbreviation
+0085;NEXT LINE;control
+0085;NEXT LINE (NEL);control
+0085;NEL;abbreviation
+0086;START OF SELECTED AREA;control
+0086;SSA;abbreviation
+0087;END OF SELECTED AREA;control
+0087;ESA;abbreviation
+0088;CHARACTER TABULATION SET;control
+0088;HORIZONTAL TABULATION SET;control
+0088;HTS;abbreviation
+0089;CHARACTER TABULATION WITH JUSTIFICATION;control
+0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
+0089;HTJ;abbreviation
+008A;LINE TABULATION SET;control
+008A;VERTICAL TABULATION SET;control
+008A;VTS;abbreviation
+008B;PARTIAL LINE FORWARD;control
+008B;PARTIAL LINE DOWN;control
+008B;PLD;abbreviation
+008C;PARTIAL LINE BACKWARD;control
+008C;PARTIAL LINE UP;control
+008C;PLU;abbreviation
+008D;REVERSE LINE FEED;control
+008D;REVERSE INDEX;control
+008D;RI;abbreviation
+008E;SINGLE SHIFT TWO;control
+008E;SINGLE-SHIFT-2;control
+008E;SS2;abbreviation
+008F;SINGLE SHIFT THREE;control
+008F;SINGLE-SHIFT-3;control
+008F;SS3;abbreviation
+0090;DEVICE CONTROL STRING;control
+0090;DCS;abbreviation
+0091;PRIVATE USE ONE;control
+0091;PRIVATE USE-1;control
+0091;PU1;abbreviation
+0092;PRIVATE USE TWO;control
+0092;PRIVATE USE-2;control
+0092;PU2;abbreviation
+0093;SET TRANSMIT STATE;control
+0093;STS;abbreviation
+0094;CANCEL CHARACTER;control
+0094;CCH;abbreviation
+0095;MESSAGE WAITING;control
+0095;MW;abbreviation
+0096;START OF GUARDED AREA;control
+0096;START OF PROTECTED AREA;control
+0096;SPA;abbreviation
+0097;END OF GUARDED AREA;control
+0097;END OF PROTECTED AREA;control
+0097;EPA;abbreviation
+0098;START OF STRING;control
+0098;SOS;abbreviation
+0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
+0099;SGC;abbreviation
+009A;SINGLE CHARACTER INTRODUCER;control
+009A;SCI;abbreviation
+009B;CONTROL SEQUENCE INTRODUCER;control
+009B;CSI;abbreviation
+009C;STRING TERMINATOR;control
+009C;ST;abbreviation
+009D;OPERATING SYSTEM COMMAND;control
+009D;OSC;abbreviation
+009E;PRIVACY MESSAGE;control
+009E;PM;abbreviation
+009F;APPLICATION PROGRAM COMMAND;control
+009F;APC;abbreviation
+00A0;NBSP;abbreviation
+00AD;SHY;abbreviation
+200B;ZWSP;abbreviation
+200C;ZWNJ;abbreviation
+200D;ZWJ;abbreviation
+200E;LRM;abbreviation
+200F;RLM;abbreviation
+202A;LRE;abbreviation
+202B;RLE;abbreviation
+202C;PDF;abbreviation
+202D;LRO;abbreviation
+202E;RLO;abbreviation
+FEFF;BYTE ORDER MARK;alternate
+FEFF;BOM;abbreviation
+FEFF;ZWNBSP;abbreviation
+END
+
+ if ($v_version ge v3.0.0) {
+ push @return, split /\n/, <<'END';
+180B; FVS1; abbreviation
+180C; FVS2; abbreviation
+180D; FVS3; abbreviation
+180E; MVS; abbreviation
+202F; NNBSP; abbreviation
+END
+ }
+
+ if ($v_version ge v3.2.0) {
+ push @return, split /\n/, <<'END';
+034F; CGJ; abbreviation
+205F; MMSP; abbreviation
+2060; WJ; abbreviation
+END
+ # Add in VS1..VS16
+ my $cp = 0xFE00 - 1;
+ for my $i (1..16) {
+ push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+ }
+ }
+ if ($v_version ge v4.0.0) { # Add in VS17..VS256
+ my $cp = 0xE0100 - 17;
+ for my $i (17..256) {
+ push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+ }
+ }
+
+ # ALERT did not come along until 6.0, at which point it became preferred
+ # over BELL, and was never in the Unicode_1_Name field. For the same
+ # reasons, that the other names are made known to all releases by this
+ # function, we make ALERT known too. By inserting it
+ # last in early releases, BELL is preferred over it; and vice-vers in 6.0
+ my $alert = '0007; ALERT; control';
+ if ($v_version lt v6.0.0) {
+ push @return, $alert;
+ }
+ else {
+ unshift @return, $alert;
+ }
+
+ return @return;
+}
+
sub filter_later_version_name_alias_line {
# This file has an extra entry per line for the alias type. This is
sub filter_early_version_name_alias_line {
# Early versions did not have the trailing alias type field; implicitly it
- # was 'correction'
- $_ .= "; correction";
+ # was 'correction'. But our synthetic lines we add in this program do
+ # have it, so test for the type field.
+ $_ .= "; correction" if $_ !~ /;.*;/;
+
filter_later_version_name_alias_line;
return;
}
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
- # 1) Adds the mappings for code points missing from the files which have
+ # 1) Creates properties that are missing from the version of Unicode being
+ # compiled, and which, for whatever reason, are needed for the Perl
+ # core to function properly. These are minimally populated as
+ # necessary.
+ # 2) Adds the mappings for code points missing from the files which have
# defaults specified for them.
- # 2) At this this point all mappings are known, so it computes the type of
+ # 3) At this this point all mappings are known, so it computes the type of
# each property whose type hasn't been determined yet.
- # 3) Calculates all the regular expression match tables based on the
+ # 4) Calculates all the regular expression match tables based on the
# mappings.
- # 3) Calculates and adds the tables which are defined by Unicode, but
- # which aren't derived by them
+ # 5) Calculates and adds the tables which are defined by Unicode, but
+ # which aren't derived by them, and certain derived tables that Perl
+ # uses.
+
+ # GCB and hst are not in early Unicode releases; create dummy ones if
+ # they don't exist, as the core needs tables generated from them.
+ my $gcb = property_ref('Grapheme_Cluster_Break');
+ if (! defined $gcb) {
+ $gcb = Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
+ Status => $PLACEHOLDER,
+ Type => $ENUM,
+ Default_Map => 'Other');
+ }
+ my $hst = property_ref('Hangul_Syllable_Type');
+ if (!defined $hst) {
+ $hst = Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
+ Status => $PLACEHOLDER,
+ Type => $ENUM,
+ Default_Map => 'Not_Applicable');
+ }
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
Lowercase_Mapping
Titlecase_Mapping
Case_Folding
- } ) {
+ } )
+ {
my $full = property_ref($map);
if ($full->is_empty) {
my $simple = property_ref('Simple_' . $map);
}
}
+ # Create digit and case fold tables with the original file names for
+ # backwards compatibility with applications that read them directly.
+ my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
+ Default_Map => "",
+ Perl_Extension => 1,
+ File => 'Digit', # Trad. location
+ Directory => $map_directory,
+ UCD => 0,
+ Type => $STRING,
+ To_Output_Map => $EXTERNAL_MAP,
+ Range_Size_1 => 1,
+ Initialize => property_ref('Perl_Decimal_Digit'),
+ );
+ $Digit->add_comment(join_lines(<<END
+This file gives the mapping of all code points which represent a single
+decimal digit [0-9] to their respective digits. For example, the code point
+U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
+that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
+numerals.
+END
+ ));
+
+ Property->new('Legacy_Case_Folding',
+ File => "Fold",
+ Directory => $map_directory,
+ Default_Map => $CODE_POINT,
+ UCD => 0,
+ Range_Size_1 => 1,
+ Type => $STRING,
+ To_Output_Map => $EXTERNAL_MAP,
+ Format => $HEX_FORMAT,
+ Initialize => property_ref('cf'),
+ );
+
# The Script_Extensions property started out as a clone of the Script
# property. But processing its data file caused some elements to be
# replaced with different data. (These elements were for the Common and
# identical code points, but their caseless equivalents are not the same,
# one being 'Cased' and the other being 'LC', and so now must be kept as
# separate entities.
- $Title += $lt if defined $lt;
+ if (defined $lt) {
+ $Title += $lt;
+ }
+ else {
+ push @tables_that_may_be_empty, $Title->complete_name;
+ }
# If this Unicode version doesn't have Cased, set up our own. From
# Unicode 5.1: Definition D120: A character C is defined to be cased if
# Perl's traditional space doesn't include Vertical Tab
my $XPerlSpace = $perl->add_match_table('XPerlSpace',
Description => '\s, including beyond ASCII',
- Initialize => $Space - 0x000B,
+ #Initialize => $Space - 0x000B,
+ Initialize => $Space,
);
$XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
my $PerlSpace = $perl->add_match_table('PerlSpace',
- Description => '\s, restricted to ASCII = [ \f\n\r\t]',
+ Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
Initialize => $XPerlSpace & $ASCII,
);
# The 'extended' grapheme cluster came in 5.1. The non-extended
# definition differs too much from the traditional Perl one to use.
- if (defined $gcb && defined $gcb->table('SpacingMark')) {
+ if (defined $gcb->table('SpacingMark')) {
- # Note that assumes HST is defined; it came in an earlier release than
+ # Note that assumes hst is defined; it came in an earlier release than
# GCB. In the line below, two negatives means: yes hangul
$begin += ~ property_ref('Hangul_Syllable_Type')
->table('Not_Applicable')
# We set things up so the Perl core degrades gracefully, possibly with
# placeholders that match nothing.
- if (! defined $gcb) {
- $gcb = Property->new('GCB', Status => $PLACEHOLDER);
- }
- my $hst = property_ref('HST');
- if (!defined $hst) {
- $hst = Property->new('HST', Status => $PLACEHOLDER);
- $hst->add_match_table('Not_Applicable',
- Initialize => $Any,
- Matches_All => 1);
- }
+ my $hst = property_ref('Hangul_Syllable_Type');
# On some releases, here we may not have the needed tables for the
# perl core, in some releases we may.
push @tables_that_may_be_empty, $table->complete_name;
}
- # The HST property predates the GCB one, and has identical tables
+ # The hst property predates the GCB one, and has identical tables
# for some of them, so use it if we can.
- if ($table->is_empty
- && defined $hst
- && defined $hst->table($name))
+ if ($table->is_empty && defined $hst->table($name))
{
$table += $hst->table($name);
}
push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
} else {
$lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
+ $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
}
# Was previously constructed to contain both Name and Unicode_1_Name
}
my $alias_sentence = "";
+ my %abbreviations;
my $alias = property_ref('Name_Alias');
- if (defined $alias) {
- push @composition, 'Name_Alias';
- $perl_charname->set_proxy_for('Name_Alias');
- my $unicode_1 = property_ref('Unicode_1_Name');
- my %abbreviations;
-
- # Add each entry in Name_Alias to Perl_Charnames. Where these go with
- # respect to any existing entry depends on the entry type.
- # Corrections go before said entry, as they should be returned in
- # preference over the existing entry. (A correction to a correction
- # should be later in the Name_Alias table, so it will correctly
- # precede the erroneous correction in Perl_Charnames.)
- #
- # Abbreviations go after everything else, so they are saved
- # temporarily in a hash for later.
- #
- # Controls are currently added afterwards. This is because Perl has
- # previously used the Unicode1 name, and so should still use that.
- # (Most of them will be the same anyway, in which case we don't add a
- # duplicate)
-
- $alias->reset_each_range;
- while (my ($range) = $alias->each_range) {
- next if $range->value eq "";
- my $code_point = $range->start;
- if ($code_point != $range->end) {
- Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
- }
- my ($value, $type) = split ': ', $range->value;
- my $replace_type;
- if ($type eq 'correction') {
- $replace_type = $MULTIPLE_BEFORE;
- }
- elsif ($type eq 'abbreviation') {
-
- # Save for later
- $abbreviations{$value} = $code_point;
- next;
- }
- elsif ($type eq 'control') {
- my $unicode_1_value = $unicode_1->value_of($code_point);
- next if $unicode_1_value eq $value;
- $replace_type = $MULTIPLE_AFTER;
- }
- else {
- $replace_type = $MULTIPLE_AFTER;
- }
-
- # Actually add; before or after current entry(ies) as determined
- # above.
- $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type. Corrections go
+ # before said entry, as they should be returned in preference over the
+ # existing entry. (A correction to a correction should be later in the
+ # Name_Alias table, so it will correctly precede the erroneous correction
+ # in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved temporarily in
+ # a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that. (Most
+ # of them will be the same anyway, in which case we don't add a duplicate)
+
+ $alias->reset_each_range;
+ while (my ($range) = $alias->each_range) {
+ next if $range->value eq "";
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
+ }
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
}
-
- # Now that have everything added, add in abbreviations after
- # everything else.
- foreach my $value (keys %abbreviations) {
- $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
+ else {
+ $replace_type = $MULTIPLE_AFTER;
}
- $alias_sentence = <<END;
+
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+ $alias_sentence = <<END;
The Name_Alias property adds duplicate code point entries that are
alternatives to the original name. If an addition is a corrected
name, it will be physically first in the table. The original (less correct,
but still valid) name will be next; then any alternatives, in no particular
order; and finally any abbreviations, again in no particular order.
END
+
+ # Now add the Unicode_1 names for the controls. The Unicode_1 names had
+ # precedence before 6.1, so should be first in the file; the other names
+ # have precedence starting in 6.1,
+ my $before_or_after = ($v_version lt v6.1.0)
+ ? $MULTIPLE_BEFORE
+ : $MULTIPLE_AFTER;
+
+ foreach my $range (property_ref('Unicode_1_Name')->ranges) {
+ my $code_point = $range->start;
+ my $unicode_1_value = $range->value;
+ next if $unicode_1_value eq ""; # Skip if name doesn't exist.
+
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+
+ # To handle EBCDIC, we don't hard code in the code points of the
+ # controls; instead realizing that all of them are below 256.
+ last if $code_point > 255;
+
+ # We only add in the controls.
+ next if $gc->value_of($code_point) ne 'Cc';
+
+ # This won't add an exact duplicate.
+ $perl_charname->add_duplicate($code_point, $unicode_1_value,
+ Replace => $before_or_after);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value,
+ Replace => $MULTIPLE_AFTER);
}
my $comment;
$unassigned->set_equivalent_to($age_default, Related => 1);
}
+ # See L<perlfunc/quotemeta>
+ my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+
+ # Initialize to what's common in
+ # all Unicode releases.
+ Initialize =>
+ $Space
+ + $gc->table('Control')
+ );
+
+ # In early releases without the proper Unicode properties, just set to \W.
+ if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
+ || ! defined (my $patws = property_ref('Pattern_White_Space'))
+ || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
+ {
+ $quotemeta += ~ $Word;
+ }
+ else {
+ $quotemeta += $patsyn->table('Y')
+ + $patws->table('Y')
+ + $di->table('Y')
+ + ((~ $Word) & $ASCII);
+ }
# Finished creating all the perl properties. All non-internal non-string
# ones have a synonym of 'Is_' prefixed. (Internal properties begin with
|| $ucd_pod{$standard}{'perl_extension'} == $perl_extension
|| $output_this == $perl_extension)
{
- Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway.");
+ Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
}
# We modifiy the info column of the one being output to
Uppercase_Mapping uc()
Also, Case_Folding is accessible through the C</i> modifier in regular
-expressions.
+expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
+operator.
And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
interpolation in double-quoted strings and regular expressions; and functions
# Make a list of all combinations of properties/values that are suppressed.
my @suppressed;
- foreach my $property_name (keys %why_suppressed) {
+ if (! $debug_skip) { # This tends to fail in this debug mode
+ foreach my $property_name (keys %why_suppressed) {
- # Just the value
- my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
+ # Just the value
+ my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
- # The hash may contain properties not in this release of Unicode
- next unless defined (my $property = property_ref($property_name));
+ # The hash may contain properties not in this release of Unicode
+ next unless defined (my $property = property_ref($property_name));
- # Find all combinations
- foreach my $prop_alias ($property->aliases) {
- my $prop_alias_name = standardize($prop_alias->name);
+ # Find all combinations
+ foreach my $prop_alias ($property->aliases) {
+ my $prop_alias_name = standardize($prop_alias->name);
- # If no =value, there's just one combination possibe for this
- if (! $value_name) {
+ # If no =value, there's just one combination possibe for this
+ if (! $value_name) {
- # The property may be suppressed, but there may be a proxy for
- # it, so it shouldn't be listed as suppressed
- next if $prop_alias->ucd;
- push @suppressed, $prop_alias_name;
- }
- else { # Otherwise
- foreach my $value_alias ($property->table($value_name)->aliases)
- {
- next if $value_alias->ucd;
+ # The property may be suppressed, but there may be a proxy
+ # for it, so it shouldn't be listed as suppressed
+ next if $prop_alias->ucd;
+ push @suppressed, $prop_alias_name;
+ }
+ else { # Otherwise
+ foreach my $value_alias
+ ($property->table($value_name)->aliases)
+ {
+ next if $value_alias->ucd;
- push @suppressed, "$prop_alias_name="
- . standardize($value_alias->name);
+ push @suppressed, "$prop_alias_name="
+ . standardize($value_alias->name);
+ }
}
}
}
}
}
}
- elsif ($count == $MAX_UNICODE_CODEPOINTS) {
- if ($table == $property || $table->leader == $table) {
+ elsif ($count == $MAX_UNICODE_CODEPOINTS
+ && ($table == $property || $table->leader == $table)
+ && $table->property->status != $PLACEHOLDER)
+ {
Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
- }
}
if ($table->fate == $SUPPRESSED) {
make_UCD;
make_property_test_script() if $make_test_script;
+ make_normalization_test_script() if $make_norm_test_script;
return;
}
return;
}
+sub make_normalization_test_script() {
+ print "Making normalization test script\n" if $verbosity >= $PROGRESS;
+
+ my $n_path = 'TestNorm.pl';
+
+ unshift @normalization_tests, <<'END';
+use utf8;
+use Test::More;
+
+sub ord_string { # Convert packed ords to printable string
+ use charnames ();
+ return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
+ unpack "U*", shift) . "'";
+ #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
+}
+
+sub Test_N {
+ my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
+ my $display_source = ord_string($source);
+ my $display_nfc = ord_string($nfc);
+ my $display_nfd = ord_string($nfd);
+ my $display_nfkc = ord_string($nfkc);
+ my $display_nfkd = ord_string($nfkd);
+
+ use Unicode::Normalize;
+ # NFC
+ # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
+ # nfkc == toNFC(nfkc) == toNFC(nfkd)
+ #
+ # NFD
+ # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
+ # nfkd == toNFD(nfkc) == toNFD(nfkd)
+ #
+ # NFKC
+ # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
+ # toNFKC(nfkc) == toNFKC(nfkd)
+ #
+ # NFKD
+ # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
+ # toNFKD(nfkc) == toNFKD(nfkd)
+
+ is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
+ is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
+ is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
+ is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
+ is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
+
+ is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
+ is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
+ is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
+ is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
+ is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
+
+ is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
+ is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
+ is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
+ is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
+ is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
+
+ is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
+ is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
+ is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
+ is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
+ is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
+}
+END
+
+ &write($n_path,
+ 1, # Is utf8;
+ [
+ @normalization_tests,
+ 'done_testing();'
+ ]);
+ return;
+}
+
# This is a list of the input files and how to handle them. The files are
# processed in their order in this list. Some reordering is possible if
# desired, but the v0 files should be first, and the extracted before the
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
),
- Input_file->new("NormalizationTest.txt", v3.0.1,
- Skip => 'Validation Tests',
+ Input_file->new("NormTest.txt", v3.0.0,
+ Handler => \&process_NormalizationsTest,
+ Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Input_file->new('NamedSequences.txt', v4.1.0,
Handler => \&process_NamedSequences
),
- Input_file->new('NameAliases.txt', v5.0.0,
+ Input_file->new('NameAliases.txt', v0,
Property => 'Name_Alias',
Pre_Handler => ($v_version le v6.0.0)
? \&setup_early_name_alias