# As stated earlier, this program will work on any release of Unicode so far.
# Most obvious problems in earlier data have NOT been corrected except when
-# necessary to make Perl or this program work reasonably. For example, no
+# necessary to make Perl or this program work reasonably, and to keep out
+# potential security issues. For example, no
# folding information was given in early releases, so this program substitutes
# lower case instead, just so that a regular expression with the /i option
# will do something that actually gives the right results in many cases.
$0 = File::Spec->canonpath($0);
my $make_test_script = 0; # ? Should we output a test script
+my $make_norm_test_script = 0; # ? Should we output a normalization test script
my $write_unchanged_files = 0; # ? Should we update the output files even if
# we don't think they have changed
my $use_directory = ""; # ? Should we chdir somewhere.
{
$make_test_script = 1;
}
+ elsif ($arg eq '-makenormtest')
+ {
+ $make_norm_test_script = 1;
+ }
elsif ($arg eq '-makelist') {
$make_list = 1;
}
if $v_version ge v6.0.0;
push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
if $v_version ge v6.1.0;
-push @tables_that_may_be_empty, '_stc';
# The lists below are hashes, so the key is the item in the list, and the
# value is the reason why it is in the list. This makes generation of
# Enum values for to_output_map() method in the Map_Table package.
my $EXTERNAL_MAP = 1;
my $INTERNAL_MAP = 2;
+my $OUTPUT_ADJUSTED = 3;
# To override computed values for writing the map tables for these properties.
# The default for enum map tables is to write them out, so that the Unicode
my $MAP_PROXIED = 1; # The map table for the property isn't written out,
# but there is a file written that can be used to
# reconstruct this table
-my $SUPPRESSED = 3; # The file for this table is not written out.
-my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is
+my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
# for Perl's internal use only
-my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a
- # Unicode version that doesn't have it, but we need it
- # to be defined, if empty, to have things work.
- # Implies no pod entry generated
+my $SUPPRESSED = 3; # The file for this table is not written out, and as a
+ # result, we don't bother to do many computations on
+ # it.
+my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the
+ # computations anyway, as the values are needed for
+ # things to work. This happens when we have Perl
+ # extensions that depend on Unicode tables that
+ # wouldn't normally be in a given Unicode version.
# The format of the values of the tables:
my $EMPTY_FORMAT = "";
my $HEX_FORMAT = 'x';
my $RATIONAL_FORMAT = 'r';
my $STRING_FORMAT = 's';
+my $ADJUST_FORMAT = 'a';
my $DECOMP_STRING_FORMAT = 'c';
my $STRING_WHITE_SPACE_LIST = 'sw';
$HEX_FORMAT => 'non-negative hex whole number; a code point',
$RATIONAL_FORMAT => 'rational: an integer or a fraction',
$STRING_FORMAT => 'string',
+ $ADJUST_FORMAT => 'some entries need adjustment',
$DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
$STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
);
# anonymous hash.
my @code_points_ending_in_code_point;
+# To hold Unicode's normalization test suite
+my @normalization_tests;
+
# Boolean: does this Unicode version have the hangul syllables, and are we
# writing out a table for them?
my $has_hangul_syllables = 0;
# point of the range.
my $end;
if (! $viacode[$i]) {
- if ($gc-> table('Surrogate')->contains($i)) {
- $viacode[$i] = 'Surrogate';
- $annotate_char_type[$i] = $SURROGATE_TYPE;
- $printable[$i] = 0;
- $end = $gc->table('Surrogate')->containing_range($i)->end;
- }
- elsif ($gc-> table('Private_use')->contains($i)) {
+ my $nonchar;
+ if ($gc-> table('Private_use')->contains($i)) {
$viacode[$i] = 'Private Use';
$annotate_char_type[$i] = $PRIVATE_USE_TYPE;
$printable[$i] = 0;
$end = $gc->table('Private_Use')->containing_range($i)->end;
}
- elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
- contains($i))
+ elsif ((defined ($nonchar =
+ Property::property_ref('Noncharacter_Code_Point'))
+ && $nonchar->table('Y')->contains($i)))
{
$viacode[$i] = 'Noncharacter';
$annotate_char_type[$i] = $NONCHARACTER_TYPE;
$end = 0x81 if $i == 0x80; # Hard-code this one known case
}
elsif ($gc-> table('Unassigned')->contains($i)) {
- $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
$annotate_char_type[$i] = $UNASSIGNED_TYPE;
$printable[$i] = 0;
-
- # Because we name the unassigned by the blocks they are in, it
- # can't go past the end of that block, and it also can't go past
- # the unassigned range it is in. The special table makes sure
- # that the non-characters, which are unassigned, are separated
- # out.
- $end = min($block->containing_range($i)->end,
- $unassigned_sans_noncharacters-> containing_range($i)->
- end);
+ if ($v_version lt v2.0.0) { # No blocks in earliest releases
+ $viacode[$i] = 'Unassigned';
+ $end = $gc-> table('Unassigned')->containing_range($i)->end;
+ }
+ else {
+ $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
+
+ # Because we name the unassigned by the blocks they are in, it
+ # can't go past the end of that block, and it also can't go
+ # past the unassigned range it is in. The special table makes
+ # sure that the non-characters, which are unassigned, are
+ # separated out.
+ $end = min($block->containing_range($i)->end,
+ $unassigned_sans_noncharacters->
+ containing_range($i)->end);
+ }
+ }
+ elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases
+ $viacode[$i] = $gc->value_of($i);
+ $annotate_char_type[$i] = $UNKNOWN_TYPE;
+ $printable[$i] = 0;
+ }
+ elsif ($gc-> table('Surrogate')->contains($i)) {
+ $viacode[$i] = 'Surrogate';
+ $annotate_char_type[$i] = $SURROGATE_TYPE;
+ $printable[$i] = 0;
+ $end = $gc->table('Surrogate')->containing_range($i)->end;
}
else {
Carp::my_carp_bug("Can't figure out how to annotate "
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
no overloading;
$message .= $owner_name_of{pack 'J', $self};
}
- Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
+ Carp::my_carp_bug($message . "Undefined argument to _union. No union done.");
return;
}
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
+ # If this is an exact duplicate, including the
+ # value, the original will be moved to be
+ # first, before any other duplicate ranges
+ # with different values.
# => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
# FIFO, so that this one is inserted after all
- # others that currently exist.
+ # others that currently exist. If this is an
+ # exact duplicate, including value, of an
+ # existing range, this one is discarded
+ # (leaving the existing one in its original,
+ # higher priority position
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
return;
}
+ if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
+ Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
+ }
#local $to_trace = 1 if main::DEBUG;
if ($operation eq '-') {
# structured so this is common.
if ($start > $max) {
- trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
+ trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
return if $operation eq '-'; # Deleting a non-existing range is a
# no-op
}
# If to place this new record after, move to beyond all existing
- # ones.
+ # ones; but don't add this one if identical to any of them, as it
+ # isn't really a multiple. This leaves the original order, so
+ # that the current request is ignored. The reasoning is that the
+ # previous request that wanted this record to have high priority
+ # should have precedence.
if ($replace == $MULTIPLE_AFTER) {
while ($i < @$r && $r->[$i]->start == $start) {
+ return if $value eq $r->[$i]->value
+ && $type eq $r->[$i]->type;
$i++;
}
}
+ else {
+ # If instead we are to place this new record before any
+ # existing ones, remove any identical ones that come after it.
+ # This changes the existing order so that the new one is
+ # first, as is being requested.
+ for (my $j = $i + 1;
+ $j < @$r && $r->[$j]->start == $start;
+ $j++)
+ {
+ if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
+ splice @$r, $j, 1;
+ last; # There should only be one instance, so no
+ # need to keep looking
+ }
+ }
+ }
trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
my @return = splice @$r,
return $self->_union($other)
},
+ '+=' => sub { my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' += '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
+
+ return $self->_union($other)
+ },
'&' => sub { my $self = shift;
my $other = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
if ($reversed) {
- Carp::my_carp_bug("Can't cope with a "
- . __PACKAGE__
- . " being the second parameter in a '-'. Subtraction ignored.");
- return $self;
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' - '
+ . ref($self)
+ . "'. undef returned.");
+ return;
}
my $new = Range_List->new(Initialize => $self);
use overload
fallback => 0,
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
'!=' => \&main::_operator_not_equal,
'==' => \&main::_operator_equal,
;
my $status = delete $args{'Status'};
$status = $NORMAL unless defined $status;
- my $ucd = delete $args{'UCD'} // 1;
+ # An internal name does not get documented, unless overridden by the
+ # input.
+ my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
# Write a representation of the table to its file. It calls several
# functions furnished by sub-classes of this abstract base class to
# handle non-normal ranges, to add stuff before the table, and at its
- # end.
+ # end. If the table is to be written so that adjustments are
+ # required, this does that conversion.
my $self = shift;
+ my $use_adjustments = shift; # ? output in adjusted format or not
my $tab_stops = shift; # The number of tab stops over to put any
# comment.
my $suppress_value = shift; # Optional, if the value associated with
);
}
+ # Values for previous time through the loop. Initialize to
+ # something that won't be adjacent to the first iteration;
+ # only $previous_end matters for that.
+ my $previous_start;
+ my $previous_end = -2;
+ my $previous_value;
+
+ # Values for next time through the portion of the loop that splits
+ # the range. 0 in $next_start means there is no remaining portion
+ # to deal with.
+ my $next_start = 0;
+ my $next_end;
+ my $next_value;
+ my $offset = 0;
+
# Output each range as part of the here document.
RANGE:
for my $set ($range_list{$addr}->ranges) {
next RANGE if defined $suppress_value
&& $value eq $suppress_value;
- # If there is a range and doesn't need a single point range
- # output
- if ($start != $end && ! $range_size_1) {
- push @OUT, sprintf "%04X\t%04X", $start, $end;
- $OUT[-1] .= "\t$value" if $value ne "";
-
- # Add a comment with the size of the range, if requested.
- # Expand Tabs to make sure they all start in the same
- # column, and then unexpand to use mostly tabs.
- if (! $output_range_counts{$addr}) {
- $OUT[-1] .= "\n";
+ { # This bare block encloses the scope where we may need to
+ # split a range (when outputting adjusteds), and each time
+ # through we handle the next portion of the original by
+ # ending the block with a 'redo'. The values to use for
+ # that next time through are set up just below in the
+ # scalars whose names begin with '$next_'.
+
+ if ($use_adjustments) {
+
+ # When converting to use adjustments, we can handle
+ # only single element ranges. Set up so that this
+ # time through the loop, we look at the first element,
+ # and the next time through, we start off with the
+ # remainder. Thus each time through we look at the
+ # first element of the range
+ if ($end != $start) {
+ $next_start = $start + 1;
+ $next_end = $end;
+ $next_value = $value;
+ $end = $start;
+ }
+
+ # The values for some of these tables are stored as
+ # hex strings. Convert those to decimal
+ $value = hex($value)
+ if $self->default_map eq $CODE_POINT
+ && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
+
+ # If this range is adjacent to the previous one, and
+ # the values in each are integers that are also
+ # adjacent (differ by 1), then this range really
+ # extends the previous one that is already in element
+ # $OUT[-1]. So we pop that element, and pretend that
+ # the range starts with whatever it started with.
+ # $offset is incremented by 1 each time so that it
+ # gives the current offset from the first element in
+ # the accumulating range, and we keep in $value the
+ # value of that first element.
+ if ($start == $previous_end + 1
+ && $value =~ /^ -? \d+ $/xa
+ && $previous_value =~ /^ -? \d+ $/xa
+ && ($value == ($previous_value + ++$offset)))
+ {
+ pop @OUT;
+ $start = $previous_start;
+ $value = $previous_value;
+ }
+ else {
+ $offset = 0;
+ }
+
+ # Save the current values for the next time through
+ # the loop.
+ $previous_start = $start;
+ $previous_end = $end;
+ $previous_value = $value;
}
- else {
- $OUT[-1] = Text::Tabs::expand($OUT[-1]);
- my $count = main::clarify_number($end - $start + 1);
- use integer;
-
- my $width = $tab_stops * 8 - 1;
- $OUT[-1] = sprintf("%-*s # [%s]\n",
- $width,
- $OUT[-1],
- $count);
- $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
+
+ # If there is a range and doesn't need a single point range
+ # output
+ if ($start != $end && ! $range_size_1) {
+ push @OUT, sprintf "%04X\t%04X", $start, $end;
+ $OUT[-1] .= "\t$value" if $value ne "";
+
+ # Add a comment with the size of the range, if
+ # requested. Expand Tabs to make sure they all start
+ # in the same column, and then unexpand to use mostly
+ # tabs.
+ if (! $output_range_counts{$addr}) {
+ $OUT[-1] .= "\n";
+ }
+ else {
+ $OUT[-1] = Text::Tabs::expand($OUT[-1]);
+ my $count = main::clarify_number($end - $start + 1);
+ use integer;
+
+ my $width = $tab_stops * 8 - 1;
+ $OUT[-1] = sprintf("%-*s # [%s]\n",
+ $width,
+ $OUT[-1],
+ $count);
+ $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
+ }
}
- next RANGE;
- }
- # Here to output a single code point per line
+ # Here to output a single code point per line.
+ # If not to annotate, use the simple formats
+ elsif (! $annotate) {
- # If not to annotate, use the simple formats
- if (! $annotate) {
+ # Use any passed in subroutine to output.
+ if (ref $range_size_1 eq 'CODE') {
+ for my $i ($start .. $end) {
+ push @OUT, &{$range_size_1}($i, $value);
+ }
+ }
+ else {
- # Use any passed in subroutine to output.
- if (ref $range_size_1 eq 'CODE') {
- for my $i ($start .. $end) {
- push @OUT, &{$range_size_1}($i, $value);
+ # Here, caller is ok with default output.
+ for (my $i = $start; $i <= $end; $i++) {
+ push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
+ }
}
}
else {
- # Here, caller is ok with default output.
+ # Here, wants annotation.
for (my $i = $start; $i <= $end; $i++) {
- push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
- }
- }
- next RANGE;
- }
- # Here, wants annotation.
- for (my $i = $start; $i <= $end; $i++) {
-
- # Get character information if don't have it already
- main::populate_char_info($i)
- if ! defined $viacode[$i];
- my $type = $annotate_char_type[$i];
-
- # Figure out if should output the next code points as part
- # of a range or not. If this is not in an annotation
- # range, then won't output as a range, so returns $i.
- # Otherwise use the end of the annotation range, but no
- # further than the maximum possible end point of the loop.
- my $range_end = main::min($annotate_ranges->value_of($i)
- || $i,
- $end);
-
- # Use a range if it is a range, and either is one of the
- # special annotation ranges, or the range is at most 3
- # long. This last case causes the algorithmically named
- # code points to be output individually in spans of at
- # most 3, as they are the ones whose $type is > 0.
- if ($range_end != $i
- && ( $type < 0 || $range_end - $i > 2))
- {
- # Here is to output a range. We don't allow a
- # caller-specified output format--just use the
- # standard one.
- push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
+ # Get character information if don't have it already
+ main::populate_char_info($i)
+ if ! defined $viacode[$i];
+ my $type = $annotate_char_type[$i];
+
+ # Figure out if should output the next code points
+ # as part of a range or not. If this is not in an
+ # annotation range, then won't output as a range,
+ # so returns $i. Otherwise use the end of the
+ # annotation range, but no further than the
+ # maximum possible end point of the loop.
+ my $range_end = main::min(
+ $annotate_ranges->value_of($i) || $i,
+ $end);
+
+ # Use a range if it is a range, and either is one
+ # of the special annotation ranges, or the range
+ # is at most 3 long. This last case causes the
+ # algorithmically named code points to be output
+ # individually in spans of at most 3, as they are
+ # the ones whose $type is > 0.
+ if ($range_end != $i
+ && ( $type < 0 || $range_end - $i > 2))
+ {
+ # Here is to output a range. We don't allow a
+ # caller-specified output format--just use the
+ # standard one.
+ push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
$range_end,
$value;
- my $range_name = $viacode[$i];
-
- # For the code points which end in their hex value, we
- # eliminate that from the output annotation, and
- # capitalize only the first letter of each word.
- if ($type == $CP_IN_NAME) {
- my $hex = sprintf "%04X", $i;
- $range_name =~ s/-$hex$//;
- my @words = split " ", $range_name;
- for my $word (@words) {
- $word = ucfirst(lc($word)) if $word ne 'CJK';
- }
- $range_name = join " ", @words;
- }
- elsif ($type == $HANGUL_SYLLABLE) {
- $range_name = "Hangul Syllable";
- }
+ my $range_name = $viacode[$i];
+
+ # For the code points which end in their hex
+ # value, we eliminate that from the output
+ # annotation, and capitalize only the first
+ # letter of each word.
+ if ($type == $CP_IN_NAME) {
+ my $hex = sprintf "%04X", $i;
+ $range_name =~ s/-$hex$//;
+ my @words = split " ", $range_name;
+ for my $word (@words) {
+ $word =
+ ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $range_name = join " ", @words;
+ }
+ elsif ($type == $HANGUL_SYLLABLE) {
+ $range_name = "Hangul Syllable";
+ }
- $OUT[-1] .= " $range_name" if $range_name;
+ $OUT[-1] .= " $range_name" if $range_name;
- # Include the number of code points in the range
- my $count = main::clarify_number($range_end - $i + 1);
- $OUT[-1] .= " [$count]\n";
+ # Include the number of code points in the
+ # range
+ my $count =
+ main::clarify_number($range_end - $i + 1);
+ $OUT[-1] .= " [$count]\n";
- # Skip to the end of the range
- $i = $range_end;
- }
- else { # Not in a range.
- my $comment = "";
-
- # When outputting the names of each character, use
- # the character itself if printable
- $comment .= "'" . chr($i) . "' " if $printable[$i];
-
- # To make it more readable, use a minimum indentation
- my $comment_indent;
-
- # Determine the annotation
- if ($format eq $DECOMP_STRING_FORMAT) {
-
- # This is very specialized, with the type of
- # decomposition beginning the line enclosed in
- # <...>, and the code points that the code point
- # decomposes to separated by blanks. Create two
- # strings, one of the printable characters, and
- # one of their official names.
- (my $map = $value) =~ s/ \ * < .*? > \ +//x;
- my $tostr = "";
- my $to_name = "";
- my $to_chr = "";
- foreach my $to (split " ", $map) {
- $to = CORE::hex $to;
- $to_name .= " + " if $to_name;
- $to_chr .= chr($to);
- main::populate_char_info($to)
- if ! defined $viacode[$to];
- $to_name .= $viacode[$to];
+ # Skip to the end of the range
+ $i = $range_end;
}
+ else { # Not in a range.
+ my $comment = "";
+
+ # When outputting the names of each character,
+ # use the character itself if printable
+ $comment .= "'" . chr($i) . "' "
+ if $printable[$i];
+
+ # To make it more readable, use a minimum
+ # indentation
+ my $comment_indent;
+
+ # Determine the annotation
+ if ($format eq $DECOMP_STRING_FORMAT) {
+
+ # This is very specialized, with the type
+ # of decomposition beginning the line
+ # enclosed in <...>, and the code points
+ # that the code point decomposes to
+ # separated by blanks. Create two
+ # strings, one of the printable
+ # characters, and one of their official
+ # names.
+ (my $map = $value) =~ s/ \ * < .*? > \ +//x;
+ my $tostr = "";
+ my $to_name = "";
+ my $to_chr = "";
+ foreach my $to (split " ", $map) {
+ $to = CORE::hex $to;
+ $to_name .= " + " if $to_name;
+ $to_chr .= chr($to);
+ main::populate_char_info($to)
+ if ! defined $viacode[$to];
+ $to_name .= $viacode[$to];
+ }
- $comment .=
+ $comment .=
"=> '$to_chr'; $viacode[$i] => $to_name";
- $comment_indent = 25; # Determined by experiment
- }
- else {
-
- # Assume that any table that has hex format is a
- # mapping of one code point to another.
- if ($format eq $HEX_FORMAT) {
- my $decimal_value = CORE::hex $value;
- main::populate_char_info($decimal_value)
+ $comment_indent = 25; # Determined by
+ # experiment
+ }
+ else {
+
+ # Assume that any table that has hex
+ # format is a mapping of one code point to
+ # another.
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ main::populate_char_info($decimal_value)
if ! defined $viacode[$decimal_value];
- $comment .= "=> '"
- . chr($decimal_value)
- . "'; " if $printable[$decimal_value];
- }
- $comment .= $viacode[$i] if $include_name
- && $viacode[$i];
- if ($format eq $HEX_FORMAT) {
- my $decimal_value = CORE::hex $value;
- $comment .= " => $viacode[$decimal_value]"
- if $viacode[$decimal_value];
- }
+ $comment .= "=> '"
+ . chr($decimal_value)
+ . "'; " if $printable[$decimal_value];
+ }
+ $comment .= $viacode[$i] if $include_name
+ && $viacode[$i];
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ $comment .=
+ " => $viacode[$decimal_value]"
+ if $viacode[$decimal_value];
+ }
- # If including the name, no need to indent, as the
- # name will already be way across the line.
- $comment_indent = ($include_name) ? 0 : 60;
- }
+ # If including the name, no need to
+ # indent, as the name will already be way
+ # across the line.
+ $comment_indent = ($include_name) ? 0 : 60;
+ }
- # Use any passed in routine to output the base part of
- # the line.
- if (ref $range_size_1 eq 'CODE') {
- my $base_part = &{$range_size_1}($i, $value);
- chomp $base_part;
- push @OUT, $base_part;
- }
- else {
- push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ # Use any passed in routine to output the base
+ # part of the line.
+ if (ref $range_size_1 eq 'CODE') {
+ my $base_part=&{$range_size_1}($i, $value);
+ chomp $base_part;
+ push @OUT, $base_part;
+ }
+ else {
+ push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ }
+
+ # And add the annotation.
+ $OUT[-1] = sprintf "%-*s\t# %s",
+ $comment_indent,
+ $OUT[-1],
+ $comment
+ if $comment;
+ $OUT[-1] .= "\n";
+ }
}
+ }
- # And add the annotation.
- $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
- $OUT[-1],
- $comment if $comment;
- $OUT[-1] .= "\n";
+ # If we split the range, set up so the next time through
+ # we get the remainder, and redo.
+ if ($next_start) {
+ $start = $next_start;
+ $end = $next_end;
+ $value = $next_value;
+ $next_start = 0;
+ redo;
}
}
} # End of loop through all the table's ranges
'readable_array');
my %to_output_map;
- # Enum as to whether or not to write out this map table:
+ # Enum as to whether or not to write out this map table, and how:
# 0 don't output
# $EXTERNAL_MAP means its existence is noted in the documentation, and
# it should not be removed nor its format changed. This
# output.
# $INTERNAL_MAP means Perl reserves the right to do anything it wants
# with this file
+ # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
+ # outputting the actual mappings as-is, we adjust things
+ # to create a much more compact table. Only those few
+ # tables where the mapping is convertible at least to an
+ # integer and compacting makes a big difference should
+ # have this. Hence, the default is to not do this
+ # unless the table's default mapping is to $CODE_POINT,
+ # and the range size is not 1.
main::set_access('to_output_map', \%to_output_map, 's');
-
sub new {
my $class = shift;
my $name = shift;
my $default_map = delete $args{'Default_Map'};
my $property = delete $args{'_Property'};
my $full_name = delete $args{'Full_Name'};
+ my $to_output_map = delete $args{'To_Output_Map'};
# Rest of parameters passed on
$anomalous_entries{$addr} = [];
$default_map{$addr} = $default_map;
+ $to_output_map{$addr} = $to_output_map;
$self->initialize($initialize) if defined $initialize;
# Don't want to output binary map tables even for debugging.
return 0 if $type == $BINARY;
- # But do want to output string ones.
- return $EXTERNAL_MAP if $type == $STRING;
+ # But do want to output string ones. All the ones that remain to
+ # be dealt with (i.e. which haven't explicitly been set to external)
+ # are for internal Perl use only. The default for those that map to
+ # $CODE_POINT and haven't been restricted to a single element range
+ # is to use the adjusted form.
+ if ($type == $STRING) {
+ return $INTERNAL_MAP if $self->range_size_1
+ || $default_map{$addr} ne $CODE_POINT;
+ return $OUTPUT_ADJUSTED;
+ }
# Otherwise is an $ENUM, do output it, for Perl's purposes
return $INTERNAL_MAP;
my $return = $self->SUPER::header();
- if ($self->to_output_map == $INTERNAL_MAP) {
+ if ($self->to_output_map >= $INTERNAL_MAP) {
$return .= $INTERNAL_ONLY_HEADER;
}
else {
- my $property_name = $self->property->full_name;
+ my $property_name = $self->property->full_name =~ s/Legacy_//r;
$return .= <<END;
# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
my $comment = "";
my $status = $self->status;
- if ($status) {
+ if ($status && $status ne $PLACEHOLDER) {
my $warn = uc $status_past_participles{$status};
$comment .= <<END;
}
$comment .= "This file returns the $mapping:\n";
+ my $ucd_accessible_name = "";
+ my $full_name = $self->property->full_name;
for my $i (0 .. @property_aliases - 1) {
- $comment .= sprintf("%-8s%s\n",
- " ",
- $property_aliases[$i]->name . '(cp)'
- );
+ my $name = $property_aliases[$i]->name;
+ $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
+ if ($property_aliases[$i]->ucd) {
+ if ($name eq $full_name) {
+ $ucd_accessible_name = $full_name;
+ }
+ elsif (! $ucd_accessible_name) {
+ $ucd_accessible_name = $name;
+ }
+ }
+ }
+ $comment .= "\nwhere 'cp' is $cp.";
+ if ($ucd_accessible_name) {
+ $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
}
- my $full_name = $self->property->full_name;
- $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
# And append any commentary already set from the actual property.
$comment .= "\n\n" . $self->comment if $self->comment;
my $format = $self->format;
- my $return = <<END;
+ my $return = "";
+
+ my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
+ if ($output_adjusted) {
+ if ($specials_name) {
+ $return .= <<END;
+# The mappings in the non-hash portion of this file must be modified to get the
+# correct values by adding the code point ordinal number to each one that is
+# numeric.
+END
+ }
+ else {
+ $return .= <<END;
+# The mappings must be modified to get the correct values by adding the code
+# point ordinal number to each one that is numeric.
+END
+ }
+ }
+
+ $return .= <<END;
+
# The name this swash is to be known by, with the format of the mappings in
# the main body of the table, and what all code points missing from this file
# map to.
END
}
my $default_map = $default_map{$addr};
- $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
+
+ # For $CODE_POINT default maps and using adjustments, instead the default
+ # becomes zero.
+ $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
+ . (($output_adjusted && $default_map eq $CODE_POINT)
+ ? "0"
+ : $default_map)
+ . "';";
if ($default_map eq $CODE_POINT) {
$return .= ' # code point maps to itself';
# Assume a leading zero means hex,
# even if all digits are 0-9
|| ($format eq $INTEGER_FORMAT
- && $map =~ /^0/);
+ && $map =~ /^0[0-9A-F]/);
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
}
Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
}
- $self->_set_format($format);
+ # If the output is to be adjusted, the format of the table that gets
+ # output is actually 'a' instead of whatever it is stored internally
+ # as.
+ my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
+ if ($output_adjusted) {
+ $format = $ADJUST_FORMAT;
+ }
- # Core Perl has a different definition of mapping ranges than we do,
- # that is applicable mainly to mapping code points, so for tables
- # where it is possible that core Perl could be used to read it,
- # make it range size 1 to prevent possible confusion
- $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
+ $self->_set_format($format);
return $self->SUPER::write(
+ $output_adjusted,
($self->property == $block)
? 7 # block file needs more tab stops
: 3,
'+=' => sub {
my $self = shift;
my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' += '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
return if $self->carp_if_locked;
}
return $self;
},
+ '&=' => sub {
+ my $self = shift;
+ my $other = shift;
+
+ return if $self->carp_if_locked;
+ $self->_set_range_list($self->_range_list & $other);
+ return $self;
+ },
'-' => sub { my $self = shift;
my $other = shift;
my $reversed = shift;
-
if ($reversed) {
- Carp::my_carp_bug("Can't cope with a "
- . __PACKAGE__
- . " being the first parameter in a '-'. Subtraction ignored.");
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' - '
+ . ref($self)
+ . "'. undef returned.");
return;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- return $self->SUPER::write(2); # 2 tab stops
+ return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
}
sub set_final_comment {
# \p{}'s
my @global_comments; # List of all the tables' comments that are
# there before this routine was called.
+ my $has_ucd_alias = 0; # If there is an alias that is accessible via
+ # Unicode::UCD. If not, then don't say it is
+ # in the comment
# Get list of all the parent tables that are equivalent to this one
# (including itself).
[$i % @table_aliases];
my $table_alias = $table_alias_object->name;
my $loose_match = $table_alias_object->loose_match;
+ $has_ucd_alias |= $table_alias_object->ucd;
if ($table_alias !~ /\D/) { # Clarify large numbers.
$table_alias = main::clarify_number($table_alias)
my $flag = $property->status
|| $table->status
|| $table_alias_object->status;
- $flags{$flag} = $status_past_participles{$flag} if $flag;
+ if ($flag && $flag ne $PLACEHOLDER) {
+ $flags{$flag} = $status_past_participles{$flag};
+ }
$loose_count++;
$any_of_these = 'any of these'
}
- my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
+ my $comment = "";
+ if ($has_ucd_alias) {
+ $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
+ }
if ($has_unrelated) {
$comment .= <<END;
This file is for tables that are not necessarily related: To conserve
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
'==' => \&main::_operator_equal,
'!=' => \&main::_operator_not_equal,
'=' => sub { return shift },
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
if (ref $other) {
- Carp::my_carp_bug("Can't cope with a "
+ Carp::my_carp_bug("Bad news. Can't cope with a "
. ref($other)
. " argument to '-='. Subtraction ignored.");
return $self;
}
elsif ($reversed) { # Shouldn't happen in a -=, but just in case
- Carp::my_carp_bug("Can't cope with a "
- . __PACKAGE__
- . " being the first parameter in a '-='. Subtraction ignored.");
- return $self;
+ Carp::my_carp_bug("Bad news. Can't cope with subtracting a "
+ . ref $self
+ . " from a non-object. undef returned.");
+ return;
}
else {
no overloading;
: "$self$other";
}
+sub _operator_dot_equal {
+ # Overloaded '.=' method that is common to all packages.
+
+ my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ $other = "" unless defined $other;
+
+ if ($reversed) {
+ return $other .= "$self";
+ }
+ else {
+ return "$self" . "$other";
+ }
+}
+
sub _operator_equal {
# Generic overloaded '==' routine. To be equal, they must be the exact
# same object
}
}
+
+ my $scf = property_ref("Simple_Case_Folding");
+ $scf->add_alias("scf");
+ $scf->add_alias("sfc");
+
return;
}
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# This entry was missing from this file in earlier Unicode versions
- if (-e 'Jamo.txt') {
- my $jsn = property_ref('JSN');
- if (! defined $jsn) {
- $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
- }
+ if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
+ Property->new('JSN', Full_Name => 'Jamo_Short_Name');
}
# These are used so much, that we set globals for them.
$ccc->set_directory(File::Spec->curdir());
}
- # utf8.c has a different meaning for non range-size-1 for map properties
- # that this program doesn't currently handle; and even if it were changed
- # to do so, some other code may be using them expecting range size 1.
- foreach my $property (qw {
- Case_Folding
- Lowercase_Mapping
- Titlecase_Mapping
- Uppercase_Mapping
- })
- {
- property_ref($property)->set_range_size_1(1);
- }
-
# These two properties aren't actually used in the core, but unfortunately
# the names just above that are in the core interfere with these, so
# choose different names. These aren't a problem unless the map tables
$urs->add_alias('kRSUnicode');
}
}
+
+ # For backwards compatibility with applications that may read the mapping
+ # file directly (it was documented in 5.12 and 5.14 as being thusly
+ # usable), keep it from being adjusted. (range_size_1 is
+ # used to force the traditional format.)
+ if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
+ $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
+ $nfkc_cf->set_range_size_1(1);
+ }
+ if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
+ $bmg->set_to_output_map($EXTERNAL_MAP);
+ $bmg->set_range_size_1(1);
+ }
+
+ property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
+
return;
}
na1 ; Unicode_1_Name
nt ; Numeric_Type
nv ; Numeric_Value
-sfc ; Simple_Case_Folding
+scf ; Simple_Case_Folding
slc ; Simple_Lowercase_Mapping
stc ; Simple_Titlecase_Mapping
suc ; Simple_Uppercase_Mapping
# This first set is in the original old-style proplist.
push @return, split /\n/, <<'END';
-Alpha ; Alphabetic
Bidi_C ; Bidi_Control
Dash ; Dash
Dia ; Diacritic
}
if (-e 'DCoreProperties.txt') {
push @return, split /\n/, <<'END';
+Alpha ; Alphabetic
IDS ; ID_Start
XIDC ; XID_Continue
XIDS ; XID_Start
# Process each line of the file ...
while ($file->next_line) {
+ # Fix typo in input file
+ s/CCC133/CCC132/g if $v_version eq v6.1.0;
+
my ($property, @data) = split /\s*;\s*/;
# The ccc property has an extra field at the beginning, which is the
bc ; R ; Right_To_Left
bc ; WS ; White_Space
+Bidi_M; N; No; F; False
+Bidi_M; Y; Yes; T; True
+
# The standard combining classes are very much different in v1, so only use
# ones that look right (not checked thoroughly)
ccc; 0; NR ; Not_Reordered
return @return;
}
+sub process_NormalizationsTest {
+
+ # Each line looks like:
+ # source code point; NFC; NFD; NFKC; NFKD
+ # e.g.
+ # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
+
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # Process each line of the file ...
+ while ($file->next_line) {
+
+ next if /^@/;
+
+ my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
+
+ foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
+ $$var = pack "U0U*", map { hex } split " ", $$var;
+ $$var =~ s/(\\)/$1$1/g;
+ }
+
+ push @normalization_tests,
+ "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
+ } # End of looping through the file
+}
+
sub output_perl_charnames_line ($$) {
# Output the entries in Perl_charnames specially, using 5 digits instead
# body of the table
Map_Type => $COMPUTE_NO_MULTI_CP,
Type => $STRING,
+ To_Output_Map => $INTERNAL_MAP,
);
$Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
$Perl_decomp->add_comment(join_lines(<<END
Perl_Extension => 1,
Directory => $map_directory,
Type => $STRING,
- Range_Size_1 => 1,
+ To_Output_Map => $OUTPUT_ADJUSTED,
);
$Decimal_Digit->add_comment(join_lines(<<END
This file gives the mapping of all code points which represent a single
-decimal digit [0-9] to their respective digits. For example, the code point
-U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
-that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
-numerals.
+decimal digit [0-9] to their respective digits, but it has ranges of 10 code
+points, and the mapping of each non-initial element of each range is actually
+not to "0", but to the offset that element has from its corresponding DIGIT 0.
+These code points are those that have Numeric_Type=Decimal; not special
+things, like subscripts nor Roman numerals.
END
));
$file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
$file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
+ $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
$fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
}
elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
# Some code points in this file have the pseudo-name
# '<control>', but the official name for such ones is the null
- # string. For charnames.pm, we use the Unicode version 1 name
- $fields[$NAME] = "";
- $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
+ # string.
+ $fields[$NAME] = $fields[$CHARNAME] = "";
# We had better not be in between range lines.
if ($in_range) {
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Flush the buffers.
- foreach my $i (1 .. $last_field) {
+ foreach my $i (0 .. $last_field) {
$file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
}
# into it the Hangul syllable mappings. This is to avoid having
# to publish a subroutine in it to compute them. (which would
# essentially be this code.) This uses the algorithm published by
- # Unicode.
- if (property_ref('Decomposition_Mapping')->to_output_map) {
+ # Unicode. (No hangul syllables in version 1)
+ if ($v_version ge v2.0.0
+ && property_ref('Decomposition_Mapping')->to_output_map) {
for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
use integer;
my $SIndex = $S - $SBase;
# the syntax is changed as well as the types to their later
# terminology. Otherwise normalize.pm would be very unhappy
# 5) Many ccc classes are different. These are left intact.
- # 6) U+FF10 - U+FF19 are missing their numeric values in all three
+ # 6) U+FF10..U+FF19 are missing their numeric values in all three
# fields. These are unchanged because it doesn't really cause
# problems for Perl.
# 7) A number of code points, such as controls, don't have their
- # Unicode Version 1 Names in this file. These are unchanged.
+ # Unicode Version 1 Names in this file. These are added.
+ # 8) A number of Symbols were marked as Lm. This changes those in
+ # the Latin1 range, so that regexes work.
+ # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are
+ # referred to by their lc equivalents. Not fixed.
my @corrected_lines = split /\n/, <<'END';
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
$file->insert_lines(@copy);
}
+ elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
+ # There are no Lm characters in Latin1; these should be 'Sk', but
+ # there isn't that in V1.
+ $fields[$CATEGORY] = 'So';
+ }
if ($fields[$NUMERIC] eq '-') {
$fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
# If is like '<+circled> 0052 <-circled>', convert to
# '<circled> 0052'
$fields[$PERL_DECOMPOSITION] =~
- s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
+ s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
# Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
$fields[$PERL_DECOMPOSITION] =~
# One entry has weird braces
$fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
+
+ # One entry at U+2116 has an extra <sup>
+ $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
}
$_ = join ';', $code_point, @fields;
return;
}
+ sub filter_bad_Nd_ucd {
+ # Early versions specified a value in the decimal digit field even
+ # though the code point wasn't a decimal digit. Clear the field in
+ # that situation, so that the main code doesn't think it is a decimal
+ # digit.
+
+ my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
+ if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
+ $fields[$PERL_DECIMAL_DIGIT] = "";
+ $_ = join ';', $code_point, @fields;
+ }
+ return;
+ }
+
+ my @U1_control_names = split /\n/, <<'END';
+NULL
+START OF HEADING
+START OF TEXT
+END OF TEXT
+END OF TRANSMISSION
+ENQUIRY
+ACKNOWLEDGE
+BELL
+BACKSPACE
+HORIZONTAL TABULATION
+LINE FEED
+VERTICAL TABULATION
+FORM FEED
+CARRIAGE RETURN
+SHIFT OUT
+SHIFT IN
+DATA LINK ESCAPE
+DEVICE CONTROL ONE
+DEVICE CONTROL TWO
+DEVICE CONTROL THREE
+DEVICE CONTROL FOUR
+NEGATIVE ACKNOWLEDGE
+SYNCHRONOUS IDLE
+END OF TRANSMISSION BLOCK
+CANCEL
+END OF MEDIUM
+SUBSTITUTE
+ESCAPE
+FILE SEPARATOR
+GROUP SEPARATOR
+RECORD SEPARATOR
+UNIT SEPARATOR
+DELETE
+BREAK PERMITTED HERE
+NO BREAK HERE
+INDEX
+NEXT LINE
+START OF SELECTED AREA
+END OF SELECTED AREA
+CHARACTER TABULATION SET
+CHARACTER TABULATION WITH JUSTIFICATION
+LINE TABULATION SET
+PARTIAL LINE DOWN
+PARTIAL LINE UP
+REVERSE LINE FEED
+SINGLE SHIFT TWO
+SINGLE SHIFT THREE
+DEVICE CONTROL STRING
+PRIVATE USE ONE
+PRIVATE USE TWO
+SET TRANSMIT STATE
+CANCEL CHARACTER
+MESSAGE WAITING
+START OF GUARDED AREA
+END OF GUARDED AREA
+START OF STRING
+SINGLE CHARACTER INTRODUCER
+CONTROL SEQUENCE INTRODUCER
+STRING TERMINATOR
+OPERATING SYSTEM COMMAND
+PRIVACY MESSAGE
+APPLICATION PROGRAM COMMAND
+END
+
+ sub filter_early_U1_names {
+ # Very early versions did not have the Unicode_1_name field specified.
+ # They differed in which ones were present; make sure a U1 name
+ # exists, so that Unicode::UCD::charinfo will work
+
+ my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
+
+
+ # @U1_control names above are entirely positional, so we pull them out
+ # in the exact order required, with gaps for the ones that don't have
+ # names.
+ if ($code_point =~ /^00[01]/
+ || $code_point eq '007F'
+ || $code_point =~ /^008[2-9A-F]/
+ || $code_point =~ /^009[0-8A-F]/)
+ {
+ my $u1_name = shift @U1_control_names;
+ $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
+ $_ = join ';', $code_point, @fields;
+ }
+ return;
+ }
+
sub filter_v2_1_5_ucd {
# A dozen entries in this 2.1.5 file had the mirrored and numeric
# columns swapped; These all had mirrored be 'N'. So if the numeric
# http://www.unicode.org/versions/corrigendum8.html
$fields[$BIDI] = "AL";
}
- elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
+ elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
$fields[$CHARNAME] = "";
}
my $lc; # Table for lowercase mapping
my $tc;
my $uc;
+ my %special_casing_code_points;
sub setup_special_casing {
# SpecialCasing.txt contains the non-simple case change mappings. The
# simple ones are in UnicodeData.txt, which should already have been
# read in to the full property data structures, so as to initialize
# these with the simple ones. Then the SpecialCasing.txt entries
- # overwrite the ones which have different full mappings.
+ # add or overwrite the ones which have different full mappings.
# This routine sees if the simple mappings are to be output, and if
# so, copies what has already been put into the full mapping tables,
# relatively few entries in them that have different full mappings,
# and thus skip the simple mapping tables altogether.
- # New tables with just the simple mappings that are overridden by the
- # full ones are constructed. These are for Unicode::UCD, which
- # requires the simple mappings. The Case_Folding table is a combined
- # table of both the simple and full mappings, with the full ones being
- # in the hash, and the simple ones, even those overridden by the hash,
- # being in the base table. That same mechanism could have been
- # employed here, except that the docs have said that the generated
- # files are usuable directly by programs, so we dare not change the
- # format in any way.
-
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Create a table in the old-style format and with the original
# file name for backwards compatibility with applications that
- # read it directly.
+ # read it directly. The new tables contain both the simple and
+ # full maps, and the old are missing simple maps when there is a
+ # conflicting full one. Probably it would have been ok to add
+ # those to the legacy version, as was already done in 5.14 to the
+ # case folding one, but this was not done, out of an abundance of
+ # caution. The tables are set up here before we deal with the
+ # full maps so that as we handle those, we can override the simple
+ # maps for them in the legacy table, and merely add them in the
+ # new-style one.
my $legacy = Property->new("Legacy_" . $full_table->full_name,
File => $full_table->full_name =~
s/case_Mapping//r,
Default_Map => $CODE_POINT,
UCD => 0,
Initialize => $full_table,
+ To_Output_Map => $EXTERNAL_MAP,
);
+ $full_table->add_comment(join_lines( <<END
+This file includes both the simple and full case changing maps. The simple
+ones are in the main body of the table below, and the full ones adding to or
+overriding them are in the hash.
+END
+ ));
+
# The simple version's name in each mapping merely has an 's' in
# front of the full one's
my $simple_name = 's' . $full_name;
my $simple = property_ref($simple_name);
$simple->initialize($full_table) if $simple->to_output_map();
+ }
- my $simple_only = Property->new("_s$full_name",
- Type => $STRING,
- Default_Map => $CODE_POINT,
- Perl_Extension => 1,
- Range_Size_1 => 1,
- Fate => $INTERNAL_ONLY,
- Description => "This contains the simple mappings for $full_name for just the code points that have different full mappings");
- $simple_only->set_to_output_map($INTERNAL_MAP);
- $simple_only->add_comment(join_lines( <<END
-This file is for UCD.pm so that it can construct simple mappings that would
-otherwise be lost because they are overridden by full mappings.
-END
- ));
+ return;
+ }
- unless ($simple->to_output_map()) {
- $simple_only->set_proxy_for($simple_name);
- }
+ sub filter_2_1_8_special_casing_line {
+
+ # This version had duplicate entries in this file. Delete all but the
+ # first one
+ my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
+ # fields
+ if (exists $special_casing_code_points{$fields[0]}) {
+ $_ = "";
+ return;
}
- return;
+ $special_casing_code_points{$fields[0]} = 1;
+ filter_special_casing_line(@_);
}
sub filter_special_casing_line {
. $object->name
. "(0x$fields[0]) is $value"
. " and SpecialCasing.txt thinks it is "
- . hex $fields[$i]
- . ". Good luck. Proceeding anyway.");
+ . hex($fields[$i])
+ . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
}
}
else {
- # The mapping goes into both the legacy table ...
+ # The mapping goes into both the legacy table, in which it
+ # replaces the simple one...
$file->insert_adjusted_lines("$fields[0]; Legacy_"
. $object->full_name
. "; $fields[$i]");
- # ... and, the The regular table
+ # ... and, the The regular table, in which it is additional,
+ # beyond the simple mapping.
$file->insert_adjusted_lines("$fields[0]; "
. $object->name
. "; "
+ . $CMD_DELIM
+ . "$REPLACE_CMD=$MULTIPLE_BEFORE"
+ . $CMD_DELIM
. $fields[$i]);
-
- # Copy any simple case change to the special tables
- # constructed if being overridden by a multi-character case
- # change.
- if ($value != $decimal_code_point) {
- $file->insert_adjusted_lines(sprintf("%s; _s%s; %04X",
- $fields[0],
- $object->name,
- $value));
- }
}
}
return;
}
- if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
+ if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent
$_ = "";
return;
}
# PropList.txt has been in Unicode since version 2.0. Until 3.1, it
# was in a completely different syntax. Ken Whistler of Unicode says
# that it was something he used as an aid for his own purposes, but
- # was never an official part of the standard. However, comments in
- # DAge.txt indicate that non-character code points were available in
- # the UCD as of 3.1. It is unclear to me (khw) how they could be
- # there except through this file (but on the other hand, they first
- # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
- # not. But the claim is that it was published as an aid to others who
- # might want some more information than was given in the official UCD
- # of the time. Many of the properties in it were incorporated into
- # the later PropList.txt, but some were not. This program uses this
- # early file to generate property tables that are otherwise not
- # accessible in the early UCD's, and most were probably not really
- # official at that time, so one could argue that it should be ignored,
- # and you can easily modify things to skip this. And there are bugs
- # in this file in various versions. (For example, the 2.1.9 version
- # removes from Alphabetic the CJK range starting at 4E00, and they
- # weren't added back in until 3.1.0.) Many of this file's properties
- # were later sanctioned, so this code generates tables for those
- # properties that aren't otherwise in the UCD of the time but
- # eventually did become official, and throws away the rest. Here is a
- # list of all the ones that are thrown away:
+ # was never an official part of the standard. Many of the properties
+ # in it were incorporated into the later PropList.txt, but some were
+ # not. This program uses this early file to generate property tables
+ # that are otherwise not accessible in the early UCD's. It does this
+ # for the ones that eventually became official, and don't appear to be
+ # too different in their contents from the later official version, and
+ # throws away the rest. It could be argued that the ones it generates
+ # were probably not really official at that time, so should be
+ # ignored. You can easily modify things to skip all of them by
+ # changing this function to just set $_ to "", and return; and to skip
+ # certain of them by by simply removing their declarations from
+ # get_old_property_aliases().
+ #
+ # Here is a list of all the ones that are thrown away:
+ # Alphabetic The definitions for this are very
+ # defective, so better to not mislead
+ # people into thinking it works.
+ # Instead the Perl extension of the
+ # same name is constructed from first
+ # principles.
# Bidi=* duplicates UnicodeData.txt
# Combining never made into official property;
# is \P{ccc=0}
# Space different definition than eventual
# one.
# Titlecase duplicates UnicodeData.txt: gc=lt
- # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
+ # Unassigned Code Value duplicates UnicodeData.txt: gc=cn
# Zero-width never made into official property;
# subset of gc=cf
# Most of the properties have the same names in this file as in later
# 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
my @fields = split /\s*;\s*/;
+
+ # This script was erroneously omitted in this Unicode version.
+ $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
+
my @full_names;
foreach my $short_name (split " ", $fields[1]) {
push @full_names, $script->table($short_name)->full_name;
}
sub setup_early_name_alias {
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # This has the effect of pretending that the Name_Alias property was
+ # available in all Unicode releases. Strictly speaking, this property
+ # should not be availabe in early releases, but doing this allows
+ # charnames.pm to work on older releases without change. Prior to v5.16
+ # it had these names hard-coded inside it. Unicode 6.1 came along and
+ # created these names, and so they were removed from charnames.
+
my $aliases = property_ref('Name_Alias');
- $aliases = Property->new('Name_Alias') if ! defined $aliases;
+ if (! defined $aliases) {
+ $aliases = Property->new('Name_Alias', Default_Map => "");
+ }
+
+ $file->insert_lines(get_old_name_aliases());
- # Before 6.0, this wasn't a problem, and after it, this alias is part of
- # the Unicode-delivered file.
- $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
return;
}
+sub get_old_name_aliases () {
+
+ # The Unicode_1_Name field, contains most of these names. One would
+ # expect, given the field's name, that its values would be fixed across
+ # versions, giving the true Unicode version 1 name for the character.
+ # Sadly, this is not the case. Actually Version 1.1.5 had no names for
+ # any of the controls; Version 2.0 introduced names for the C0 controls,
+ # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
+ # changed some names: it
+ # changed to parenthesized versions like "NEXT LINE" to
+ # "NEXT LINE (NEL)";
+ # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
+ # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
+ # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
+ # This list contains all the names that were defined so that
+ # charnames::vianame(), etc. understand them all EVEN if this version of
+ # Unicode didn't specify them (this could be construed as a bug).
+ # mktables elsewhere gives preference to the Unicode_1_Name field over
+ # these names, so that viacode() will return the correct value for that
+ # version of Unicode, except when that version doesn't define a name,
+ # viacode() will return one anyway (this also could be construed as a
+ # bug). But these potential "bugs" allow for the smooth working of code
+ # on earlier Unicode releases.
+
+ my @return = split /\n/, <<'END';
+0000;NULL;control
+0000;NUL;abbreviation
+0001;START OF HEADING;control
+0001;SOH;abbreviation
+0002;START OF TEXT;control
+0002;STX;abbreviation
+0003;END OF TEXT;control
+0003;ETX;abbreviation
+0004;END OF TRANSMISSION;control
+0004;EOT;abbreviation
+0005;ENQUIRY;control
+0005;ENQ;abbreviation
+0006;ACKNOWLEDGE;control
+0006;ACK;abbreviation
+0007;BELL;control
+0007;BEL;abbreviation
+0008;BACKSPACE;control
+0008;BS;abbreviation
+0009;CHARACTER TABULATION;control
+0009;HORIZONTAL TABULATION;control
+0009;HT;abbreviation
+0009;TAB;abbreviation
+000A;LINE FEED;control
+000A;LINE FEED (LF);control
+000A;NEW LINE;control
+000A;END OF LINE;control
+000A;LF;abbreviation
+000A;NL;abbreviation
+000A;EOL;abbreviation
+000B;LINE TABULATION;control
+000B;VERTICAL TABULATION;control
+000B;VT;abbreviation
+000C;FORM FEED;control
+000C;FORM FEED (FF);control
+000C;FF;abbreviation
+000D;CARRIAGE RETURN;control
+000D;CARRIAGE RETURN (CR);control
+000D;CR;abbreviation
+000E;SHIFT OUT;control
+000E;LOCKING-SHIFT ONE;control
+000E;SO;abbreviation
+000F;SHIFT IN;control
+000F;LOCKING-SHIFT ZERO;control
+000F;SI;abbreviation
+0010;DATA LINK ESCAPE;control
+0010;DLE;abbreviation
+0011;DEVICE CONTROL ONE;control
+0011;DC1;abbreviation
+0012;DEVICE CONTROL TWO;control
+0012;DC2;abbreviation
+0013;DEVICE CONTROL THREE;control
+0013;DC3;abbreviation
+0014;DEVICE CONTROL FOUR;control
+0014;DC4;abbreviation
+0015;NEGATIVE ACKNOWLEDGE;control
+0015;NAK;abbreviation
+0016;SYNCHRONOUS IDLE;control
+0016;SYN;abbreviation
+0017;END OF TRANSMISSION BLOCK;control
+0017;ETB;abbreviation
+0018;CANCEL;control
+0018;CAN;abbreviation
+0019;END OF MEDIUM;control
+0019;EOM;abbreviation
+001A;SUBSTITUTE;control
+001A;SUB;abbreviation
+001B;ESCAPE;control
+001B;ESC;abbreviation
+001C;INFORMATION SEPARATOR FOUR;control
+001C;FILE SEPARATOR;control
+001C;FS;abbreviation
+001D;INFORMATION SEPARATOR THREE;control
+001D;GROUP SEPARATOR;control
+001D;GS;abbreviation
+001E;INFORMATION SEPARATOR TWO;control
+001E;RECORD SEPARATOR;control
+001E;RS;abbreviation
+001F;INFORMATION SEPARATOR ONE;control
+001F;UNIT SEPARATOR;control
+001F;US;abbreviation
+0020;SP;abbreviation
+007F;DELETE;control
+007F;DEL;abbreviation
+0080;PADDING CHARACTER;figment
+0080;PAD;abbreviation
+0081;HIGH OCTET PRESET;figment
+0081;HOP;abbreviation
+0082;BREAK PERMITTED HERE;control
+0082;BPH;abbreviation
+0083;NO BREAK HERE;control
+0083;NBH;abbreviation
+0084;INDEX;control
+0084;IND;abbreviation
+0085;NEXT LINE;control
+0085;NEXT LINE (NEL);control
+0085;NEL;abbreviation
+0086;START OF SELECTED AREA;control
+0086;SSA;abbreviation
+0087;END OF SELECTED AREA;control
+0087;ESA;abbreviation
+0088;CHARACTER TABULATION SET;control
+0088;HORIZONTAL TABULATION SET;control
+0088;HTS;abbreviation
+0089;CHARACTER TABULATION WITH JUSTIFICATION;control
+0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
+0089;HTJ;abbreviation
+008A;LINE TABULATION SET;control
+008A;VERTICAL TABULATION SET;control
+008A;VTS;abbreviation
+008B;PARTIAL LINE FORWARD;control
+008B;PARTIAL LINE DOWN;control
+008B;PLD;abbreviation
+008C;PARTIAL LINE BACKWARD;control
+008C;PARTIAL LINE UP;control
+008C;PLU;abbreviation
+008D;REVERSE LINE FEED;control
+008D;REVERSE INDEX;control
+008D;RI;abbreviation
+008E;SINGLE SHIFT TWO;control
+008E;SINGLE-SHIFT-2;control
+008E;SS2;abbreviation
+008F;SINGLE SHIFT THREE;control
+008F;SINGLE-SHIFT-3;control
+008F;SS3;abbreviation
+0090;DEVICE CONTROL STRING;control
+0090;DCS;abbreviation
+0091;PRIVATE USE ONE;control
+0091;PRIVATE USE-1;control
+0091;PU1;abbreviation
+0092;PRIVATE USE TWO;control
+0092;PRIVATE USE-2;control
+0092;PU2;abbreviation
+0093;SET TRANSMIT STATE;control
+0093;STS;abbreviation
+0094;CANCEL CHARACTER;control
+0094;CCH;abbreviation
+0095;MESSAGE WAITING;control
+0095;MW;abbreviation
+0096;START OF GUARDED AREA;control
+0096;START OF PROTECTED AREA;control
+0096;SPA;abbreviation
+0097;END OF GUARDED AREA;control
+0097;END OF PROTECTED AREA;control
+0097;EPA;abbreviation
+0098;START OF STRING;control
+0098;SOS;abbreviation
+0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
+0099;SGC;abbreviation
+009A;SINGLE CHARACTER INTRODUCER;control
+009A;SCI;abbreviation
+009B;CONTROL SEQUENCE INTRODUCER;control
+009B;CSI;abbreviation
+009C;STRING TERMINATOR;control
+009C;ST;abbreviation
+009D;OPERATING SYSTEM COMMAND;control
+009D;OSC;abbreviation
+009E;PRIVACY MESSAGE;control
+009E;PM;abbreviation
+009F;APPLICATION PROGRAM COMMAND;control
+009F;APC;abbreviation
+00A0;NBSP;abbreviation
+00AD;SHY;abbreviation
+200B;ZWSP;abbreviation
+200C;ZWNJ;abbreviation
+200D;ZWJ;abbreviation
+200E;LRM;abbreviation
+200F;RLM;abbreviation
+202A;LRE;abbreviation
+202B;RLE;abbreviation
+202C;PDF;abbreviation
+202D;LRO;abbreviation
+202E;RLO;abbreviation
+FEFF;BYTE ORDER MARK;alternate
+FEFF;BOM;abbreviation
+FEFF;ZWNBSP;abbreviation
+END
+
+ if ($v_version ge v3.0.0) {
+ push @return, split /\n/, <<'END';
+180B; FVS1; abbreviation
+180C; FVS2; abbreviation
+180D; FVS3; abbreviation
+180E; MVS; abbreviation
+202F; NNBSP; abbreviation
+END
+ }
+
+ if ($v_version ge v3.2.0) {
+ push @return, split /\n/, <<'END';
+034F; CGJ; abbreviation
+205F; MMSP; abbreviation
+2060; WJ; abbreviation
+END
+ # Add in VS1..VS16
+ my $cp = 0xFE00 - 1;
+ for my $i (1..16) {
+ push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+ }
+ }
+ if ($v_version ge v4.0.0) { # Add in VS17..VS256
+ my $cp = 0xE0100 - 17;
+ for my $i (17..256) {
+ push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+ }
+ }
+
+ # ALERT did not come along until 6.0, at which point it became preferred
+ # over BELL, and was never in the Unicode_1_Name field. For the same
+ # reasons, that the other names are made known to all releases by this
+ # function, we make ALERT known too. By inserting it
+ # last in early releases, BELL is preferred over it; and vice-vers in 6.0
+ my $alert = '0007; ALERT; control';
+ if ($v_version lt v6.0.0) {
+ push @return, $alert;
+ }
+ else {
+ unshift @return, $alert;
+ }
+
+ return @return;
+}
+
sub filter_later_version_name_alias_line {
# This file has an extra entry per line for the alias type. This is
sub filter_early_version_name_alias_line {
# Early versions did not have the trailing alias type field; implicitly it
- # was 'correction'
- $_ .= "; correction";
+ # was 'correction'. But our synthetic lines we add in this program do
+ # have it, so test for the type field.
+ $_ .= "; correction" if $_ !~ /;.*;/;
+
filter_later_version_name_alias_line;
return;
}
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
- # 1) Adds the mappings for code points missing from the files which have
+ # 1) Creates properties that are missing from the version of Unicode being
+ # compiled, and which, for whatever reason, are needed for the Perl
+ # core to function properly. These are minimally populated as
+ # necessary.
+ # 2) Adds the mappings for code points missing from the files which have
# defaults specified for them.
- # 2) At this this point all mappings are known, so it computes the type of
+ # 3) At this this point all mappings are known, so it computes the type of
# each property whose type hasn't been determined yet.
- # 3) Calculates all the regular expression match tables based on the
+ # 4) Calculates all the regular expression match tables based on the
# mappings.
- # 3) Calculates and adds the tables which are defined by Unicode, but
+ # 5) Calculates and adds the tables which are defined by Unicode, but
# which aren't derived by them, and certain derived tables that Perl
# uses.
+ # Folding information was introduced later into Unicode data. To get
+ # Perl's case ignore (/i) to work at all in releases that don't have
+ # folding, use the best available alternative, which is lower casing.
+ my $fold = property_ref('Case_Folding');
+ if ($fold->is_empty) {
+ $fold->initialize(property_ref('Lowercase_Mapping'));
+ $fold->add_note(join_lines(<<END
+WARNING: This table uses lower case as a substitute for missing fold
+information
+END
+ ));
+ }
+
+ # Multiple-character mapping was introduced later into Unicode data, so it
+ # is by default the simple version. If to output the simple versions and
+ # not present, just use the regular (which in these Unicode versions is
+ # the simple as well).
+ foreach my $map (qw { Uppercase_Mapping
+ Lowercase_Mapping
+ Titlecase_Mapping
+ Case_Folding
+ } )
+ {
+ my $simple = property_ref("Simple_$map");
+ next if ! $simple->is_empty;
+ if ($simple->to_output_map) {
+ $simple->initialize(property_ref($map));
+ }
+ else {
+ property_ref($map)->set_proxy_for($simple->name);
+ }
+ }
+
+ # GCB and hst are not in early Unicode releases; create dummy ones if
+ # they don't exist, as the core needs tables generated from them.
+ my $gcb = property_ref('Grapheme_Cluster_Break');
+ if (! defined $gcb) {
+ $gcb = Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
+ Status => $PLACEHOLDER,
+ Type => $ENUM,
+ Default_Map => 'Other');
+ }
+ my $hst = property_ref('Hangul_Syllable_Type');
+ if (!defined $hst) {
+ $hst = Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
+ Status => $PLACEHOLDER,
+ Type => $ENUM,
+ Default_Map => 'Not_Applicable');
+ }
+
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
# default is a reference to a data structure, and requires data from other
my $Cs = $gc->table('Cs');
-
- # Folding information was introduced later into Unicode data. To get
- # Perl's case ignore (/i) to work at all in releases that don't have
- # folding, use the best available alternative, which is lower casing.
- my $fold = property_ref('Simple_Case_Folding');
- if ($fold->is_empty) {
- $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
- $fold->add_note(join_lines(<<END
-WARNING: This table uses lower case as a substitute for missing fold
-information
-END
- ));
- }
-
- # Multiple-character mapping was introduced later into Unicode data. If
- # missing, use the single-characters maps as best available alternative
- foreach my $map (qw { Uppercase_Mapping
- Lowercase_Mapping
- Titlecase_Mapping
- Case_Folding
- } )
- {
- my $full = property_ref($map);
- if ($full->is_empty) {
- my $simple = property_ref('Simple_' . $map);
- $full->initialize($simple);
- $full->add_comment($simple->comment) if ($simple->comment);
- $full->add_note(join_lines(<<END
-WARNING: This table uses simple mapping (single-character only) as a
-substitute for missing multiple-character information
-END
- ));
- }
- }
-
# Create digit and case fold tables with the original file names for
# backwards compatibility with applications that read them directly.
my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
Directory => $map_directory,
UCD => 0,
Type => $STRING,
+ To_Output_Map => $EXTERNAL_MAP,
Range_Size_1 => 1,
Initialize => property_ref('Perl_Decimal_Digit'),
);
UCD => 0,
Range_Size_1 => 1,
Type => $STRING,
+ To_Output_Map => $EXTERNAL_MAP,
Format => $HEX_FORMAT,
Initialize => property_ref('cf'),
);
return;
}
+sub pre_3_dot_1_Nl () {
+
+ # Return a range list for gc=nl for Unicode versions prior to 3.1, which
+ # is when Unicode's became fully usable. These code points were
+ # determined by inspection and experimentation. gc=nl is important for
+ # certain Perl-extension properties that should be available in all
+ # releases.
+
+ my $Nl = Range_List->new();
+ if (defined (my $official = $gc->table('Nl'))) {
+ $Nl += $official;
+ }
+ else {
+ $Nl->add_range(0x2160, 0x2182);
+ $Nl->add_range(0x3007, 0x3007);
+ $Nl->add_range(0x3021, 0x3029);
+ }
+ $Nl->add_range(0xFE20, 0xFE23);
+ $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
+ # these were added
+ return $Nl;
+}
+
sub compile_perl() {
# Create perl-defined tables. Almost all are part of the pseudo-property
# named 'perl' internally to this program. Many of these are recommended
# Very early releases didn't have blocks, so initialize ASCII ourselves if
# necessary
if ($ASCII->is_empty) {
- $ASCII->initialize([ 0..127 ]);
+ $ASCII->add_range(0, 127);
}
# Get the best available case definitions. Early Unicode versions didn't
# have Uppercase and Lowercase defined, so use the general category
- # instead for them.
+ # instead for them, modified by hard-coding in the code points each is
+ # missing.
my $Lower = $perl->add_match_table('Lower');
my $Unicode_Lower = property_ref('Lowercase');
if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
$Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
- $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
- $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
- $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
}
else {
- $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
- Related => 1);
+ $Lower += $gc->table('Lowercase_Letter');
+
+ # There are quite a few code points in Lower, that aren't in gc=lc,
+ # and not all are in all releases.
+ foreach my $code_point ( 0x00AA,
+ 0x00BA,
+ 0x02B0 .. 0x02B8,
+ 0x02C0 .. 0x02C1,
+ 0x02E0 .. 0x02E4,
+ 0x0345,
+ 0x037A,
+ 0x1D2C .. 0x1D6A,
+ 0x1D78,
+ 0x1D9B .. 0x1DBF,
+ 0x2071,
+ 0x207F,
+ 0x2090 .. 0x209C,
+ 0x2170 .. 0x217F,
+ 0x24D0 .. 0x24E9,
+ 0x2C7C .. 0x2C7D,
+ 0xA770,
+ 0xA7F8 .. 0xA7F9,
+ ) {
+ # Don't include the code point unless it is assigned in this
+ # release
+ my $category = $gc->value_of(hex $code_point);
+ next if ! defined $category || $category eq 'Cn';
+
+ $Lower += $code_point;
+ }
}
$Lower->add_alias('XPosixLower');
my $Posix_Lower = $perl->add_match_table("PosixLower",
my $Unicode_Upper = property_ref('Uppercase');
if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
$Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
- $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
- $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
- $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
}
else {
- $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
- Related => 1);
+
+ # Unlike Lower, there are only two ranges in Upper that aren't in
+ # gc=Lu, and all code points were assigned in all releases.
+ $Upper += $gc->table('Uppercase_Letter');
+ $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
+ $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
}
$Upper->add_alias('XPosixUpper');
my $Posix_Upper = $perl->add_match_table("PosixUpper",
# identical code points, but their caseless equivalents are not the same,
# one being 'Cased' and the other being 'LC', and so now must be kept as
# separate entities.
- $Title += $lt if defined $lt;
+ if (defined $lt) {
+ $Title += $lt;
+ }
+ else {
+ push @tables_that_may_be_empty, $Title->complete_name;
+ }
- # If this Unicode version doesn't have Cased, set up our own. From
- # Unicode 5.1: Definition D120: A character C is defined to be cased if
- # and only if C has the Lowercase or Uppercase property or has a
- # General_Category value of Titlecase_Letter.
my $Unicode_Cased = property_ref('Cased');
- unless (defined $Unicode_Cased) {
+ if (defined $Unicode_Cased) {
+ my $yes = $Unicode_Cased->table('Y');
+ my $no = $Unicode_Cased->table('N');
+ $Title->set_caseless_equivalent($yes);
+ if (defined $Unicode_Upper) {
+ $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
+ $Unicode_Upper->table('N')->set_caseless_equivalent($no);
+ }
+ $Upper->set_caseless_equivalent($yes);
+ if (defined $Unicode_Lower) {
+ $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
+ $Unicode_Lower->table('N')->set_caseless_equivalent($no);
+ }
+ $Lower->set_caseless_equivalent($yes);
+ }
+ else {
+ # If this Unicode version doesn't have Cased, set up the Perl
+ # extension from first principles. From Unicode 5.1: Definition D120:
+ # A character C is defined to be cased if and only if C has the
+ # Lowercase or Uppercase property or has a General_Category value of
+ # Titlecase_Letter.
my $cased = $perl->add_match_table('Cased',
Initialize => $Lower + $Upper + $Title,
Description => 'Uppercase or Lowercase or Titlecase',
);
- $Unicode_Cased = $cased;
+ # $notcased is purely for the caseless equivalents below
+ my $notcased = $perl->add_match_table('_Not_Cased',
+ Initialize => ~ $cased,
+ Fate => $INTERNAL_ONLY,
+ Description => 'All not-cased code points');
+ $Title->set_caseless_equivalent($cased);
+ if (defined $Unicode_Upper) {
+ $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
+ $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
+ }
+ $Upper->set_caseless_equivalent($cased);
+ if (defined $Unicode_Lower) {
+ $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
+ $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
+ }
+ $Lower->set_caseless_equivalent($cased);
}
- $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
# Similarly, set up our own Case_Ignorable property if this Unicode
# version doesn't have it. From Unicode 5.1: Definition D121: A character
}
else {
- # For early releases, we don't get it exactly right. The below
- # includes more than it should, which in 5.2 terms is: L + Nl +
- # Other_Alphabetic. Other_Alphabetic contains many characters from
- # Mn and Mc. It's better to match more than we should, than less than
- # we should.
+ # The Alphabetic property doesn't exist for early releases, so
+ # generate it. The actual definition, in 5.2 terms is:
+ #
+ # gc=L + gc=Nl + Other_Alphabetic
+ #
+ # Other_Alphabetic is also not defined in these early releases, but it
+ # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
+ # those last two as well, then subtract the relatively few of them that
+ # shouldn't have been added. (The gc=So range is the circled capital
+ # Latin characters. Early releases mistakenly didn't also include the
+ # lower-case versions of these characters, and so we don't either, to
+ # maintain consistency with those releases that first had this
+ # property.
$Alpha->initialize($gc->table('Letter')
- + $gc->table('Mn')
- + $gc->table('Mc'));
- $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
+ + pre_3_dot_1_Nl()
+ + $gc->table('Mn')
+ + $gc->table('Mc')
+ );
+ $Alpha->add_range(0x24D0, 0x24E9); # gc=So
+ foreach my $range ( [ 0x0300, 0x0344 ],
+ [ 0x0346, 0x034E ],
+ [ 0x0360, 0x0362 ],
+ [ 0x0483, 0x0486 ],
+ [ 0x0591, 0x05AF ],
+ [ 0x06DF, 0x06E0 ],
+ [ 0x06EA, 0x06EC ],
+ [ 0x0740, 0x074A ],
+ 0x093C,
+ 0x094D,
+ [ 0x0951, 0x0954 ],
+ 0x09BC,
+ 0x09CD,
+ 0x0A3C,
+ 0x0A4D,
+ 0x0ABC,
+ 0x0ACD,
+ 0x0B3C,
+ 0x0B4D,
+ 0x0BCD,
+ 0x0C4D,
+ 0x0CCD,
+ 0x0D4D,
+ 0x0DCA,
+ [ 0x0E47, 0x0E4C ],
+ 0x0E4E,
+ [ 0x0EC8, 0x0ECC ],
+ [ 0x0F18, 0x0F19 ],
+ 0x0F35,
+ 0x0F37,
+ 0x0F39,
+ [ 0x0F3E, 0x0F3F ],
+ [ 0x0F82, 0x0F84 ],
+ [ 0x0F86, 0x0F87 ],
+ 0x0FC6,
+ 0x1037,
+ 0x1039,
+ [ 0x17C9, 0x17D3 ],
+ [ 0x20D0, 0x20DC ],
+ 0x20E1,
+ [ 0x302A, 0x302F ],
+ [ 0x3099, 0x309A ],
+ [ 0xFE20, 0xFE23 ],
+ [ 0x1D165, 0x1D169 ],
+ [ 0x1D16D, 0x1D172 ],
+ [ 0x1D17B, 0x1D182 ],
+ [ 0x1D185, 0x1D18B ],
+ [ 0x1D1AA, 0x1D1AD ],
+ ) {
+ if (ref $range) {
+ $Alpha->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $Alpha->delete_range($range, $range);
+ }
+ }
$Alpha->add_description('Alphabetic');
+ $Alpha->add_alias('Alphabetic');
}
$Alpha->add_alias('XPosixAlpha');
my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
);
$Word->add_alias('XPosixWord');
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
- $Word += $Pc if defined $Pc;
+ if (defined $Pc) {
+ $Word += $Pc;
+ }
+ else {
+ $Word += ord('_'); # Make sure this is a $Word
+ }
# This is a Perl extension, so the name doesn't begin with Posix.
my $PerlWord = $perl->add_match_table('PerlWord',
# Perl's traditional space doesn't include Vertical Tab
my $XPerlSpace = $perl->add_match_table('XPerlSpace',
Description => '\s, including beyond ASCII',
- Initialize => $Space - 0x000B,
+ #Initialize => $Space - 0x000B,
+ Initialize => $Space,
);
$XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
my $PerlSpace = $perl->add_match_table('PerlSpace',
- Description => '\s, restricted to ASCII = [ \f\n\r\t]',
+ Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
Initialize => $XPerlSpace & $ASCII,
);
}
else {
$PosixXDigit->initialize($Xdigit & $ASCII);
+ $PosixXDigit->add_alias('AHex');
+ $PosixXDigit->add_alias('Ascii_Hex_Digit');
}
$PosixXDigit->add_description('[0-9A-Fa-f]');
}
else {
- # This list came from 3.2 Soft_Dotted.
+ # This list came from 3.2 Soft_Dotted; all of these code points are in
+ # all releases
$CanonDCIJ->initialize([ 0x0069,
0x006A,
0x012F,
$CanonDCIJ = $CanonDCIJ & $Assigned;
}
+ # For backward compatibility, Perl has its own definition for IDStart.
+ # It is regular XID_Start plus the underscore, but all characters must be
+ # Word characters as well
+ my $XID_Start = property_ref('XID_Start');
+ my $perl_xids = $perl->add_match_table('_Perl_IDStart',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => ord('_')
+ );
+ if (defined $XID_Start
+ || defined ($XID_Start = property_ref('ID_Start')))
+ {
+ $perl_xids += $XID_Start->table('Y');
+ }
+ else {
+ # For Unicode versions that don't have the property, construct our own
+ # from first principles. The actual definition is:
+ # Letters
+ # + letter numbers (Nl)
+ # - Pattern_Syntax
+ # - Pattern_White_Space
+ # + stability extensions
+ # - NKFC modifications
+ #
+ # What we do in the code below is to include the identical code points
+ # that are in the first release that had Unicode's version of this
+ # property, essentially extrapolating backwards. There were no
+ # stability extensions until v4.1, so none are included; likewise in
+ # no Unicode version so far do subtracting PatSyn and PatWS make any
+ # difference, so those also are ignored.
+ $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
+
+ # We do subtract the NFKC modifications that are in the first version
+ # that had this property. We don't bother to test if they are in the
+ # version in question, because if they aren't, the operation is a
+ # no-op. The NKFC modifications are discussed in
+ # http://www.unicode.org/reports/tr31/#NFKC_Modifications
+ foreach my $range ( 0x037A,
+ 0x0E33,
+ 0x0EB3,
+ [ 0xFC5E, 0xFC63 ],
+ [ 0xFDFA, 0xFE70 ],
+ [ 0xFE72, 0xFE76 ],
+ 0xFE78,
+ 0xFE7A,
+ 0xFE7C,
+ 0xFE7E,
+ [ 0xFF9E, 0xFF9F ],
+ ) {
+ if (ref $range) {
+ $perl_xids->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $perl_xids->delete_range($range, $range);
+ }
+ }
+ }
+
+ $perl_xids &= $Word;
+
+ my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ my $XIDC = property_ref('XID_Continue');
+ if (defined $XIDC
+ || defined ($XIDC = property_ref('ID_Continue')))
+ {
+ $perl_xidc += $XIDC->table('Y');
+ }
+ else {
+ # Similarly, we construct our own XIDC if necessary for early Unicode
+ # versions. The definition is:
+ # everything in XIDS
+ # + Gc=Mn
+ # + Gc=Mc
+ # + Gc=Nd
+ # + Gc=Pc
+ # - Pattern_Syntax
+ # - Pattern_White_Space
+ # + stability extensions
+ # - NFKC modifications
+ #
+ # The same thing applies to this as with XIDS for the PatSyn, PatWS,
+ # and stability extensions. There is a somewhat different set of NFKC
+ # mods to remove (and add in this case). The ones below make this
+ # have identical code points as in the first release that defined it.
+ $perl_xidc += $perl_xids
+ + $gc->table('L')
+ + $gc->table('Mn')
+ + $gc->table('Mc')
+ + $gc->table('Nd')
+ + 0x00B7
+ ;
+ if (defined (my $pc = $gc->table('Pc'))) {
+ $perl_xidc += $pc;
+ }
+ else { # 1.1.5 didn't have Pc, but these should have been in it
+ $perl_xidc += 0xFF3F;
+ $perl_xidc->add_range(0x203F, 0x2040);
+ $perl_xidc->add_range(0xFE33, 0xFE34);
+ $perl_xidc->add_range(0xFE4D, 0xFE4F);
+ }
+
+ # Subtract the NFKC mods
+ foreach my $range ( 0x037A,
+ [ 0xFC5E, 0xFC63 ],
+ [ 0xFDFA, 0xFE1F ],
+ 0xFE70,
+ [ 0xFE72, 0xFE76 ],
+ 0xFE78,
+ 0xFE7A,
+ 0xFE7C,
+ 0xFE7E,
+ ) {
+ if (ref $range) {
+ $perl_xidc->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $perl_xidc->delete_range($range, $range);
+ }
+ }
+ }
+
+ $perl_xidc &= $Word;
+
+ my $gcb = property_ref('Grapheme_Cluster_Break');
# These are used in Unicode's definition of \X
my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
Fate => $INTERNAL_ONLY);
my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
Fate => $INTERNAL_ONLY);
- # For backward compatibility, Perl has its own definition for IDStart
- # First, we include the underscore, and then the regular XID_Start also
- # have to be Words
- $perl->add_match_table('_Perl_IDStart',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize =>
- ord('_')
- + (property_ref('XID_Start')->table('Y') & $Word)
- );
-
- my $gcb = property_ref('Grapheme_Cluster_Break');
-
# The 'extended' grapheme cluster came in 5.1. The non-extended
# definition differs too much from the traditional Perl one to use.
- if (defined $gcb && defined $gcb->table('SpacingMark')) {
+ if (defined $gcb->table('Control')) {
- # Note that assumes HST is defined; it came in an earlier release than
+ # Note that assumes hst is defined; it came in an earlier release than
# GCB. In the line below, two negatives means: yes hangul
$begin += ~ property_ref('Hangul_Syllable_Type')
->table('Not_Applicable')
+ $gcb->table('LF'));
$begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
- $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
- $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
+ $extend += $gcb->table('Extend');
+ my $comment = 'For use in \X; matches: Extend';
+ if (defined $gcb->table('SpacingMark')) {
+ $extend += $gcb->table('SpacingMark');
+ $comment .= ' | SpacingMark';
+ }
+ $extend->add_comment($comment);
+
+ if (!defined $gcb->table('Prepend')) {
+ my $table = $gcb->add_match_table('Prepend');
+ push @tables_that_may_be_empty, $table->complete_name;
+ }
}
else { # Old definition, used on early releases.
$extend += $gc->table('Mark')
# We set things up so the Perl core degrades gracefully, possibly with
# placeholders that match nothing.
- if (! defined $gcb) {
- $gcb = Property->new('GCB', Status => $PLACEHOLDER);
- }
- my $hst = property_ref('HST');
- if (!defined $hst) {
- $hst = Property->new('HST', Status => $PLACEHOLDER);
- $hst->add_match_table('Not_Applicable',
- Initialize => $Any,
- Matches_All => 1);
- }
+ my $hst = property_ref('Hangul_Syllable_Type');
# On some releases, here we may not have the needed tables for the
# perl core, in some releases we may.
push @tables_that_may_be_empty, $table->complete_name;
}
- # The HST property predates the GCB one, and has identical tables
+ # The hst property predates the GCB one, and has identical tables
# for some of them, so use it if we can.
- if ($table->is_empty
- && defined $hst
- && defined $hst->table($name))
+ if ($table->is_empty && defined $hst->table($name))
{
$table += $hst->table($name);
}
push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
} else {
$lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
+ $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
}
- # Was previously constructed to contain both Name and Unicode_1_Name
+ # Was earlier constructed to contain both Name and Unicode_1_Name
my @composition = ('Name', 'Unicode_1_Name');
if (@named_sequences) {
}
my $alias_sentence = "";
+ my %abbreviations;
my $alias = property_ref('Name_Alias');
- if (defined $alias) {
- push @composition, 'Name_Alias';
- $perl_charname->set_proxy_for('Name_Alias');
- my $unicode_1 = property_ref('Unicode_1_Name');
- my %abbreviations;
-
- # Add each entry in Name_Alias to Perl_Charnames. Where these go with
- # respect to any existing entry depends on the entry type.
- # Corrections go before said entry, as they should be returned in
- # preference over the existing entry. (A correction to a correction
- # should be later in the Name_Alias table, so it will correctly
- # precede the erroneous correction in Perl_Charnames.)
- #
- # Abbreviations go after everything else, so they are saved
- # temporarily in a hash for later.
- #
- # Controls are currently added afterwards. This is because Perl has
- # previously used the Unicode1 name, and so should still use that.
- # (Most of them will be the same anyway, in which case we don't add a
- # duplicate)
-
- $alias->reset_each_range;
- while (my ($range) = $alias->each_range) {
- next if $range->value eq "";
- my $code_point = $range->start;
- if ($code_point != $range->end) {
- Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
- }
- my ($value, $type) = split ': ', $range->value;
- my $replace_type;
- if ($type eq 'correction') {
- $replace_type = $MULTIPLE_BEFORE;
- }
- elsif ($type eq 'abbreviation') {
-
- # Save for later
- $abbreviations{$value} = $code_point;
- next;
- }
- elsif ($type eq 'control') {
- my $unicode_1_value = $unicode_1->value_of($code_point);
- next if $unicode_1_value eq $value;
- $replace_type = $MULTIPLE_AFTER;
- }
- else {
- $replace_type = $MULTIPLE_AFTER;
- }
-
- # Actually add; before or after current entry(ies) as determined
- # above.
- $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type. Corrections go
+ # before said entry, as they should be returned in preference over the
+ # existing entry. (A correction to a correction should be later in the
+ # Name_Alias table, so it will correctly precede the erroneous correction
+ # in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved temporarily in
+ # a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that. (Most
+ # of them will be the same anyway, in which case we don't add a duplicate)
+
+ $alias->reset_each_range;
+ while (my ($range) = $alias->each_range) {
+ next if $range->value eq "";
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
+ }
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
}
-
- # Now that have everything added, add in abbreviations after
- # everything else.
- foreach my $value (keys %abbreviations) {
- $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
+ else {
+ $replace_type = $MULTIPLE_AFTER;
}
- $alias_sentence = <<END;
+
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+ $alias_sentence = <<END;
The Name_Alias property adds duplicate code point entries that are
alternatives to the original name. If an addition is a corrected
name, it will be physically first in the table. The original (less correct,
but still valid) name will be next; then any alternatives, in no particular
order; and finally any abbreviations, again in no particular order.
END
+
+ # Now add the Unicode_1 names for the controls. The Unicode_1 names had
+ # precedence before 6.1, so should be first in the file; the other names
+ # have precedence starting in 6.1,
+ my $before_or_after = ($v_version lt v6.1.0)
+ ? $MULTIPLE_BEFORE
+ : $MULTIPLE_AFTER;
+
+ foreach my $range (property_ref('Unicode_1_Name')->ranges) {
+ my $code_point = $range->start;
+ my $unicode_1_value = $range->value;
+ next if $unicode_1_value eq ""; # Skip if name doesn't exist.
+
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+
+ # To handle EBCDIC, we don't hard code in the code points of the
+ # controls; instead realizing that all of them are below 256.
+ last if $code_point > 255;
+
+ # We only add in the controls.
+ next if $gc->value_of($code_point) ne 'Cc';
+
+ # This won't add an exact duplicate.
+ $perl_charname->add_duplicate($code_point, $unicode_1_value,
+ Replace => $before_or_after);
+ }
+
+ # But in this version only, the ALERT has precedence over BELL, the
+ # Unicode_1_Name that would otherwise have precedence.
+ if ($v_version eq v6.0.0) {
+ $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value,
+ Replace => $MULTIPLE_AFTER);
}
my $comment;
$unassigned->set_equivalent_to($age_default, Related => 1);
}
+ # See L<perlfunc/quotemeta>
+ my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+
+ # Initialize to what's common in
+ # all Unicode releases.
+ Initialize =>
+ $Space
+ + $gc->table('Control')
+ );
+
+ # In early releases without the proper Unicode properties, just set to \W.
+ if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
+ || ! defined (my $patws = property_ref('Pattern_White_Space'))
+ || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
+ {
+ $quotemeta += ~ $Word;
+ }
+ else {
+ $quotemeta += $patsyn->table('Y')
+ + $patws->table('Y')
+ + $di->table('Y')
+ + ((~ $Word) & $ASCII);
+ }
# Finished creating all the perl properties. All non-internal non-string
# ones have a synonym of 'Is_' prefixed. (Internal properties begin with
# This separates out the non-characters from the other unassigneds, so
# can give different annotations for each.
$unassigned_sans_noncharacters = Range_List->new(
- Initialize => $gc->table('Unassigned')
- & property_ref('Noncharacter_Code_Point')->table('N'));
+ Initialize => $gc->table('Unassigned'));
+ if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
+ $unassigned_sans_noncharacters &= $nonchars->table('N');
+ }
for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
$i = populate_char_info($i); # Note sets $i so may cause skips
&& ($actual->property != $block || $prefix eq 'In_'))
{
print simple_fold(join_lines(<<END
-There is already an alias named $proposed_name (from " . $pre_existing . "),
-so not creating this alias for " . $actual
+There is already an alias named $proposed_name (from $pre_existing),
+so not creating this alias for $actual
END
), "", 4);
}
my $status_info = $input_table->status_info;
my $caseless_equivalent = $input_table->caseless_equivalent;
+ # Don't mention a placeholder equivalent as it isn't to be listed in the
+ # pod
+ $caseless_equivalent = 0 if $caseless_equivalent != 0
+ && $caseless_equivalent->fate > $ORDINARY;
+
my $entry_for_first_table; # The entry for the first table output.
# Almost certainly, it is the parent.
# First, gather all the info that applies to this table as a whole.
- push @zero_match_tables, $table if $count == 0;
+ push @zero_match_tables, $table if $count == 0
+ # Don't mention special tables
+ # as being zero length
+ && $table->fate == $ORDINARY;
my $table_property = $table->property;
Uppercase_Mapping uc()
Also, Case_Folding is accessible through the C</i> modifier in regular
-expressions.
+expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
+operator.
And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
interpolation in double-quoted strings and regular expressions; and functions
# Make a list of all combinations of properties/values that are suppressed.
my @suppressed;
- foreach my $property_name (keys %why_suppressed) {
+ if (! $debug_skip) { # This tends to fail in this debug mode
+ foreach my $property_name (keys %why_suppressed) {
- # Just the value
- my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
+ # Just the value
+ my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
- # The hash may contain properties not in this release of Unicode
- next unless defined (my $property = property_ref($property_name));
+ # The hash may contain properties not in this release of Unicode
+ next unless defined (my $property = property_ref($property_name));
- # Find all combinations
- foreach my $prop_alias ($property->aliases) {
- my $prop_alias_name = standardize($prop_alias->name);
+ # Find all combinations
+ foreach my $prop_alias ($property->aliases) {
+ my $prop_alias_name = standardize($prop_alias->name);
- # If no =value, there's just one combination possibe for this
- if (! $value_name) {
+ # If no =value, there's just one combination possibe for this
+ if (! $value_name) {
- # The property may be suppressed, but there may be a proxy for
- # it, so it shouldn't be listed as suppressed
- next if $prop_alias->ucd;
- push @suppressed, $prop_alias_name;
- }
- else { # Otherwise
- foreach my $value_alias ($property->table($value_name)->aliases)
- {
- next if $value_alias->ucd;
+ # The property may be suppressed, but there may be a proxy
+ # for it, so it shouldn't be listed as suppressed
+ next if $prop_alias->ucd;
+ push @suppressed, $prop_alias_name;
+ }
+ else { # Otherwise
+ foreach my $value_alias
+ ($property->table($value_name)->aliases)
+ {
+ next if $value_alias->ucd;
- push @suppressed, "$prop_alias_name="
- . standardize($value_alias->name);
+ push @suppressed, "$prop_alias_name="
+ . standardize($value_alias->name);
+ }
}
}
}
}
}
}
- elsif ($count == $MAX_UNICODE_CODEPOINTS) {
- if ($table == $property || $table->leader == $table) {
+ elsif ($count == $MAX_UNICODE_CODEPOINTS
+ && ($table == $property || $table->leader == $table)
+ && $table->property->status != $PLACEHOLDER)
+ {
Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
- }
}
if ($table->fate == $SUPPRESSED) {
make_UCD;
make_property_test_script() if $make_test_script;
+ make_normalization_test_script() if $make_norm_test_script;
return;
}
return;
}
+sub make_normalization_test_script() {
+ print "Making normalization test script\n" if $verbosity >= $PROGRESS;
+
+ my $n_path = 'TestNorm.pl';
+
+ unshift @normalization_tests, <<'END';
+use utf8;
+use Test::More;
+
+sub ord_string { # Convert packed ords to printable string
+ use charnames ();
+ return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
+ unpack "U*", shift) . "'";
+ #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
+}
+
+sub Test_N {
+ my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
+ my $display_source = ord_string($source);
+ my $display_nfc = ord_string($nfc);
+ my $display_nfd = ord_string($nfd);
+ my $display_nfkc = ord_string($nfkc);
+ my $display_nfkd = ord_string($nfkd);
+
+ use Unicode::Normalize;
+ # NFC
+ # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
+ # nfkc == toNFC(nfkc) == toNFC(nfkd)
+ #
+ # NFD
+ # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
+ # nfkd == toNFD(nfkc) == toNFD(nfkd)
+ #
+ # NFKC
+ # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
+ # toNFKC(nfkc) == toNFKC(nfkd)
+ #
+ # NFKD
+ # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
+ # toNFKD(nfkc) == toNFKD(nfkd)
+
+ is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
+ is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
+ is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
+ is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
+ is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
+
+ is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
+ is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
+ is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
+ is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
+ is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
+
+ is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
+ is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
+ is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
+ is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
+ is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
+
+ is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
+ is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
+ is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
+ is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
+ is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
+}
+END
+
+ &write($n_path,
+ 1, # Is utf8;
+ [
+ @normalization_tests,
+ 'done_testing();'
+ ]);
+ return;
+}
+
# This is a list of the input files and how to handle them. The files are
# processed in their order in this list. Some reordering is possible if
# desired, but the v0 files should be first, and the extracted before the
? \&filter_v6_ucd
: undef),
+ # Early versions did not have the
+ # proper Unicode_1 names for the
+ # controls
+ (($v_version lt v3.0.0)
+ ? \&filter_early_U1_names
+ : undef),
+
+ # Early versions did not correctly
+ # use the later method for giving
+ # decimal digit values
+ (($v_version le v3.2.0)
+ ? \&filter_bad_Nd_ucd
+ : undef),
+
# And the main filter
\&filter_UnicodeData_line,
],
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('SpecialCasing.txt', v2.1.8,
- Each_Line_Handler => \&filter_special_casing_line,
+ Each_Line_Handler => ($v_version eq 2.1.8)
+ ? \&filter_2_1_8_special_casing_line
+ : \&filter_special_casing_line,
Pre_Handler => \&setup_special_casing,
Has_Missings_Defaults => $IGNORED,
),
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
),
- Input_file->new("NormalizationTest.txt", v3.0.1,
- Skip => 'Validation Tests',
+ Input_file->new("NormTest.txt", v3.0.0,
+ Handler => \&process_NormalizationsTest,
+ Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Input_file->new('NamedSequences.txt', v4.1.0,
Handler => \&process_NamedSequences
),
- Input_file->new('NameAliases.txt', v5.0.0,
+ Input_file->new('NameAliases.txt', v0,
Property => 'Name_Alias',
Pre_Handler => ($v_version le v6.0.0)
? \&setup_early_name_alias