-makelist : Rewrite the file list $file_list based on current setup
-annotate : Output an annotation for each character in the table files;
useful for debugging mktables, looking at diffs; but is slow,
- memory intensive; resulting tables are usable but slow and
- very large.
+ memory intensive; resulting tables are usable but are slow and
+ very large (and currently fail the Unicode::UCD.t tests).
-check A B : Executes $0 only if A and B are the same
END
}
if $v_version ge v4.1.0;
push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
if $v_version ge v6.0.0;
+push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
+ if $v_version ge v6.1.0;
# The lists below are hashes, so the key is the item in the list, and the
# value is the reason why it is in the list. This makes generation of
my $UNCONDITIONALLY = 2; # Replace without conditions.
my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
# already there
-my $CROAK = 5; # Die with an error if is already there
+my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
+ # already there
+my $CROAK = 6; # Die with an error if is already there
# Flags to give property statuses. The phrases are to remind maintainers that
# if the flag is changed, the indefinite article referring to it in the
# its name
if ($seen_non_extracted_non_age) {
if ($file =~ /$EXTRACTED/i) {
- Carp::my_carp_bug(join_lines(<<END
+ Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
have subtle problems
|| @defaults > 2
|| ($default =~ /^</
&& $default !~ /^<code *point>$/i
- && $default !~ /^<none>$/i))
+ && $default !~ /^<none>$/i
+ && $default !~ /^<script>$/i))
{
$self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
}
elsif ($default =~ /^<code *point>$/i) {
$default = $CODE_POINT;
}
+ elsif ($default =~ /^<script>$/i) {
+
+ # Special case this one. Currently is from
+ # ScriptExtensions.txt, and means for all unlisted
+ # code points, use their Script property values.
+ # For the code points not listed in that file, the
+ # default value is 'Unknown'.
+ $default = "Unknown";
+ }
# Store them as a sub-arrays with both components.
push @{$missings{$addr}}, [ $default, $property ];
# either a constructor or a method. If called as a method, the result
# will be a new() instance of the calling object, containing the union
# of that object with the other parameter's code points; if called as
- # a constructor, the first parameter gives the class the new object
+ # a constructor, the first parameter gives the class that the new object
# should be, and the second parameter gives the code points to go into
# it.
# In either case, there are two parameters looked at by this routine;
# just a single code point.
#
# If they are ranges, this routine doesn't make any effort to preserve
- # the range values of one input over the other. Therefore this base
- # class should not allow _union to be called from other than
+ # the range values and types of one input over the other. Therefore
+ # this base class should not allow _union to be called from other than
# initialization code, so as to prevent two tables from being added
# together where the range values matter. The general form of this
# routine therefore belongs in a derived class, but it was moved here
# to avoid duplication of code. The failure to overload this in this
# class keeps it safe.
#
+ # It does make the effort during initialization to accept tables with
+ # multiple values for the same code point, and to preserve the order
+ # of these. If there is only one input range or range set, it doesn't
+ # sort (as it should already be sorted to the desired order), and will
+ # accept multiple values per code point. Otherwise it will merge
+ # multiple values into a single one.
my $self;
my @args; # Arguments to pass to the constructor
# Accumulate all records from both lists.
my @records;
+ my $input_count = 0;
for my $arg (@args) {
#local $to_trace = 0 if main::DEBUG;
trace "argument = $arg" if main::DEBUG && $to_trace;
Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
return;
}
+
$arg = [ $arg ] if ! ref $arg;
my $type = ref $arg;
if ($type eq 'ARRAY') {
foreach my $element (@$arg) {
push @records, Range->new($element, $element);
+ $input_count++;
}
}
elsif ($arg->isa('Range')) {
push @records, $arg;
+ $input_count++;
}
elsif ($arg->can('ranges')) {
push @records, $arg->ranges;
+ $input_count++;
}
else {
my $message = "";
# Sort with the range containing the lowest ordinal first, but if
# two ranges start at the same code point, sort with the bigger range
# of the two first, because it takes fewer cycles.
- @records = sort { ($a->start <=> $b->start)
+ if ($input_count > 1) {
+ @records = sort { ($a->start <=> $b->start)
or
# if b is shorter than a, b->end will be
# less than a->end, and we want to select
# a, so want to return -1
($b->end <=> $a->end)
} @records;
+ }
my $new = $class->new(@_);
for my $set (@records) {
my $start = $set->start;
my $end = $set->end;
- my $value = $set->value;
+ my $value = $set->value;
+ my $type = $set->type;
if ($start > $new->max) {
- $new->_add_delete('+', $start, $end, $value);
+ $new->_add_delete('+', $start, $end, $value, Type => $type);
}
elsif ($end > $new->max) {
- $new->_add_delete('+', $new->max +1, $end, $value);
+ $new->_add_delete('+', $new->max +1, $end, $value,
+ Type => $type);
+ }
+ elsif ($input_count == 1) {
+ # Here, overlaps existing range, but is from a single input,
+ # so preserve the multiple values from that input.
+ $new->_add_delete('+', $start, $end, $value, Type => $type,
+ Replace => $MULTIPLE_AFTER);
}
}
# new and old values are identical, the
# replacement is skipped to save cycles
# => $IF_NOT_EQUIVALENT means to replace the existing values
- # with this one if they are not equivalent.
+ # (the default) with this one if they are not equivalent.
# Ranges are equivalent if their types are the
# same, and they are the same string; or if
# both are type 0 ranges, if their Unicode
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
+ # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
+ # FIFO, so that this one is inserted after all
+ # others that currently exist.
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
# Here, we have taken care of the case where $replace is $NO.
# Remember that here, r[$i-1]->end < $start <= r[$i]->end
# If inserting a multiple record, this is where it goes, before the
- # first (if any) existing one. This implies an insertion, and no
- # change to any existing ranges. Note that $i can be -1 if this new
- # range doesn't actually duplicate any existing, and comes at the
- # beginning of the list.
- if ($replace == $MULTIPLE_BEFORE) {
+ # first (if any) existing one if inserting LIFO. (If this is to go
+ # afterwards, FIFO, we below move the pointer to there.) These imply
+ # an insertion, and no change to any existing ranges. Note that $i
+ # can be -1 if this new range doesn't actually duplicate any existing,
+ # and comes at the beginning of the list.
+ if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
if ($start != $end) {
Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
# pre-existing code point, which will again be a single code
# point range. Because 'i' likely will have changed as a
# result of these operations, we can't just continue on, but
- # do this operation recursively as well.
+ # do this operation recursively as well. If we are inserting
+ # LIFO, the pre-existing code point needs to go after the new
+ # one, so use MULTIPLE_AFTER; and vice versa.
if ($r->[$i]->start != $r->[$i]->end) {
$self->_add_delete('-', $start, $end, "");
$self->_add_delete('+', $start, $end, $value, Type => $type);
- return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE_BEFORE);
+ return $self->_add_delete('+',
+ $start, $end,
+ $existing_value,
+ Type => $existing_type,
+ Replace => ($replace == $MULTIPLE_BEFORE)
+ ? $MULTIPLE_AFTER
+ : $MULTIPLE_BEFORE);
+ }
+ }
+
+ # If to place this new record after, move to beyond all existing
+ # ones.
+ if ($replace == $MULTIPLE_AFTER) {
+ while ($i < @$r && $r->[$i]->start == $start) {
+ $i++;
}
}
return @return;
}
- # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
- # delete, insert, and replace either unconditionally or if not
+ # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
+ # leaves delete, insert, and replace either unconditionally or if not
# equivalent. $i still points to the first potential affected range.
# Now find the highest range affected, which will determine the length
# parameter to splice. (The input range can span multiple existing
$j--; # $j now points to the highest affected range.
trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
- # Here, have taken care of $NO and $MULTIPLE replaces.
+ # Here, have taken care of $NO and $MULTIPLE_foo replaces.
# $j points to the highest affected range. But it can be < $i or even
# -1. These happen only if the insertion is entirely in the gap
# between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
for my $try_hard (0, 1) {
# Look through all the ranges for a usable code point.
- for my $set ($self->ranges) {
+ for my $set (reverse $self->ranges) {
# Try the edge cases first, starting with the end point of the
# range.
my $self = shift;
my $code_point = shift;
my $value = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ my %args = @_;
+ my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
+ Carp::carp_extra_args(\%args) if main::DEBUG && %args;
return $self->add_map($code_point, $code_point,
- $value, Replace => $MULTIPLE_BEFORE);
+ $value, Replace => $replace);
}
} # End of closure for package Range_Map
my %format;
# The format of the entries of the table. This is calculated from the
# data in the table (or passed in the constructor). This is an enum e.g.,
- # $STRING_FORMAT
+ # $STRING_FORMAT. It is marked protected as it should not be generally
+ # used to override calculations.
main::set_access('format', \%format, 'r', 'p_s');
sub new {
if ($annotate) {
- # if annotating each code point, must print 1 per line.
+ # If annotating each code point, must print 1 per line.
# The variable could point to a subroutine, and we don't want
# to lose that fact, so only set if not set already
$range_size_1 = 1 if ! $range_size_1;
next RANGE if defined $suppress_value
&& $value eq $suppress_value;
+ {
+
# If there is a range and doesn't need a single point range
# output
if ($start != $end && ! $range_size_1) {
$count);
$OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
}
- next RANGE;
}
- # Here to output a single code point per line
-
- # If not to annotate, use the simple formats
- if (! $annotate) {
+ # Here to output a single code point per line.
+ # If not to annotate, use the simple formats
+ elsif (! $annotate) {
# Use any passed in subroutine to output.
if (ref $range_size_1 eq 'CODE') {
push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
}
}
- next RANGE;
}
+ else {
# Here, wants annotation.
for (my $i = $start; $i <= $end; $i++) {
$OUT[-1] .= "\n";
}
}
+ }
+ }
} # End of loop through all the table's ranges
}
return unless defined $name;
if (defined $swash_keys{$name}) {
- Carp::my_carp(join_lines(<<END
+ Carp::my_carp(main::join_lines(<<END
Already created a swash name '$name' for $swash_keys{$name}. This means that
the same name desired for $self shouldn't be used. Bad News. This must be
fixed before production use, but proceeding anyway
\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
END
if ($specials_name) {
- $return .= <<END;
+ $return .= <<END;
\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
END
}
if $format eq $FLOAT_FORMAT
&& $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
$format = $HEX_FORMAT
- if $format eq $RATIONAL_FORMAT
- && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
+ if ($format eq $RATIONAL_FORMAT
+ && $map !~
+ m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
+ # Assume a leading zero means hex,
+ # even if all digits are 0-9
+ || ($format eq $INTEGER_FORMAT
+ && $map =~ /^0/);
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
}
set_default_map
set_file_path
set_final_comment
+ _set_format
set_range_size_1
set_status
set_to_output_map
}
}
- # This entry is still missing as of 6.0, perhaps because no short name for
- # it.
- if (-e 'NameAliases.txt') {
- my $aliases = property_ref('Name_Alias');
- if (! defined $aliases) {
- $aliases = Property->new('Name_Alias');
- }
- }
-
# These are used so much, that we set globals for them.
$gc = property_ref('General_Category');
$block = property_ref('Block');
# Perl adds this alias.
$gc->add_alias('Category');
- # For backwards compatibility, these property files have particular names.
- property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
- # utf8.c calls it
- property_ref('Lowercase_Mapping')->set_file('Lower');
- property_ref('Titlecase_Mapping')->set_file('Title');
-
- my $fold = property_ref('Case_Folding');
- $fold->set_file('Fold') if defined $fold;
-
# Unicode::Normalize expects this file with this name and directory.
my $ccc = property_ref('Canonical_Combining_Class');
if (defined $ccc) {
my $input_field_count = $i;
# This routine in addition outputs these extra fields:
+
my $DECOMP_TYPE = $i++; # Decomposition type
# These fields are modifications of ones above, and are usually
Range_Size_1 => \&output_perl_charnames_line,
Type => $STRING,
);
- $perl_charname->set_proxy_for('Name', 'Name_Alias');
+ $perl_charname->set_proxy_for('Name');
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
Directory => File::Spec->curdir(),
my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
Default_Map => "",
Perl_Extension => 1,
- File => 'Digit', # Trad. location
Directory => $map_directory,
Type => $STRING,
Range_Size_1 => 1,
# simple ones are in UnicodeData.txt, which should already have been
# read in to the full property data structures, so as to initialize
# these with the simple ones. Then the SpecialCasing.txt entries
- # overwrite the ones which have different full mappings.
+ # add or overwrite the ones which have different full mappings.
# This routine sees if the simple mappings are to be output, and if
# so, copies what has already been put into the full mapping tables,
# relatively few entries in them that have different full mappings,
# and thus skip the simple mapping tables altogether.
- # New tables with just the simple mappings that are overridden by the
- # full ones are constructed. These are for Unicode::UCD, which
- # requires the simple mappings. The Case_Folding table is a combined
- # table of both the simple and full mappings, with the full ones being
- # in the hash, and the simple ones, even those overridden by the hash,
- # being in the base table. That same mechanism could have been
- # employed here, except that the docs have said that the generated
- # files are usuable directly by programs, so we dare not change the
- # format in any way.
-
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
$uc = property_ref('uc');
# For each of the case change mappings...
- foreach my $case_table ($lc, $tc, $uc) {
- my $case = $case_table->name;
- my $full = property_ref($case);
- unless (defined $full && ! $full->is_empty) {
+ foreach my $full_table ($lc, $tc, $uc) {
+ my $full_name = $full_table->name;
+ unless (defined $full_table && ! $full_table->is_empty) {
Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
}
- # The simple version's name in each mapping merely has an 's' in
- # front of the full one's
- my $simple_name = 's' . $case;
- my $simple = property_ref($simple_name);
- $simple->initialize($full) if $simple->to_output_map();
+ # Create a table in the old-style format and with the original
+ # file name for backwards compatibility with applications that
+ # read it directly. The new tables contain both the simple and
+ # full maps, and the old are missing simple maps when there is a
+ # conflicting full one. Probably it would have been ok to add
+ # those to the legacy version, as was already done in 5.14 to the
+ # case folding one, but this was not done, out of an abundance of
+ # caution. The tables are set up here before we deal with the
+ # full maps so that as we handle those, we can override the simple
+ # maps for them in the legacy table, and merely add them in the
+ # new-style one.
+ my $legacy = Property->new("Legacy_" . $full_table->full_name,
+ File => $full_table->full_name =~
+ s/case_Mapping//r,
+ Range_Size_1 => 1,
+ Format => $HEX_FORMAT,
+ Default_Map => $CODE_POINT,
+ UCD => 0,
+ Initialize => $full_table,
+ );
- my $simple_only = Property->new("_s$case",
- Type => $STRING,
- Default_Map => $CODE_POINT,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
- $simple_only->set_to_output_map($INTERNAL_MAP);
- $simple_only->add_comment(join_lines( <<END
-This file is for UCD.pm so that it can construct simple mappings that would
-otherwise be lost because they are overridden by full mappings.
+ $full_table->add_comment(join_lines( <<END
+This file includes both the simple and full case changing maps. The simple
+ones are in the main body of the table below, and the full ones adding to or
+overriding them are in the hash.
END
));
+ # The simple version's name in each mapping merely has an 's' in
+ # front of the full one's
+ my $simple_name = 's' . $full_name;
+ my $simple = property_ref($simple_name);
+ $simple->initialize($full_table) if $simple->to_output_map();
+
unless ($simple->to_output_map()) {
- $simple_only->set_proxy_for($simple_name);
+ $full_table->set_proxy_for($simple_name);
}
}
return;
}
- $_ = "$fields[0]; lc; $fields[1]";
- $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
- $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
+ my $decimal_code_point = hex $fields[0];
- # Copy any simple case change to the special tables constructed if
- # being overridden by a multi-character case change.
- if ($fields[1] ne $fields[0]
- && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _slc; $value");
- }
- if ($fields[2] ne $fields[0]
- && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _stc; $value");
- }
- if ($fields[3] ne $fields[0]
- && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _suc; $value");
+ # Loop to handle each of the three mappings in the input line, in
+ # order, with $i indicating the current field number.
+ my $i = 0;
+ for my $object ($lc, $tc, $uc) {
+ $i++; # First time through, $i = 0 ... 3rd time = 3
+
+ my $value = $object->value_of($decimal_code_point);
+ $value = ($value eq $CODE_POINT)
+ ? $decimal_code_point
+ : hex $value;
+
+ # If this isn't a multi-character mapping, it should already have
+ # been read in.
+ if ($fields[$i] !~ / /) {
+ if ($value != hex $fields[$i]) {
+ Carp::my_carp("Bad news. UnicodeData.txt thinks "
+ . $object->name
+ . "(0x$fields[0]) is $value"
+ . " and SpecialCasing.txt thinks it is "
+ . hex $fields[$i]
+ . ". Good luck. Proceeding anyway.");
+ }
+ }
+ else {
+
+ # The mapping goes into both the legacy table, in which it
+ # replaces the simple one...
+ $file->insert_adjusted_lines("$fields[0]; Legacy_"
+ . $object->full_name
+ . "; $fields[$i]");
+
+ # ... and, the The regular table, in which it is additional,
+ # beyond the simple mapping.
+ $file->insert_adjusted_lines("$fields[0]; "
+ . $object->name
+ . "; "
+ . $CMD_DELIM
+ . "$REPLACE_CMD=$MULTIPLE_BEFORE"
+ . $CMD_DELIM
+ . $fields[$i]);
+ }
}
+ # Everything has been handled by the insert_adjusted_lines()
+ $_ = "";
+
return;
}
}
# Create the map for simple only if are going to output it, for otherwise
# it takes no part in anything we do.
my $to_output_simple;
+ my $non_final_folds;
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
}
+ $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
+ );
+
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
property_ref('Case_Folding')->add_comment(join_lines( <<END
if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
$_ = "$range; Case_Folding; "
. "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
+ if ($type eq 'F') {
+ my @string = split " ", $map;
+ for my $i (0 .. @string - 1 -1) {
+ $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
+ }
+ }
}
else {
$_ = "";
# The Script_Extensions property starts out with a clone of the Script
# property.
- my $sc = property_ref("Script");
- my $scx = Property->new("scx", Full_Name => "Script_Extensions",
- Initialize => $sc,
- Default_Map => $sc->default_map,
- Pre_Declared_Maps => 0,
- Format => $STRING_WHITE_SPACE_LIST,
- );
+ my $scx = property_ref("Script_Extensions");
+ $scx = Property->new("scx", Full_Name => "Script_Extensions")
+ if ! defined $scx;
+ $scx->_set_format($STRING_WHITE_SPACE_LIST);
+ $scx->initialize($script);
+ $scx->set_default_map($script->default_map);
+ $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
$scx->add_comment(join_lines( <<END
The values for code points that appear in one script are just the same as for
the 'Script' property. Likewise the values for those that appear in many
END
));
- # Make the scx's tables and aliases for them the same as sc's
- foreach my $table ($sc->tables) {
+ # Initialize scx's tables and the aliases for them to be the same as sc's
+ foreach my $table ($script->tables) {
my $scx_table = $scx->add_match_table($table->name,
Full_Name => $table->full_name);
foreach my $alias ($table->aliases) {
return;
}
-sub setup_v6_name_alias {
- property_ref('Name_Alias')->add_map(7, 7, "ALERT");
+sub setup_early_name_alias {
+ my $aliases = property_ref('Name_Alias');
+ $aliases = Property->new('Name_Alias') if ! defined $aliases;
+
+ # Before 6.0, this wasn't a problem, and after it, this alias is part of
+ # the Unicode-delivered file.
+ $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
+ return;
+}
+
+sub filter_later_version_name_alias_line {
+
+ # This file has an extra entry per line for the alias type. This is
+ # handled by creating a compound entry: "$alias: $type"; First, split
+ # the line into components.
+ my ($range, $alias, $type, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+
+ # This file contains multiple entries for some components, so tell the
+ # downstream code to allow this in our internal tables; the
+ # $MULTIPLE_AFTER preserves the input ordering.
+ $_ = join ";", $range, $CMD_DELIM
+ . $REPLACE_CMD
+ . '='
+ . $MULTIPLE_AFTER
+ . $CMD_DELIM
+ . "$alias: $type",
+ @remainder;
+ return;
+}
+
+sub filter_early_version_name_alias_line {
+
+ # Early versions did not have the trailing alias type field; implicitly it
+ # was 'correction'
+ $_ .= "; correction";
+ filter_later_version_name_alias_line;
+ return;
}
sub finish_Unicode() {
# 3) Calculates all the regular expression match tables based on the
# mappings.
# 3) Calculates and adds the tables which are defined by Unicode, but
- # which aren't derived by them
+ # which aren't derived by them, and certain derived tables that Perl
+ # uses.
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
Lowercase_Mapping
Titlecase_Mapping
Case_Folding
- } ) {
+ } )
+ {
my $full = property_ref($map);
if ($full->is_empty) {
my $simple = property_ref('Simple_' . $map);
}
}
+ # Create digit and case fold tables with the original file names for
+ # backwards compatibility with applications that read them directly.
+ my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
+ Default_Map => "",
+ Perl_Extension => 1,
+ File => 'Digit', # Trad. location
+ Directory => $map_directory,
+ UCD => 0,
+ Type => $STRING,
+ Range_Size_1 => 1,
+ Initialize => property_ref('Perl_Decimal_Digit'),
+ );
+ $Digit->add_comment(join_lines(<<END
+This file gives the mapping of all code points which represent a single
+decimal digit [0-9] to their respective digits. For example, the code point
+U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
+that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
+numerals.
+END
+ ));
+
+ Property->new('Legacy_Case_Folding',
+ File => "Fold",
+ Directory => $map_directory,
+ Default_Map => $CODE_POINT,
+ UCD => 0,
+ Range_Size_1 => 1,
+ Type => $STRING,
+ Format => $HEX_FORMAT,
+ Initialize => property_ref('cf'),
+ );
+
# The Script_Extensions property started out as a clone of the Script
# property. But processing its data file caused some elements to be
# replaced with different data. (These elements were for the Common and
my $alias = property_ref('Name_Alias');
if (defined $alias) {
push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+ my $unicode_1 = property_ref('Unicode_1_Name');
+ my %abbreviations;
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type.
+ # Corrections go before said entry, as they should be returned in
+ # preference over the existing entry. (A correction to a correction
+ # should be later in the Name_Alias table, so it will correctly
+ # precede the erroneous correction in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved
+ # temporarily in a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that.
+ # (Most of them will be the same anyway, in which case we don't add a
+ # duplicate)
+
$alias->reset_each_range;
while (my ($range) = $alias->each_range) {
next if $range->value eq "";
- if ($range->start != $range->end) {
- Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
+ }
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
+ }
+ elsif ($type eq 'control') {
+ my $unicode_1_value = $unicode_1->value_of($code_point);
+ next if $unicode_1_value eq $value;
+ $replace_type = $MULTIPLE_AFTER;
}
- $perl_charname->add_duplicate($range->start, $range->value);
+ else {
+ $replace_type = $MULTIPLE_AFTER;
+ }
+
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
}
$alias_sentence = <<END;
-The Name_Alias property adds duplicate code point entries with a corrected
-name. The original (less correct, but still valid) name will be physically
-last.
+The Name_Alias property adds duplicate code point entries that are
+alternatives to the original name. If an addition is a corrected
+name, it will be physically first in the table. The original (less correct,
+but still valid) name will be next; then any alternatives, in no particular
+order; and finally any abbreviations, again in no particular order.
END
}
+
my $comment;
if (@composition <= 2) { # Always at least 2
$comment = join " and ", @composition;
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
-Unicode_1_Name entries are used only for otherwise nameless code
-points.
+Unicode_1_Name entries are used only for nameless code points in the Name
+property.
$alias_sentence
This file doesn't include the algorithmically determinable names. For those,
use 'unicore/Name.pm'
|| $ucd_pod{$standard}{'perl_extension'} == $perl_extension
|| $output_this == $perl_extension)
{
- Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway.");
+ Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
}
# We modifiy the info column of the one being output to
# others except DAge.txt (as data in an extracted file can be over-ridden by
# the non-extracted. Some other files depend on data derived from an earlier
# file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode. Mostly, it safest to order by first
+# folding requires data from Unicode. Mostly, it is safest to order by first
# version releases in (except the Jamo). DAge.txt is read before the
# extracted ones because of the rarely used feature $compare_versions. In the
# unlikely event that there were ever an extracted file that contained the Age
),
Input_file->new('NameAliases.txt', v5.0.0,
Property => 'Name_Alias',
- Pre_Handler => ($v_version ge v6.0.0)
- ? \&setup_v6_name_alias
+ Pre_Handler => ($v_version le v6.0.0)
+ ? \&setup_early_name_alias
: undef,
+ Each_Line_Handler => ($v_version le v6.0.0)
+ ? \&filter_early_version_name_alias_line
+ : \&filter_later_version_name_alias_line,
),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => 'Validation Tests',
Property => 'Script_Extensions',
Pre_Handler => \&setup_script_extensions,
Each_Line_Handler => \&filter_script_extensions_line,
+ Has_Missings_Defaults => (($v_version le v6.0.0)
+ ? $NO_DEFAULTS
+ : $IGNORED),
),
# The two Indic files are actually available starting in v6.0.0, but their
# property values are missing from PropValueAliases.txt in that release,