use File::Path;
use File::Spec;
use Text::Tabs;
+use re "/aa";
sub DEBUG () { 0 } # Set to 0 for production; 1 for development
my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
-makelist : Rewrite the file list $file_list based on current setup
-annotate : Output an annotation for each character in the table files;
useful for debugging mktables, looking at diffs; but is slow,
- memory intensive; resulting tables are usable but slow and
- very large.
+ memory intensive; resulting tables are usable but are slow and
+ very large (and currently fail the Unicode::UCD.t tests).
-check A B : Executes $0 only if A and B are the same
END
}
if $v_version ge v4.1.0;
push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
if $v_version ge v6.0.0;
+push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
+ if $v_version ge v6.1.0;
+push @tables_that_may_be_empty, '_stc';
# The lists below are hashes, so the key is the item in the list, and the
# value is the reason why it is in the list. This makes generation of
# existence is not noted in the comment.
'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+ 'Indic_Matra_Category' => "Provisional",
+ 'Indic_Syllabic_Category' => "Provisional",
+
# Don't suppress ISO_Comment, as otherwise special handling is needed
# to differentiate between it and gc=c, which can be written as 'isc',
# which is the same characters as ISO_Comment's short name.
- 'Name' => "Accessible via 'use charnames;' or Unicode::UCD::prop_invmap()",
+ 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
);
- # The following are suppressed because they were made contributory or
- # deprecated by Unicode before Perl ever thought about supporting them.
- foreach my $property ('Jamo_Short_Name',
- 'Grapheme_Link',
- 'Expands_On_NFC',
- 'Expands_On_NFD',
- 'Expands_On_NFKC',
- 'Expands_On_NFKD'
+ foreach my $property (
+
+ # The following are suppressed because they were made contributory
+ # or deprecated by Unicode before Perl ever thought about
+ # supporting them.
+ 'Jamo_Short_Name',
+ 'Grapheme_Link',
+ 'Expands_On_NFC',
+ 'Expands_On_NFD',
+ 'Expands_On_NFKC',
+ 'Expands_On_NFKD',
+
+ # The following are suppressed because they have been marked
+ # as deprecated for a sufficient amount of time
+ 'Other_Alphabetic',
+ 'Other_Default_Ignorable_Code_Point',
+ 'Other_Grapheme_Extend',
+ 'Other_ID_Continue',
+ 'Other_ID_Start',
+ 'Other_Lowercase',
+ 'Other_Math',
+ 'Other_Uppercase',
) {
$why_suppressed{$property} = $why_deprecated{$property};
}
'ReadMe.txt' => 'Documentation',
'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
- 'IndicMatraCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
- 'IndicSyllabicCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
);
+my %skipped_files; # List of files that we skip
+
### End of externally interesting definitions, except for @input_file_objects
my $HEADER=<<"EOF";
my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
# the comments at the subroutine definition.
my $UNCONDITIONALLY = 2; # Replace without conditions.
-my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
+my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
+ # already there
+my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
# already there
-my $CROAK = 5; # Die with an error if is already there
+my $CROAK = 6; # Die with an error if is already there
# Flags to give property statuses. The phrases are to remind maintainers that
# if the flag is changed, the indefinite article referring to it in the
# contrast to the non_skip element, which is supposed to be used very
# temporarily for debugging. Sets 'optional' to 1. Also, files that we
# pretty much will never look at can be placed in the global
- # %ignored_files instead. Ones used here will be added to that list.
+ # %ignored_files instead. Ones used here will be added to %skipped files
main::set_access('skip', \%skip, 'c');
my %each_line_handler;
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $ignored_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr}
}
return $self;
# its name
if ($seen_non_extracted_non_age) {
if ($file =~ /$EXTRACTED/i) {
- Carp::my_carp_bug(join_lines(<<END
+ Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
have subtle problems
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
my $fkey = File::Spec->rel2abs($file);
- my $expecting = delete $potential_files{$fkey};
- $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
+ my $expecting = delete $potential_files{lc($fkey)};
+
Carp::my_carp("Was not expecting '$file'.") if
! $expecting
&& ! defined $handle{$addr};
|| @defaults > 2
|| ($default =~ /^</
&& $default !~ /^<code *point>$/i
- && $default !~ /^<none>$/i))
+ && $default !~ /^<none>$/i
+ && $default !~ /^<script>$/i))
{
$self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
}
elsif ($default =~ /^<code *point>$/i) {
$default = $CODE_POINT;
}
+ elsif ($default =~ /^<script>$/i) {
+
+ # Special case this one. Currently is from
+ # ScriptExtensions.txt, and means for all unlisted
+ # code points, use their Script property values.
+ # For the code points not listed in that file, the
+ # default value is 'Unknown'.
+ $default = "Unknown";
+ }
# Store them as a sub-arrays with both components.
push @{$missings{$addr}}, [ $default, $property ];
# they don't appear in documentation). Enum
main::set_access('status', \%status, 'r');
- my %externally_ok;
+ my %ok_as_filename;
# Similarly, some aliases should not be considered as usable ones for
# external use, such as file names, or we don't want documentation to
# recommend them. Boolean
- main::set_access('externally_ok', \%externally_ok, 'r');
+ main::set_access('ok_as_filename', \%ok_as_filename, 'r');
sub new {
my $class = shift;
$name{$addr} = shift;
$loose_match{$addr} = shift;
$make_re_pod_entry{$addr} = shift;
- $externally_ok{$addr} = shift;
+ $ok_as_filename{$addr} = shift;
$status{$addr} = shift;
$ucd{$addr} = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Null names are never ok externally
- $externally_ok{$addr} = 0 if $name{$addr} eq "";
+ $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
return $self;
}
# style when the pre-existing and replacement
# standard forms are the same, we can move to
# the modern style
- # => $MULTIPLE means that if this range duplicates an
+ # => $MULTIPLE_BEFORE means that if this range duplicates an
# existing one, but has a different value,
# don't replace the existing one, but insert
# this, one so that the same range can occur
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
+ # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
+ # FIFO, so that this one is inserted after all
+ # others that currently exist.
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
# Here, we have taken care of the case where $replace is $NO.
# Remember that here, r[$i-1]->end < $start <= r[$i]->end
# If inserting a multiple record, this is where it goes, before the
- # first (if any) existing one. This implies an insertion, and no
- # change to any existing ranges. Note that $i can be -1 if this new
- # range doesn't actually duplicate any existing, and comes at the
- # beginning of the list.
- if ($replace == $MULTIPLE) {
+ # first (if any) existing one if inserting LIFO. (If this is to go
+ # afterwards, FIFO, we below move the pointer to there.) These imply
+ # an insertion, and no change to any existing ranges. Note that $i
+ # can be -1 if this new range doesn't actually duplicate any existing,
+ # and comes at the beginning of the list.
+ if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
if ($start != $end) {
Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
return;
}
- # Don't add an exact duplicate, as it isn't really a multiple
+ # If the new code point is within a current range ...
if ($end >= $r->[$i]->start) {
+
+ # Don't add an exact duplicate, as it isn't really a multiple
my $existing_value = $r->[$i]->value;
my $existing_type = $r->[$i]->type;
return if $value eq $existing_value && $type eq $existing_type;
# pre-existing code point, which will again be a single code
# point range. Because 'i' likely will have changed as a
# result of these operations, we can't just continue on, but
- # do this operation recursively as well.
+ # do this operation recursively as well. If we are inserting
+ # LIFO, the pre-existing code point needs to go after the new
+ # one, so use MULTIPLE_AFTER; and vice versa.
if ($r->[$i]->start != $r->[$i]->end) {
$self->_add_delete('-', $start, $end, "");
$self->_add_delete('+', $start, $end, $value, Type => $type);
- return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE);
+ return $self->_add_delete('+',
+ $start, $end,
+ $existing_value,
+ Type => $existing_type,
+ Replace => ($replace == $MULTIPLE_BEFORE)
+ ? $MULTIPLE_AFTER
+ : $MULTIPLE_BEFORE);
+ }
+ }
+
+ # If to place this new record after, move to beyond all existing
+ # ones.
+ if ($replace == $MULTIPLE_AFTER) {
+ while ($i < @$r && $r->[$i]->start == $start) {
+ $i++;
}
}
return @return;
}
- # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
- # delete, insert, and replace either unconditionally or if not
+ # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
+ # leaves delete, insert, and replace either unconditionally or if not
# equivalent. $i still points to the first potential affected range.
# Now find the highest range affected, which will determine the length
# parameter to splice. (The input range can span multiple existing
$j--; # $j now points to the highest affected range.
trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
- # Here, have taken care of $NO and $MULTIPLE replaces.
+ # Here, have taken care of $NO and $MULTIPLE_foo replaces.
# $j points to the highest affected range. But it can be < $i or even
# -1. These happen only if the insertion is entirely in the gap
# between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
for my $try_hard (0, 1) {
# Look through all the ranges for a usable code point.
- for my $set ($self->ranges) {
+ for my $set (reverse $self->ranges) {
# Try the edge cases first, starting with the end point of the
# range.
my $self = shift;
my $code_point = shift;
my $value = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ my %args = @_;
+ my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
+ Carp::carp_extra_args(\%args) if main::DEBUG && %args;
return $self->add_map($code_point, $code_point,
- $value, Replace => $MULTIPLE);
+ $value, Replace => $replace);
}
} # End of closure for package Range_Map
my %format;
# The format of the entries of the table. This is calculated from the
# data in the table (or passed in the constructor). This is an enum e.g.,
- # $STRING_FORMAT
+ # $STRING_FORMAT. It is marked protected as it should not be generally
+ # used to override calculations.
main::set_access('format', \%format, 'r', 'p_s');
sub new {
# All arguments are key => value pairs, which you can see below, most
# of which match fields documented above. Otherwise: Re_Pod_Entry,
- # Externally_Ok, and Fuzzy apply to the names of the table, and are
+ # OK_as_Filename, and Fuzzy apply to the names of the table, and are
# documented in the Alias package
return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
my $ucd = delete $args{'UCD'};
my $description = delete $args{'Description'};
- my $externally_ok = delete $args{'Externally_Ok'};
+ my $ok_as_filename = delete $args{'OK_as_Filename'};
my $loose_match = delete $args{'Fuzzy'};
my $note = delete $args{'Note'};
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
# clarity. Other routines rely on the full name being first on the
# list
$self->add_alias($full_name{$addr},
- Externally_Ok => $externally_ok,
+ OK_as_Filename => $ok_as_filename,
Fuzzy => $loose_match,
Re_Pod_Entry => $make_re_pod_entry,
Status => $status{$addr},
# Then comes the other name, if meaningfully different.
if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
$self->add_alias($name{$addr},
- Externally_Ok => $externally_ok,
+ OK_as_Filename => $ok_as_filename,
Fuzzy => $loose_match,
Re_Pod_Entry => $make_re_pod_entry,
Status => $status{$addr},
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
$make_re_pod_entry = $YES unless defined $make_re_pod_entry;
- my $externally_ok = delete $args{'Externally_Ok'};
- $externally_ok = 1 unless defined $externally_ok;
+ my $ok_as_filename = delete $args{'OK_as_Filename'};
+ $ok_as_filename = 1 unless defined $ok_as_filename;
my $status = delete $args{'Status'};
$status = $NORMAL unless defined $status;
$insert_position,
0,
Alias->new($name, $loose_match, $make_re_pod_entry,
- $externally_ok, $status, $ucd);
+ $ok_as_filename, $status, $ucd);
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
foreach my $alias ($self->aliases()) {
# Don't use an alias that isn't ok to use for an external name.
- next if ! $alias->externally_ok;
+ next if ! $alias->ok_as_filename;
my $name = main::Standardize($alias->name);
trace $self, $name if main::DEBUG && $to_trace;
my $return = $self->SUPER::header();
- $return .= $INTERNAL_ONLY_HEADER if $self->to_output_map == $INTERNAL_MAP;
+ if ($self->to_output_map == $INTERNAL_MAP) {
+ $return .= $INTERNAL_ONLY_HEADER;
+ }
+ else {
+ my $property_name = $self->property->full_name;
+ $return .= <<END;
+
+# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
+
+# This file is for internal use by core Perl only. It is retained for
+# backwards compatibility with applications that may have come to rely on it,
+# but its format and even its name or existence are subject to change without
+# notice in a future Perl version. Don't use it directly. Instead, its
+# contents are now retrievable through a stable API in the Unicode::UCD
+# module: Unicode::UCD::prop_invmap('$property_name').
+END
+ }
return $return;
}
# have our own flag for just this purpose; but it works now to exclude
# Perl generated synonyms from the lists for properties, where the
# name is always the proper Unicode one.
- my @property_aliases = grep { $_->externally_ok } $self->aliases;
+ my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
my $count = $self->count;
my $default_map = $default_map{$addr};
return unless defined $name;
if (defined $swash_keys{$name}) {
- Carp::my_carp(join_lines(<<END
+ Carp::my_carp(main::join_lines(<<END
Already created a swash name '$name' for $swash_keys{$name}. This means that
the same name desired for $self shouldn't be used. Bad News. This must be
fixed before production use, but proceeding anyway
if $format eq $FLOAT_FORMAT
&& $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
$format = $HEX_FORMAT
- if $format eq $RATIONAL_FORMAT
- && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
+ if ($format eq $RATIONAL_FORMAT
+ && $map !~
+ m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
+ # Assume a leading zero means hex,
+ # even if all digits are 0-9
+ || ($format eq $INTEGER_FORMAT
+ && $map =~ /^0/);
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
}
set_default_map
set_file_path
set_final_comment
+ _set_format
set_range_size_1
set_status
set_to_output_map
{ # Closure
- my $indent_increment = " " x 2;
+ my $indent_increment = " " x (($debugging_build) ? 2 : 0);
my %already_output;
$main::simple_dumper_nesting = 0;
my $item = shift;
my $indent = shift;
- $indent = "" if ! defined $indent;
+ $indent = "" if ! $debugging_build || ! defined $indent;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Indent array elements one level
$output .= &simple_dumper($item->[$i], $next_indent);
+ next if ! $debugging_build;
$output =~ s/\n$//; # Remove any trailing nl so
$output .= " # [$i]\n"; # as to add a comment giving
# the array index
}
}
- # This entry is still missing as of 6.0, perhaps because no short name for
- # it.
- if (-e 'NameAliases.txt') {
- my $aliases = property_ref('Name_Alias');
- if (! defined $aliases) {
- $aliases = Property->new('Name_Alias');
- }
- }
-
# These are used so much, that we set globals for them.
$gc = property_ref('General_Category');
$block = property_ref('Block');
# the code point and name on each line. This was actually the hardest
# thing to design around. The code points in those ranges may actually
# have real maps not given by these two lines. These maps will either
- # be algorithmically determinable, or in the extracted files furnished
+ # be algorithmically determinable, or be in the extracted files furnished
# with the UCD. In the event of conflicts between these extracted files,
# and this one, Unicode says that this one prevails. But it shouldn't
# prevail for conflicts that occur in these ranges. The data from the
Range_Size_1 => \&output_perl_charnames_line,
Type => $STRING,
);
- $perl_charname->set_proxy_for('Name', 'Name_Alias');
+ $perl_charname->set_proxy_for('Name');
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
Directory => File::Spec->curdir(),
return;
}
- $_ = "$fields[0]; lc; $fields[1]";
- $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
- $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
+ my $decimal_code_point = hex $fields[0];
- # Copy any simple case change to the special tables constructed if
- # being overridden by a multi-character case change.
- if ($fields[1] ne $fields[0]
- && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _slc; $value");
- }
- if ($fields[2] ne $fields[0]
- && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _stc; $value");
- }
- if ($fields[3] ne $fields[0]
- && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _suc; $value");
+ # Loop to handle each of the three mappings in the input line, in
+ # order, with $i indicating the current field number.
+ my $i = 0;
+ for my $object ($lc, $tc, $uc) {
+ $i++; # First time through, $i = 0 ... 3rd time = 3
+
+ my $value = $object->value_of($decimal_code_point);
+ $value = ($value eq $CODE_POINT)
+ ? $decimal_code_point
+ : hex $value;
+
+ # If this isn't a multi-character mapping, it should already have
+ # been read in.
+ if ($fields[$i] !~ / /) {
+ if ($value != hex $fields[$i]) {
+ Carp::my_carp("Bad news. UnicodeData.txt thinks "
+ . $object->name
+ . "(0x$fields[0]) is $value"
+ . " and SpecialCasing.txt thinks it is "
+ . hex $fields[$i]
+ . ". Good luck. Proceeding anyway.");
+ }
+ }
+ else {
+ $file->insert_adjusted_lines("$fields[0]; "
+ . $object->full_name
+ . "; $fields[$i]");
+
+ # Copy any simple case change to the special tables
+ # constructed if being overridden by a multi-character case
+ # change.
+ if ($value != $decimal_code_point) {
+ $file->insert_adjusted_lines(sprintf("%s; _s%s; %04X",
+ $fields[0],
+ $object->name,
+ $value));
+ }
+ }
}
+ # Everything has been handled by the insert_adjusted_lines()
+ $_ = "";
+
return;
}
}
# Create the map for simple only if are going to output it, for otherwise
# it takes no part in anything we do.
my $to_output_simple;
+ my $non_final_folds;
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
}
+ $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Description => "Code points that particpate in a multi-char fold not in the final position",
+ );
+
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
property_ref('Case_Folding')->add_comment(join_lines( <<END
# so that _swash_inversion_hash() is able to construct closures
# without having to worry about F mappings.
if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
- $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
+ $_ = "$range; Case_Folding; "
+ . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
+ if ($type eq 'F') {
+ my @string = split " ", $map;
+ for my $i (0 .. @string - 1 -1) {
+ $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
+ }
+ }
}
else {
$_ = "";
# The Script_Extensions property starts out with a clone of the Script
# property.
- my $sc = property_ref("Script");
- my $scx = Property->new("scx", Full_Name => "Script_Extensions",
- Initialize => $sc,
- Default_Map => $sc->default_map,
- Pre_Declared_Maps => 0,
- Format => $STRING_WHITE_SPACE_LIST,
- );
+ my $scx = property_ref("Script_Extensions");
+ $scx = Property->new("scx", Full_Name => "Script_Extensions")
+ if ! defined $scx;
+ $scx->_set_format($STRING_WHITE_SPACE_LIST);
+ $scx->initialize($script);
+ $scx->set_default_map($script->default_map);
+ $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
$scx->add_comment(join_lines( <<END
The values for code points that appear in one script are just the same as for
the 'Script' property. Likewise the values for those that appear in many
END
));
- # Make the scx's tables and aliases for them the same as sc's
- foreach my $table ($sc->tables) {
+ # Initialize scx's tables and the aliases for them to be the same as sc's
+ foreach my $table ($script->tables) {
my $scx_table = $scx->add_match_table($table->name,
Full_Name => $table->full_name);
foreach my $alias ($table->aliases) {
return;
}
-sub setup_v6_name_alias {
- property_ref('Name_Alias')->add_map(7, 7, "ALERT");
+sub setup_early_name_alias {
+ my $aliases = property_ref('Name_Alias');
+ $aliases = Property->new('Name_Alias') if ! defined $aliases;
+
+ # Before 6.0, this wasn't a problem, and after it, this alias is part of
+ # the Unicode-delivered file.
+ $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
+ return;
+}
+
+sub filter_later_version_name_alias_line {
+
+ # This file has an extra entry per line for the alias type. This is
+ # handled by creating a compound entry: "$alias: $type"; First, split
+ # the line into components.
+ my ($range, $alias, $type, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+
+ # This file contains multiple entries for some components, so tell the
+ # downstream code to allow this in our internal tables; the
+ # $MULTIPLE_AFTER preserves the input ordering.
+ $_ = join ";", $range, $CMD_DELIM
+ . $REPLACE_CMD
+ . '='
+ . $MULTIPLE_AFTER
+ . $CMD_DELIM
+ . "$alias: $type",
+ @remainder;
+ return;
+}
+
+sub filter_early_version_name_alias_line {
+
+ # Early versions did not have the trailing alias type field; implicitly it
+ # was 'correction'
+ $_ .= "; correction";
+ filter_later_version_name_alias_line;
+ return;
}
sub finish_Unicode() {
# need to be finished up.
next if $property == $perl;
+ # Nor do we need to do anything with properties that aren't going to
+ # be output.
+ next if $property->fate == $SUPPRESSED;
+
# Handle the properties that have more than one possible default
if (ref $property->default_map) {
my $default_map = $property->default_map;
$Posix_Lower->set_caseless_equivalent($Posix_Alpha);
my $Alnum = $perl->add_match_table('Alnum',
- Description => 'Alphabetic and (Decimal) Numeric',
+ Description => 'Alphabetic and (decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
$Alnum->add_alias('XPosixAlnum');
my $alias = property_ref('Name_Alias');
if (defined $alias) {
push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+ my $unicode_1 = property_ref('Unicode_1_Name');
+ my %abbreviations;
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type.
+ # Corrections go before said entry, as they should be returned in
+ # preference over the existing entry. (A correction to a correction
+ # should be later in the Name_Alias table, so it will correctly
+ # precede the erroneous correction in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved
+ # temporarily in a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that.
+ # (Most of them will be the same anyway, in which case we don't add a
+ # duplicate)
+
$alias->reset_each_range;
while (my ($range) = $alias->each_range) {
next if $range->value eq "";
- if ($range->start != $range->end) {
- Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
+ }
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
+ }
+ elsif ($type eq 'control') {
+ my $unicode_1_value = $unicode_1->value_of($code_point);
+ next if $unicode_1_value eq $value;
+ $replace_type = $MULTIPLE_AFTER;
+ }
+ else {
+ $replace_type = $MULTIPLE_AFTER;
}
- $perl_charname->add_duplicate($range->start, $range->value);
+
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
}
$alias_sentence = <<END;
-The Name_Alias property adds duplicate code point entries with a corrected
-name. The original (less correct, but still valid) name will be physically
-last.
+The Name_Alias property adds duplicate code point entries that are
+alternatives to the original name. If an addition is a corrected
+name, it will be physically first in the table. The original (less correct,
+but still valid) name will be next; then any alternatives, in no particular
+order; and finally any abbreviations, again in no particular order.
END
}
+
my $comment;
if (@composition <= 2) { # Always at least 2
$comment = join " and ", @composition;
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
-Unicode_1_Name entries are used only for otherwise nameless code
-points.
+Unicode_1_Name entries are used only for nameless code points in the Name
+property.
$alias_sentence
This file doesn't include the algorithmically determinable names. For those,
use 'unicore/Name.pm'
Re_Pod_Entry => 0,
UCD => 0,
Status => $alias->status,
- Externally_Ok => 0);
+ OK_as_Filename => 0);
}
}
# No name collision, so ok to add the perl synonym.
my $make_re_pod_entry;
- my $externally_ok;
+ my $ok_as_filename;
my $status = $alias->status;
if ($nominal_property == $block) {
if ($prefix eq "") {
$make_re_pod_entry = 1;
$status = $status || $DISCOURAGED;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
elsif ($prefix eq 'In_') {
$make_re_pod_entry = 0;
$status = $status || $NORMAL;
- $externally_ok = 1;
+ $ok_as_filename = 1;
}
else {
$make_re_pod_entry = 0;
$status = $status || $DISCOURAGED;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
}
elsif ($prefix ne "") {
# card, and we won't use it for an external name
$make_re_pod_entry = 0;
$status = $status || $NORMAL;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
else {
# own pod entry and can be used for an external name.
$make_re_pod_entry = 1;
$status = $status || $NORMAL;
- $externally_ok = 1;
+ $ok_as_filename = 1;
}
# Here, there isn't a perl pre-existing table with the
UCD => 0,
Status => $status,
- Externally_Ok => $externally_ok);
+ OK_as_Filename => $ok_as_filename);
trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
next PREFIX;
}
UCD => 0,
Status => $status,
- Externally_Ok => $externally_ok);
+ OK_as_Filename => $ok_as_filename);
# And it will be related to the actual table, since it is
# based on it.
$added_table->set_equivalent_to($actual, Related => 1);
foreach my $file (keys %ignored_files) {
push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
}
+ foreach my $file (keys %skipped_files) {
+ push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
+ }
# Then, sort each group.
foreach my $group (keys %grouped_by_reason) {
expressions.
And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
-interpolation in double-quoted strings and regular expressions, but both
-usages require a L<use charnames;|charnames> to be specified, which also
-contains related functions viacode(), vianame(), and string_vianame().
+interpolation in double-quoted strings and regular expressions; and functions
+C<charnames::viacode()>, C<charnames::vianame()>, and
+C<charnames::string_vianame()> (which require a C<use charnames ();> to be
+specified.
Finally, most properties related to decomposition are accessible via
L<Unicode::Normalize>.
push @name, <<END;
+package charnames;
+
# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names. The following
# routines can be used to translate between name and code point and vice versa
foreach my $alias ($table->aliases) {
# Skip non-legal names
- next unless $alias->externally_ok;
+ next unless $alias->ok_as_filename;
next unless $alias->ucd;
$found_ucd = 1; # have at least one legal name
# standardized alias
foreach my $alias ($table->aliases) {
next unless $alias->ucd;
- next unless $alias->externally_ok;
+ next unless $alias->ok_as_filename;
push @{$perlprop_to_aliases{standardize($alias->name)}},
@aliases_list;
}
|| ($table == $property->table('N')
&& $property->table('Y')->is_empty));
-
- # Some tables should match everything
- my $expected_full =
- ($is_property)
- ? # All these types of map tables will be full because
- # they will have been populated with defaults
- ($type == $ENUM || $type == $FORCED_BINARY)
-
- : # A match table should match everything if its method
- # shows it should
- ($table->matches_all
-
- # The complement of an empty binary table will match
- # everything
- || $is_complement_of_empty_binary
- )
- ;
-
if ($table->is_empty) {
if ($suppress_if_empty_warn_if_not) {
Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
}
+ # Some tables should match everything
+ my $expected_full =
+ ($table->fate == $SUPPRESSED)
+ ? 0
+ : ($is_property)
+ ? # All these types of map tables will be full because
+ # they will have been populated with defaults
+ ($type == $ENUM || $type == $FORCED_BINARY)
+
+ : # A match table should match everything if its method
+ # shows it should
+ ($table->matches_all
+
+ # The complement of an empty binary table will match
+ # everything
+ || $is_complement_of_empty_binary
+ )
+ ;
+
my $count = $table->count;
if ($expected_full) {
if ($count != $MAX_UNICODE_CODEPOINTS) {
push @property_aliases, map { Alias->new("Is_" . $_->name,
$_->loose_match,
$_->make_re_pod_entry,
- $_->externally_ok,
+ $_->ok_as_filename,
$_->status,
$_->ucd,
)
# others except DAge.txt (as data in an extracted file can be over-ridden by
# the non-extracted. Some other files depend on data derived from an earlier
# file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode. Mostly, it safest to order by first
+# folding requires data from Unicode. Mostly, it is safest to order by first
# version releases in (except the Jamo). DAge.txt is read before the
# extracted ones because of the rarely used feature $compare_versions. In the
# unlikely event that there were ever an extracted file that contained the Age
Input_file->new('SpecialCasing.txt', v2.1.8,
Each_Line_Handler => \&filter_special_casing_line,
Pre_Handler => \&setup_special_casing,
+ Has_Missings_Defaults => $IGNORED,
),
Input_file->new(
'LineBreak.txt', v3.0.0,
: undef,
\&filter_case_folding_line
],
+ Has_Missings_Defaults => $IGNORED,
),
Input_file->new('DCoreProperties.txt', v3.1.0,
# 5.2 changed this file
),
Input_file->new('NameAliases.txt', v5.0.0,
Property => 'Name_Alias',
- Pre_Handler => ($v_version ge v6.0.0)
- ? \&setup_v6_name_alias
+ Pre_Handler => ($v_version le v6.0.0)
+ ? \&setup_early_name_alias
: undef,
+ Each_Line_Handler => ($v_version le v6.0.0)
+ ? \&filter_early_version_name_alias_line
+ : \&filter_later_version_name_alias_line,
),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => 'Validation Tests',
Property => 'Script_Extensions',
Pre_Handler => \&setup_script_extensions,
Each_Line_Handler => \&filter_script_extensions_line,
+ Has_Missings_Defaults => (($v_version le v6.0.0)
+ ? $NO_DEFAULTS
+ : $IGNORED),
+ ),
+ # The two Indic files are actually available starting in v6.0.0, but their
+ # property values are missing from PropValueAliases.txt in that release,
+ # so that further work would have to be done to get them to work properly
+ # for that release.
+ Input_file->new('IndicMatraCategory.txt', v6.1.0,
+ Property => 'Indic_Matra_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Skip => "Provisional; for the analysis and processing of Indic scripts",
+ ),
+ Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
+ Property => 'Indic_Syllabic_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Skip => "Provisional; for the analysis and processing of Indic scripts",
),
);
# The paths are stored with relative names, and with '/' as the
# delimiter; convert to absolute on this machine
my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
- $potential_files{$full} = 1
- if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
+ $potential_files{lc $full} = 1
+ if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
}
}
}
my @unknown_input_files;
- foreach my $file (keys %potential_files) {
- next if grep { lc($file) eq lc($_) } @known_files;
+ foreach my $file (keys %potential_files) { # The keys are stored in lc
+ next if grep { $file eq lc($_) } @known_files;
# Here, the file is unknown to us. Get relative path name
$file = File::Spec->abs2rel($file);