use File::Path;
use File::Spec;
use Text::Tabs;
+use re "/aa";
# Debugging switch, written as a sub with an empty prototype so that it can be
# used as a bareword constant in guards such as "... if main::DEBUG && @_".
sub DEBUG () { 0 } # Set to 0 for production; 1 for development
my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
-makelist : Rewrite the file list $file_list based on current setup
-annotate : Output an annotation for each character in the table files;
useful for debugging mktables, looking at diffs; but is slow,
- memory intensive; resulting tables are usable but slow and
- very large.
+ memory intensive; resulting tables are usable but are slow and
+ very large (and currently fail the Unicode::UCD.t tests).
-check A B : Executes $0 only if A and B are the same
END
}
if $v_version ge v4.1.0;
push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
if $v_version ge v6.0.0;
+push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
+ if $v_version ge v6.1.0;
+push @tables_that_may_be_empty, '_stc';
# The lists below are hashes, so the key is the item in the list, and the
# value is the reason why it is in the list. This makes generation of
# contains the same information, but without the algorithmically
# determinable Hangul syllables'. This file is not published, so its
# existence is not noted in the comment.
- 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
+ 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
- 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
+ 'Indic_Matra_Category' => "Provisional",
+ 'Indic_Syllabic_Category' => "Provisional",
- 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
- 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
- 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
- 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
+ # Don't suppress ISO_Comment, as otherwise special handling is needed
+ # to differentiate between it and gc=c, which can be written as 'isc',
+ # which is the same characters as ISO_Comment's short name.
- 'Name' => "Accessible via 'use charnames;'",
- 'Name_Alias' => "Accessible via 'use charnames;'",
+ 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
+
+ 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
+ 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+ 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+ 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
);
- # The following are suppressed because they were made contributory or
- # deprecated by Unicode before Perl ever thought about supporting them.
- foreach my $property ('Jamo_Short_Name',
- 'Grapheme_Link',
- 'Expands_On_NFC',
- 'Expands_On_NFD',
- 'Expands_On_NFKC',
- 'Expands_On_NFKD'
+ foreach my $property (
+
+ # The following are suppressed because they were made contributory
+ # or deprecated by Unicode before Perl ever thought about
+ # supporting them.
+ 'Jamo_Short_Name',
+ 'Grapheme_Link',
+ 'Expands_On_NFC',
+ 'Expands_On_NFD',
+ 'Expands_On_NFKC',
+ 'Expands_On_NFKD',
+
+ # The following are suppressed because they have been marked
+ # as deprecated for a sufficient amount of time
+ 'Other_Alphabetic',
+ 'Other_Default_Ignorable_Code_Point',
+ 'Other_Grapheme_Extend',
+ 'Other_ID_Continue',
+ 'Other_ID_Start',
+ 'Other_Lowercase',
+ 'Other_Math',
+ 'Other_Uppercase',
) {
$why_suppressed{$property} = $why_deprecated{$property};
}
'ReadMe.txt' => 'Documentation',
'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
- 'IndicMatraCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
- 'IndicSyllabicCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
);
+my %skipped_files; # List of files that we skip
+
### End of externally interesting definitions, except for @input_file_objects
my $HEADER=<<"EOF";
my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
# the comments at the subroutine definition.
my $UNCONDITIONALLY = 2; # Replace without conditions.
-my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
+my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
# already there
-my $CROAK = 5; # Die with an error if is already there
+my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
+ # already there
+my $CROAK = 6; # Die with an error if is already there
# Flags to give property statuses. The phrases are to remind maintainers that
# if the flag is changed, the indefinite article referring to it in the
my %loose_to_file_of; # loosely maps table names to their respective
# files
my %stricter_to_file_of; # same; but for stricter mapping.
+my %loose_property_to_file_of; # Maps a loose property name to its map file
+my %file_to_swash_name; # Maps the file name to its corresponding key name
+ # in the hash %utf8::SwashInfo
my %nv_floating_to_rational; # maps numeric values floating point numbers to
# their rational equivalent
my %loose_property_name_of; # Loosely maps (non_string) property names to
# standard form
+my %string_property_loose_to_name; # Same, for string properties.
+my %loose_defaults; # keys are of form "prop=value", where 'prop' is
+ # the property name in standard loose form, and
+ # 'value' is the default value for that property,
+ # also in standard loose form.
my %loose_to_standard_value; # loosely maps table names to the canonical
# alias for them
+my %ambiguous_names; # keys are alias names (in standard form) that
+ # have more than one possible meaning.
+my %prop_aliases; # Keys are standard property name; values are each
+ # one's aliases
+my %prop_value_aliases; # Keys of top level are standard property name;
+ # values are keys to another hash. Each one is
+ # one of the property's values, in standard form.
+ # The values are that prop-val's aliases.
+my %ucd_pod; # Holds entries that will go into the UCD section of the pod
# Most properties are immune to caseless matching, otherwise you would get
# nonsensical results, as properties are a function of a code point, not
# contrast to the non_skip element, which is supposed to be used very
# temporarily for debugging. Sets 'optional' to 1. Also, files that we
# pretty much will never look at can be placed in the global
- # %ignored_files instead. Ones used here will be added to that list.
+ # %ignored_files instead. Ones used here will be added to %skipped_files.
main::set_access('skip', \%skip, 'c');
my %each_line_handler;
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $ignored_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr}
}
return $self;
# its name
if ($seen_non_extracted_non_age) {
if ($file =~ /$EXTRACTED/i) {
- Carp::my_carp_bug(join_lines(<<END
+ Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
have subtle problems
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
my $fkey = File::Spec->rel2abs($file);
- my $expecting = delete $potential_files{$fkey};
- $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
+ my $expecting = delete $potential_files{lc($fkey)};
+
Carp::my_carp("Was not expecting '$file'.") if
! $expecting
&& ! defined $handle{$addr};
|| @defaults > 2
|| ($default =~ /^</
&& $default !~ /^<code *point>$/i
- && $default !~ /^<none>$/i))
+ && $default !~ /^<none>$/i
+ && $default !~ /^<script>$/i))
{
$self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
}
elsif ($default =~ /^<code *point>$/i) {
$default = $CODE_POINT;
}
+ elsif ($default =~ /^<script>$/i) {
+
+ # Special case this one. Currently is from
+ # ScriptExtensions.txt, and means for all unlisted
+ # code points, use their Script property values.
+ # For the code points not listed in that file, the
+ # default value is 'Unknown'.
+ $default = "Unknown";
+ }
# Store them as a sub-arrays with both components.
push @{$missings{$addr}}, [ $default, $property ];
# they don't appear in documentation). Enum
main::set_access('status', \%status, 'r');
- my %externally_ok;
+ my %ok_as_filename;
# Similarly, some aliases should not be considered as usable ones for
# external use, such as file names, or we don't want documentation to
# recommend them. Boolean
- main::set_access('externally_ok', \%externally_ok, 'r');
+ main::set_access('ok_as_filename', \%ok_as_filename, 'r');
sub new {
my $class = shift;
$name{$addr} = shift;
$loose_match{$addr} = shift;
$make_re_pod_entry{$addr} = shift;
- $externally_ok{$addr} = shift;
+ $ok_as_filename{$addr} = shift;
$status{$addr} = shift;
$ucd{$addr} = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Null names are never ok externally
- $externally_ok{$addr} = 0 if $name{$addr} eq "";
+ $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
return $self;
}
# either a constructor or a method. If called as a method, the result
# will be a new() instance of the calling object, containing the union
# of that object with the other parameter's code points; if called as
- # a constructor, the first parameter gives the class the new object
+ # a constructor, the first parameter gives the class that the new object
# should be, and the second parameter gives the code points to go into
# it.
# In either case, there are two parameters looked at by this routine;
# just a single code point.
#
# If they are ranges, this routine doesn't make any effort to preserve
- # the range values of one input over the other. Therefore this base
- # class should not allow _union to be called from other than
+ # the range values and types of one input over the other. Therefore
+ # this base class should not allow _union to be called from other than
# initialization code, so as to prevent two tables from being added
# together where the range values matter. The general form of this
# routine therefore belongs in a derived class, but it was moved here
# to avoid duplication of code. The failure to overload this in this
# class keeps it safe.
#
+ # It does make the effort during initialization to accept tables with
+ # multiple values for the same code point, and to preserve the order
+ # of these. If there is only one input range or range set, it doesn't
+ # sort (as it should already be sorted to the desired order), and will
+ # accept multiple values per code point. Otherwise it will merge
+ # multiple values into a single one.
my $self;
my @args; # Arguments to pass to the constructor
# Accumulate all records from both lists.
my @records;
+ my $input_count = 0;
for my $arg (@args) {
#local $to_trace = 0 if main::DEBUG;
trace "argument = $arg" if main::DEBUG && $to_trace;
Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
return;
}
+
$arg = [ $arg ] if ! ref $arg;
my $type = ref $arg;
if ($type eq 'ARRAY') {
foreach my $element (@$arg) {
push @records, Range->new($element, $element);
+ $input_count++;
}
}
elsif ($arg->isa('Range')) {
push @records, $arg;
+ $input_count++;
}
elsif ($arg->can('ranges')) {
push @records, $arg->ranges;
+ $input_count++;
}
else {
my $message = "";
# Sort with the range containing the lowest ordinal first, but if
# two ranges start at the same code point, sort with the bigger range
# of the two first, because it takes fewer cycles.
- @records = sort { ($a->start <=> $b->start)
+ if ($input_count > 1) {
+ @records = sort { ($a->start <=> $b->start)
or
# if b is shorter than a, b->end will be
# less than a->end, and we want to select
# a, so want to return -1
($b->end <=> $a->end)
} @records;
+ }
my $new = $class->new(@_);
for my $set (@records) {
my $start = $set->start;
my $end = $set->end;
- my $value = $set->value;
+ my $value = $set->value;
+ my $type = $set->type;
if ($start > $new->max) {
- $new->_add_delete('+', $start, $end, $value);
+ $new->_add_delete('+', $start, $end, $value, Type => $type);
}
elsif ($end > $new->max) {
- $new->_add_delete('+', $new->max +1, $end, $value);
+ $new->_add_delete('+', $new->max +1, $end, $value,
+ Type => $type);
+ }
+ elsif ($input_count == 1) {
+ # Here, overlaps existing range, but is from a single input,
+ # so preserve the multiple values from that input.
+ $new->_add_delete('+', $start, $end, $value, Type => $type,
+ Replace => $MULTIPLE_AFTER);
}
}
# new and old values are identical, the
# replacement is skipped to save cycles
# => $IF_NOT_EQUIVALENT means to replace the existing values
- # with this one if they are not equivalent.
+ # with this one if they are not equivalent; this is the default.
# Ranges are equivalent if their types are the
# same, and they are the same string; or if
# both are type 0 ranges, if their Unicode
# style when the pre-existing and replacement
# standard forms are the same, we can move to
# the modern style
- # => $MULTIPLE means that if this range duplicates an
+ # => $MULTIPLE_BEFORE means that if this range duplicates an
# existing one, but has a different value,
# don't replace the existing one, but insert
# this one, so that the same range can occur
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
+ # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
+ # FIFO, so that this one is inserted after all
+ # others that currently exist.
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
# Here, we have taken care of the case where $replace is $NO.
# Remember that here, r[$i-1]->end < $start <= r[$i]->end
# If inserting a multiple record, this is where it goes, before the
- # first (if any) existing one. This implies an insertion, and no
- # change to any existing ranges. Note that $i can be -1 if this new
- # range doesn't actually duplicate any existing, and comes at the
- # beginning of the list.
- if ($replace == $MULTIPLE) {
+ # first (if any) existing one if inserting LIFO. (If this is to go
+ # afterwards, FIFO, we below move the pointer to there.) These imply
+ # an insertion, and no change to any existing ranges. Note that $i
+ # can be -1 if this new range doesn't actually duplicate any existing,
+ # and comes at the beginning of the list.
+ if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
if ($start != $end) {
Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
return;
}
- # Don't add an exact duplicate, as it isn't really a multiple
+ # If the new code point is within a current range ...
if ($end >= $r->[$i]->start) {
+
+ # Don't add an exact duplicate, as it isn't really a multiple
my $existing_value = $r->[$i]->value;
my $existing_type = $r->[$i]->type;
return if $value eq $existing_value && $type eq $existing_type;
# pre-existing code point, which will again be a single code
# point range. Because 'i' likely will have changed as a
# result of these operations, we can't just continue on, but
- # do this operation recursively as well.
+ # do this operation recursively as well. If we are inserting
+ # LIFO, the pre-existing code point needs to go after the new
+ # one, so use MULTIPLE_AFTER; and vice versa.
if ($r->[$i]->start != $r->[$i]->end) {
$self->_add_delete('-', $start, $end, "");
$self->_add_delete('+', $start, $end, $value, Type => $type);
- return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE);
+ return $self->_add_delete('+',
+ $start, $end,
+ $existing_value,
+ Type => $existing_type,
+ Replace => ($replace == $MULTIPLE_BEFORE)
+ ? $MULTIPLE_AFTER
+ : $MULTIPLE_BEFORE);
+ }
+ }
+
+ # If to place this new record after, move to beyond all existing
+ # ones.
+ if ($replace == $MULTIPLE_AFTER) {
+ while ($i < @$r && $r->[$i]->start == $start) {
+ $i++;
}
}
return @return;
}
- # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
- # delete, insert, and replace either unconditionally or if not
+ # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
+ # leaves delete, insert, and replace either unconditionally or if not
# equivalent. $i still points to the first potential affected range.
# Now find the highest range affected, which will determine the length
# parameter to splice. (The input range can span multiple existing
$j--; # $j now points to the highest affected range.
trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
- # Here, have taken care of $NO and $MULTIPLE replaces.
+ # Here, have taken care of $NO and $MULTIPLE_foo replaces.
# $j points to the highest affected range. But it can be < $i or even
# -1. These happen only if the insertion is entirely in the gap
# between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
for my $try_hard (0, 1) {
# Look through all the ranges for a usable code point.
- for my $set ($self->ranges) {
+ for my $set (reverse $self->ranges) {
# Try the edge cases first, starting with the end point of the
# range.
my $self = shift;
my $code_point = shift;
my $value = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ my %args = @_;
+ my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
+ Carp::carp_extra_args(\%args) if main::DEBUG && %args;
return $self->add_map($code_point, $code_point,
- $value, Replace => $MULTIPLE);
+ $value, Replace => $replace);
}
} # End of closure for package Range_Map
my %format;
# The format of the entries of the table. This is calculated from the
# data in the table (or passed in the constructor). This is an enum e.g.,
- # $STRING_FORMAT
+ # $STRING_FORMAT. It is marked protected as it should not be generally
+ # used to override calculations.
main::set_access('format', \%format, 'r', 'p_s');
sub new {
# All arguments are key => value pairs, which you can see below, most
# of which match fields documented above. Otherwise: Re_Pod_Entry,
- # Externally_Ok, and Fuzzy apply to the names of the table, and are
+ # OK_as_Filename, and Fuzzy apply to the names of the table, and are
# documented in the Alias package
return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
my $ucd = delete $args{'UCD'};
my $description = delete $args{'Description'};
- my $externally_ok = delete $args{'Externally_Ok'};
+ my $ok_as_filename = delete $args{'OK_as_Filename'};
my $loose_match = delete $args{'Fuzzy'};
my $note = delete $args{'Note'};
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
# clarity. Other routines rely on the full name being first on the
# list
$self->add_alias($full_name{$addr},
- Externally_Ok => $externally_ok,
+ OK_as_Filename => $ok_as_filename,
Fuzzy => $loose_match,
Re_Pod_Entry => $make_re_pod_entry,
Status => $status{$addr},
# Then comes the other name, if meaningfully different.
if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
$self->add_alias($name{$addr},
- Externally_Ok => $externally_ok,
+ OK_as_Filename => $ok_as_filename,
Fuzzy => $loose_match,
Re_Pod_Entry => $make_re_pod_entry,
Status => $status{$addr},
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
$make_re_pod_entry = $YES unless defined $make_re_pod_entry;
- my $externally_ok = delete $args{'Externally_Ok'};
- $externally_ok = 1 unless defined $externally_ok;
+ my $ok_as_filename = delete $args{'OK_as_Filename'};
+ $ok_as_filename = 1 unless defined $ok_as_filename;
my $status = delete $args{'Status'};
$status = $NORMAL unless defined $status;
$insert_position,
0,
Alias->new($name, $loose_match, $make_re_pod_entry,
- $externally_ok, $status, $ucd);
+ $ok_as_filename, $status, $ucd);
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
foreach my $alias ($self->aliases()) {
# Don't use an alias that isn't ok to use for an external name.
- next if ! $alias->externally_ok;
+ next if ! $alias->ok_as_filename;
my $name = main::Standardize($alias->name);
trace $self, $name if main::DEBUG && $to_trace;
if ($annotate) {
- # if annotating each code point, must print 1 per line.
+ # If annotating each code point, must print 1 per line.
# The variable could point to a subroutine, and we don't want
# to lose that fact, so only set if not set already
$range_size_1 = 1 if ! $range_size_1;
\%anomalous_entries,
'readable_array');
- my %core_access;
- # This is a string, solely for documentation, indicating how one can get
- # access to this property via the Perl core.
- main::set_access('core_access', \%core_access, 'r', 's');
-
my %to_output_map;
# Enum as to whether or not to write out this map table:
# 0 don't output
# Optional initialization data for the table.
my $initialize = delete $args{'Initialize'};
- my $core_access = delete $args{'Core_Access'};
my $default_map = delete $args{'Default_Map'};
my $property = delete $args{'_Property'};
my $full_name = delete $args{'Full_Name'};
my $addr = do { no overloading; pack 'J', $self; };
$anomalous_entries{$addr} = [];
- $core_access{$addr} = $core_access;
$default_map{$addr} = $default_map;
$self->initialize($initialize) if defined $initialize;
my $return = $self->SUPER::header();
- $return .= $INTERNAL_ONLY_HEADER if $self->to_output_map == $INTERNAL_MAP;
+ if ($self->to_output_map == $INTERNAL_MAP) {
+ $return .= $INTERNAL_ONLY_HEADER;
+ }
+ else {
+ my $property_name = $self->property->full_name;
+ $return .= <<END;
+
+# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
+
+# This file is for internal use by core Perl only. It is retained for
+# backwards compatibility with applications that may have come to rely on it,
+# but its format and even its name or existence are subject to change without
+# notice in a future Perl version. Don't use it directly. Instead, its
+# contents are now retrievable through a stable API in the Unicode::UCD
+# module: Unicode::UCD::prop_invmap('$property_name').
+END
+ }
return $return;
}
# have our own flag for just this purpose; but it works now to exclude
# Perl generated synonyms from the lists for properties, where the
# name is always the proper Unicode one.
- my @property_aliases = grep { $_->externally_ok } $self->aliases;
+ my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
my $count = $self->count;
my $default_map = $default_map{$addr};
$property_aliases[$i]->name . '(cp)'
);
}
- $comment .=
- "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
-
- my $access = $core_access{$addr};
- if ($access) {
- $comment .= "accessible through the Perl core via $access.";
- }
- else {
- $comment .= "not accessible through the Perl core directly.";
- }
+ my $full_name = $self->property->full_name;
+ $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
# And append any commentary already set from the actual property.
$comment .= "\n\n" . $self->comment if $self->comment;
return unless defined $name;
if (defined $swash_keys{$name}) {
- Carp::my_carp(join_lines(<<END
+ Carp::my_carp(main::join_lines(<<END
Already created a swash name '$name' for $swash_keys{$name}. This means that
the same name desired for $self shouldn't be used. Bad News. This must be
fixed before production use, but proceeding anyway
\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
END
if ($specials_name) {
- $return .= <<END;
+ $return .= <<END;
\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
END
}
if $format eq $FLOAT_FORMAT
&& $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
$format = $HEX_FORMAT
- if $format eq $RATIONAL_FORMAT
- && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
+ if ($format eq $RATIONAL_FORMAT
+ && $map !~
+ m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
+ # Assume a leading zero means hex,
+ # even if all digits are 0-9
+ || ($format eq $INTEGER_FORMAT
+ && $map =~ /^0/);
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
}
$any_of_these = 'any of these'
}
- my $comment = "";
+ my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
if ($has_unrelated) {
$comment .= <<END;
This file is for tables that are not necessarily related: To conserve
comment
complete_name
containing_range
- core_access
count
default_map
delete_range
range_size_1
reset_each_range
set_comment
- set_core_access
set_default_map
set_file_path
set_final_comment
+ _set_format
set_range_size_1
set_status
set_to_output_map
{ # Closure
- my $indent_increment = " " x 2;
+ my $indent_increment = " " x (($debugging_build) ? 2 : 0);
my %already_output;
$main::simple_dumper_nesting = 0;
my $item = shift;
my $indent = shift;
- $indent = "" if ! defined $indent;
+ $indent = "" if ! $debugging_build || ! defined $indent;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Indent array elements one level
$output .= &simple_dumper($item->[$i], $next_indent);
+ next if ! $debugging_build;
$output =~ s/\n$//; # Remove any trailing nl so
$output .= " # [$i]\n"; # as to add a comment giving
# the array index
}
}
- # This entry is still missing as of 6.0, perhaps because no short name for
- # it.
- if (-e 'NameAliases.txt') {
- my $aliases = property_ref('Name_Alias');
- if (! defined $aliases) {
- $aliases = Property->new('Name_Alias');
- }
- }
-
# These are used so much, that we set globals for them.
$gc = property_ref('General_Category');
$block = property_ref('Block');
$gc->add_alias('Category');
# For backwards compatibility, these property files have particular names.
- my $upper = property_ref('Uppercase_Mapping');
- $upper->set_core_access('uc()');
- $upper->set_file('Upper'); # This is what utf8.c calls it
-
- my $lower = property_ref('Lowercase_Mapping');
- $lower->set_core_access('lc()');
- $lower->set_file('Lower');
-
- my $title = property_ref('Titlecase_Mapping');
- $title->set_core_access('ucfirst()');
- $title->set_file('Title');
+ property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
+ # utf8.c calls it
+ property_ref('Lowercase_Mapping')->set_file('Lower');
+ property_ref('Titlecase_Mapping')->set_file('Title');
my $fold = property_ref('Case_Folding');
$fold->set_file('Fold') if defined $fold;
my $input_field_count = $i;
# This routine in addition outputs these extra fields:
+
my $DECOMP_TYPE = $i++; # Decomposition type
# These fields are modifications of ones above, and are usually
# the code point and name on each line. This was actually the hardest
# thing to design around. The code points in those ranges may actually
# have real maps not given by these two lines. These maps will either
- # be algorithmically determinable, or in the extracted files furnished
+ # be algorithmically determinable, or be in the extracted files furnished
# with the UCD. In the event of conflicts between these extracted files,
# and this one, Unicode says that this one prevails. But it shouldn't
# prevail for conflicts that occur in these ranges. The data from the
# first.) A comment for it will later be constructed based on the
# actual properties present and used
$perl_charname = Property->new('Perl_Charnames',
- Core_Access => '\N{...} and "use charnames"',
Default_Map => "",
Directory => File::Spec->curdir(),
File => 'Name',
Range_Size_1 => \&output_perl_charnames_line,
Type => $STRING,
);
- $perl_charname->set_proxy_for('Name', 'Name_Alias');
+ $perl_charname->set_proxy_for('Name');
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
Directory => File::Spec->curdir(),
$uc = property_ref('uc');
# For each of the case change mappings...
- foreach my $case_table ($lc, $tc, $uc) {
- my $case = $case_table->name;
- my $full = property_ref($case);
- unless (defined $full && ! $full->is_empty) {
+ foreach my $full_table ($lc, $tc, $uc) {
+ my $full_name = $full_table->name;
+ unless (defined $full_table && ! $full_table->is_empty) {
Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
}
# The simple version's name in each mapping merely has an 's' in
# front of the full one's
- my $simple_name = 's' . $case;
+ my $simple_name = 's' . $full_name;
my $simple = property_ref($simple_name);
- $simple->initialize($full) if $simple->to_output_map();
+ $simple->initialize($full_table) if $simple->to_output_map();
- my $simple_only = Property->new("_s$case",
+ my $simple_only = Property->new("_s$full_name",
Type => $STRING,
Default_Map => $CODE_POINT,
Perl_Extension => 1,
Fate => $INTERNAL_ONLY,
- Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
+ Description => "This contains the simple mappings for $full_name for just the code points that have different full mappings");
$simple_only->set_to_output_map($INTERNAL_MAP);
$simple_only->add_comment(join_lines( <<END
This file is for UCD.pm so that it can construct simple mappings that would
return;
}
- $_ = "$fields[0]; lc; $fields[1]";
- $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
- $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
+ my $decimal_code_point = hex $fields[0];
- # Copy any simple case change to the special tables constructed if
- # being overridden by a multi-character case change.
- if ($fields[1] ne $fields[0]
- && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _slc; $value");
- }
- if ($fields[2] ne $fields[0]
- && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _stc; $value");
- }
- if ($fields[3] ne $fields[0]
- && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _suc; $value");
+ # Loop to handle each of the three mappings in the input line, in
+ # order, with $i indicating the current field number.
+ my $i = 0;
+ for my $object ($lc, $tc, $uc) {
+ $i++; # First time through, $i = 1 ... 3rd time = 3
+
+ my $value = $object->value_of($decimal_code_point);
+ $value = ($value eq $CODE_POINT)
+ ? $decimal_code_point
+ : hex $value;
+
+ # If this isn't a multi-character mapping, it should already have
+ # been read in.
+ if ($fields[$i] !~ / /) {
+ if ($value != hex $fields[$i]) {
+ # Warn that the two input files disagree about this simple mapping.
+ # hex() is a named unary operator and binds more loosely than '.',
+ # so its argument must be parenthesized; otherwise the remainder of
+ # the message is swallowed into hex()'s operand and never printed.
+ Carp::my_carp("Bad news. UnicodeData.txt thinks "
+ . $object->name
+ . "(0x$fields[0]) is $value"
+ . " and SpecialCasing.txt thinks it is "
+ . hex($fields[$i])
+ . ". Good luck. Proceeding anyway.");
+ }
+ }
+ else {
+ $file->insert_adjusted_lines("$fields[0]; "
+ . $object->full_name
+ . "; $fields[$i]");
+
+ # Copy any simple case change to the special tables
+ # constructed if being overridden by a multi-character case
+ # change.
+ if ($value != $decimal_code_point) {
+ $file->insert_adjusted_lines(sprintf("%s; _s%s; %04X",
+ $fields[0],
+ $object->name,
+ $value));
+ }
+ }
}
+ # Everything has been handled by the insert_adjusted_lines()
+ $_ = "";
+
return;
}
}
# Create the map for simple only if are going to output it, for otherwise
# it takes no part in anything we do.
my $to_output_simple;
+ my $non_final_folds;
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
}
+ $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
+ );
+
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
property_ref('Case_Folding')->add_comment(join_lines( <<END
# so that _swash_inversion_hash() is able to construct closures
# without having to worry about F mappings.
if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
- $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
+ $_ = "$range; Case_Folding; "
+ . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
+ if ($type eq 'F') {
+ my @string = split " ", $map;
+ for my $i (0 .. @string - 1 -1) {
+ $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
+ }
+ }
}
else {
$_ = "";
# The Script_Extensions property starts out with a clone of the Script
# property.
- my $sc = property_ref("Script");
- my $scx = Property->new("scx", Full_Name => "Script_Extensions",
- Initialize => $sc,
- Default_Map => $sc->default_map,
- Pre_Declared_Maps => 0,
- Format => $STRING_WHITE_SPACE_LIST,
- );
+ my $scx = property_ref("Script_Extensions");
+ $scx = Property->new("scx", Full_Name => "Script_Extensions")
+ if ! defined $scx;
+ $scx->_set_format($STRING_WHITE_SPACE_LIST);
+ $scx->initialize($script);
+ $scx->set_default_map($script->default_map);
+ $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
$scx->add_comment(join_lines( <<END
The values for code points that appear in one script are just the same as for
the 'Script' property. Likewise the values for those that appear in many
END
));
- # Make the scx's tables and aliases for them the same as sc's
- foreach my $table ($sc->tables) {
+ # Initialize scx's tables and the aliases for them to be the same as sc's
+ foreach my $table ($script->tables) {
my $scx_table = $scx->add_match_table($table->name,
Full_Name => $table->full_name);
foreach my $alias ($table->aliases) {
return;
}
-sub setup_v6_name_alias {
- property_ref('Name_Alias')->add_map(7, 7, "ALERT");
+sub setup_early_name_alias {
+
+    # Fetch the Name_Alias property, creating it first if it doesn't exist
+    # yet.
+    my $name_alias = property_ref('Name_Alias');
+    unless (defined $name_alias) {
+        $name_alias = Property->new('Name_Alias');
+    }
+
+    # Only Unicode 6.0 needs the ALERT alias added by hand: earlier releases
+    # didn't have the problem, and later ones include the alias in the
+    # Unicode-delivered file.
+    if ($v_version eq v6.0.0) {
+        $name_alias->add_map(7, 7, "ALERT: control");
+    }
+
+    return;
+}
+
+sub filter_later_version_name_alias_line {
+
+    # Rewrites the input line held in $_.  Each line of this file has an
+    # extra field giving the alias type; the alias and its type are folded
+    # into a single compound value, "$alias: $type".  The split limit of -1
+    # retains trailing null fields.
+    my @fields = split /\s*;\s*/, $_, -1;
+    my $range = shift @fields;
+    my $alias = shift @fields;
+    my $type  = shift @fields;
+
+    # The file can contain several entries for the same component, so prefix
+    # the value with the command that tells the downstream code to allow
+    # this in our internal tables; $MULTIPLE_AFTER preserves the input
+    # ordering.
+    my $command = "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_AFTER$CMD_DELIM";
+
+    # Whatever fields followed the type are passed through unchanged.
+    $_ = join ";", $range, "$command$alias: $type", @fields;
+    return;
+}
+
+sub filter_early_version_name_alias_line {
+
+    # Lines in early versions of the file lack the trailing alias type
+    # field, the type implicitly being 'correction'.  Append that type to the
+    # line in $_ so that it has the later layout, then let the filter for the
+    # later versions do the real work.
+    $_ .= '; correction';
+    filter_later_version_name_alias_line();
+    return;
}
sub finish_Unicode() {
# 3) Calculates all the regular expression match tables based on the
# mappings.
# 3) Calculates and adds the tables which are defined by Unicode, but
- # which aren't derived by them
+ # which aren't derived by them, and certain derived tables that Perl
+ # uses.
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
# need to be finished up.
next if $property == $perl;
+ # Nor do we need to do anything with properties that aren't going to
+ # be output.
+ next if $property->fate == $SUPPRESSED;
+
# Handle the properties that have more than one possible default
if (ref $property->default_map) {
my $default_map = $property->default_map;
Lowercase_Mapping
Titlecase_Mapping
Case_Folding
- } ) {
+ } )
+ {
my $full = property_ref($map);
if ($full->is_empty) {
my $simple = property_ref('Simple_' . $map);
$Posix_Lower->set_caseless_equivalent($Posix_Alpha);
my $Alnum = $perl->add_match_table('Alnum',
- Description => 'Alphabetic and (Decimal) Numeric',
+ Description => 'Alphabetic and (decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
$Alnum->add_alias('XPosixAlnum');
my $alias = property_ref('Name_Alias');
if (defined $alias) {
push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+ my $unicode_1 = property_ref('Unicode_1_Name');
+ my %abbreviations;
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type.
+ # Corrections go before said entry, as they should be returned in
+ # preference over the existing entry. (A correction to a correction
+ # should be later in the Name_Alias table, so it will correctly
+ # precede the erroneous correction in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved
+ # temporarily in a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that.
+ # (Most of them will be the same anyway, in which case we don't add a
+ # duplicate)
+
$alias->reset_each_range;
while (my ($range) = $alias->each_range) {
next if $range->value eq "";
- if ($range->start != $range->end) {
- Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
+ }
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
+ }
+ elsif ($type eq 'control') {
+ my $unicode_1_value = $unicode_1->value_of($code_point);
+ next if $unicode_1_value eq $value;
+ $replace_type = $MULTIPLE_AFTER;
+ }
+ else {
+ $replace_type = $MULTIPLE_AFTER;
}
- $perl_charname->add_duplicate($range->start, $range->value);
+
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
}
$alias_sentence = <<END;
-The Name_Alias property adds duplicate code point entries with a corrected
-name. The original (less correct, but still valid) name will be physically
-last.
+The Name_Alias property adds duplicate code point entries that are
+alternatives to the original name. If an addition is a corrected
+name, it will be physically first in the table. The original (less correct,
+but still valid) name will be next; then any alternatives, in no particular
+order; and finally any abbreviations, again in no particular order.
END
}
+
my $comment;
if (@composition <= 2) { # Always at least 2
$comment = join " and ", @composition;
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
-Unicode_1_Name entries are used only for otherwise nameless code
-points.
+Unicode_1_Name entries are used only for nameless code points in the Name
+property.
$alias_sentence
This file doesn't include the algorithmically determinable names. For those,
use 'unicore/Name.pm'
Re_Pod_Entry => 0,
UCD => 0,
Status => $alias->status,
- Externally_Ok => 0);
+ OK_as_Filename => 0);
}
}
# No name collision, so ok to add the perl synonym.
my $make_re_pod_entry;
- my $externally_ok;
+ my $ok_as_filename;
my $status = $alias->status;
if ($nominal_property == $block) {
if ($prefix eq "") {
$make_re_pod_entry = 1;
$status = $status || $DISCOURAGED;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
elsif ($prefix eq 'In_') {
$make_re_pod_entry = 0;
$status = $status || $NORMAL;
- $externally_ok = 1;
+ $ok_as_filename = 1;
}
else {
$make_re_pod_entry = 0;
$status = $status || $DISCOURAGED;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
}
elsif ($prefix ne "") {
# card, and we won't use it for an external name
$make_re_pod_entry = 0;
$status = $status || $NORMAL;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
else {
# own pod entry and can be used for an external name.
$make_re_pod_entry = 1;
$status = $status || $NORMAL;
- $externally_ok = 1;
+ $ok_as_filename = 1;
}
# Here, there isn't a perl pre-existing table with the
UCD => 0,
Status => $status,
- Externally_Ok => $externally_ok);
+ OK_as_Filename => $ok_as_filename);
trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
next PREFIX;
}
UCD => 0,
Status => $status,
- Externally_Ok => $externally_ok);
+ OK_as_Filename => $ok_as_filename);
# And it will be related to the actual table, since it is
# based on it.
$added_table->set_equivalent_to($actual, Related => 1);
if ($table->isa('Property')) {
$table->set_file_path(@$directory_ref, $file);
push @map_properties, $table;
+
+ # No swash means don't do the rest of this.
+ return if $table->fate != $ORDINARY;
+
+ # Get the path to the file
+ my @path = $table->file_path;
+
+ # Use just the file name if no subdirectory.
+ shift @path if $path[0] eq File::Spec->curdir();
+
+ my $file = join '/', @path;
+
+ # Create a hash entry for utf8_heavy to get the file that stores this
+ # property's map table
+ foreach my $alias ($table->aliases) {
+ my $name = $alias->name;
+ $loose_property_to_file_of{standardize($name)} = $file;
+ }
+
+ # And a way for utf8_heavy to find the proper key in the SwashInfo
+ # hash for this property.
+ $file_to_swash_name{$file} = "To" . $table->swash_name;
return;
}
# Associate it with its file internally. Don't include the
# $matches_directory first component
$table->set_file_path(@$directory_ref, $file);
+
+ # No swash means don't do the rest of this.
+ next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
+
my $sub_filename = join('/', $directory_ref->[1, -1], $file);
my $property = $table->property;
? "" # 'perl' is never explicitly stated
: standardize($property->name) . '=';
+ my $is_default = 0; # Is this table the default one for the property?
+
+ # To calculate $is_default, we find if this table is the same as the
+ # default one for the property. But this is complicated by the
+ # possibility that there is a master table for this one, and the
+ # information is stored there instead of here.
my $parent = $table->parent;
my $leader_prop = $parent->property;
+ my $default_map = $leader_prop->default_map;
+ if (defined $default_map) {
+ my $default_table = $leader_prop->table($default_map);
+ $is_default = 1 if defined $default_table && $parent == $default_table;
+ }
# Calculate the loose name for this table. Mostly it's just its name,
# standardized. But in the case of Perl tables that are single-form
if ($caseless_equivalent != 0) {
$caseless_equivalent_to{$standard} = $caseless_equivalent;
}
+
+ # Add to defaults list if the table this alias belongs to is the
+ # default one
+ $loose_defaults{$standard} = 1 if $is_default;
}
}
return;
}
+sub make_ucd_table_pod_entries {
+ my $table = shift;
+
+ # Generate the entries for the UCD section of the pod for $table. This
+ # also calculates if names are ambiguous, so has to be called even if the
+ # pod is not being output. $table is either a property or a table belonging to one; the results are recorded in %ucd_pod and %ambiguous_names.
+
+ my $short_name = $table->name;
+ my $standard_short_name = standardize($short_name);
+ my $full_name = $table->full_name;
+ my $standard_full_name = standardize($full_name);
+
+ my $full_info = ""; # Text of info column for full-name entries
+ my $other_info = ""; # Text of info column for all other entries
+ my $short_info = ""; # Text of info column for short-name entries
+ my $meaning = ""; # Synonym of this table
+
+ my $property = ($table->isa('Property'))
+ ? $table
+ : $table->parent->property;
+
+ my $perl_extension = $table->perl_extension;
+
+ # Get the more official name for perl extensions that aren't
+ # stand-alone properties
+ if ($perl_extension && $property != $table) {
+ if ($property == $perl ||$property->type == $BINARY) {
+ $meaning = $table->complete_name;
+ }
+ else {
+ $meaning = $property->full_name . "=$full_name";
+ }
+ }
+
+ # There are three types of info column. One for the short name, one for
+ # the full name, and one for everything else. They mostly are the same,
+ # so initialize in the same loop.
+ foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
+ if ($perl_extension && $property != $table) {
+
+ # Add the synonymous name for the non-full name entries; and to
+ # the full-name entry if it adds extra information
+ if ($info_ref == \$other_info
+ || ($info_ref == \$short_info
+ && $standard_short_name ne $standard_full_name)
+ || standardize($meaning) ne $standard_full_name
+ ) {
+ $$info_ref .= "$meaning.";
+ }
+ }
+ elsif ($info_ref != \$full_info) {
+
+ # Otherwise, the non-full name columns include the full name
+ $$info_ref .= $full_name;
+ }
+
+ # And the full-name entry includes the short name, if different
+ if ($info_ref == \$full_info
+ && $standard_short_name ne $standard_full_name)
+ {
+ $full_info =~ s/\.\Z//;
+ $full_info .= " " if $full_info;
+ $full_info .= "(Short: $short_name)";
+ }
+
+ if ($table->perl_extension) {
+ $$info_ref =~ s/\.\Z//;
+ $$info_ref .= ". " if $$info_ref;
+ $$info_ref .= "(Perl extension)";
+ }
+ }
+
+ # Add any extra annotations to the full name entry
+ foreach my $more_info ($table->description,
+ $table->note,
+ $table->status_info)
+ {
+ next unless $more_info;
+ $full_info =~ s/\.\Z//;
+ $full_info .= ". " if $full_info;
+ $full_info .= $more_info;
+ }
+
+ # These keep track if have created full and short name pod entries for the
+ # property
+ my $done_full = 0;
+ my $done_short = 0;
+
+ # Every possible name is kept track of, even those that aren't going to be
+ # output. This way we can be sure to find the ambiguities.
+ foreach my $alias ($table->aliases) {
+ my $name = $alias->name;
+ my $standard = standardize($name);
+ my $info;
+ my $output_this = $alias->ucd;
+
+ # If the full and short names are the same, we want to output the full
+ # one's entry, so it has priority.
+ if ($standard eq $standard_full_name) {
+ next if $done_full;
+ $done_full = 1;
+ $info = $full_info;
+ }
+ elsif ($standard eq $standard_short_name) {
+ next if $done_short;
+ $done_short = 1;
+ next if $standard_short_name eq $standard_full_name; # NOTE(review): appears unreachable; when the two names are equal the branch above is taken instead
+ $info = $short_info;
+ }
+ else {
+ $info = $other_info;
+ }
+
+ # Here, we have set up the two columns for this entry. But if an
+ # entry already exists for this name, we have to decide which one
+ # we're going to later output.
+ if (exists $ucd_pod{$standard}) {
+
+ # If the two entries refer to the same property, it's not going to
+ # be ambiguous. (Likely it's because the names when standardized
+ # are the same.) But that means if they are different properties,
+ # there is ambiguity.
+ if ($ucd_pod{$standard}->{'property'} != $property) {
+
+ # Here, we have an ambiguity. This code assumes that one is
+ # scheduled to be output and one not and that one is a perl
+ # extension (which is not to be output) and the other isn't.
+ # If those assumptions are wrong, things have to be rethought.
+ if ($ucd_pod{$standard}{'output_this'} == $output_this
+ || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
+ || $output_this == $perl_extension)
+ {
+ Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
+ }
+
+ # We modify the info column of the one being output to
+ # indicate the ambiguity. Set $which to point to that one's
+ # info.
+ my $which;
+ if ($ucd_pod{$standard}{'output_this'}) {
+ $which = \$ucd_pod{$standard}->{'info'};
+ }
+ else {
+ $which = \$info;
+ $meaning = $ucd_pod{$standard}{'meaning'};
+ }
+
+ chomp $$which;
+ $$which =~ s/\.\Z//;
+ $$which .= "; NOT '$standard' meaning '$meaning'";
+
+ $ambiguous_names{$standard} = 1;
+ }
+
+ # Use the non-perl-extension variant
+ next unless $ucd_pod{$standard}{'perl_extension'};
+ }
+
+ # Store enough information about this entry that we can later look for
+ # ambiguities, and output it properly.
+ $ucd_pod{$standard} = { 'name' => $name,
+ 'info' => $info,
+ 'meaning' => $meaning,
+ 'output_this' => $output_this,
+ 'perl_extension' => $perl_extension,
+ 'property' => $property,
+ 'status' => $alias->status,
+ };
+ } # End of looping through all this table's aliases
+
+ return;
+}
+
sub pod_alphanumeric_sort {
# Sort pod entries alphanumerically.
foreach my $file (keys %ignored_files) {
push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
}
+ foreach my $file (keys %skipped_files) {
+ push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
+ }
# Then, sort each group.
foreach my $group (keys %grouped_by_reason) {
push @unused_files, "\n$reason\n";
}
- # Generate a list of the properties whose map table we output, from the
- # global @map_properties.
- my @map_tables_actually_output;
- my $info_indent = 20; # Left column is narrower than \p{} table.
- foreach my $property (@map_properties) {
-
- # Get the path to the file; don't output any not in the standard
- # directory.
- my @path = $property->file_path;
- next if $path[0] ne $map_directory;
-
- # Don't mention map tables that are for internal-use only
- next if $property->to_output_map == $INTERNAL_MAP;
-
- shift @path; # Remove the standard name
-
- my $file = join '/', @path; # In case is in sub directory
- my $info = $property->full_name;
- my $short_name = $property->name;
- if ($info ne $short_name) {
- $info .= " ($short_name)";
- }
- foreach my $more_info ($property->description,
- $property->note,
- $property->status_info)
- {
- next unless $more_info;
- $info =~ s/\.\Z//;
- $info .= ". $more_info";
- }
- push @map_tables_actually_output, format_pod_line($info_indent,
- $file,
- $info,
- $property->status);
+ # Similarly, create the output text for the UCD section of the pod
+ my @ucd_pod;
+ foreach my $key (keys %ucd_pod) {
+ next unless $ucd_pod{$key}->{'output_this'};
+ push @ucd_pod, format_pod_line($indent_info_column,
+ $ucd_pod{$key}->{'name'},
+ $ucd_pod{$key}->{'info'},
+ $ucd_pod{$key}->{'status'},
+ );
}
# Sort alphabetically, and fold for output
- @map_tables_actually_output = sort
- pod_alphanumeric_sort @map_tables_actually_output;
- @map_tables_actually_output
- = simple_fold(\@map_tables_actually_output,
- ' ',
- $info_indent,
- $automatic_pod_indent);
-
- # Generate a list of the formats that can appear in the map tables.
- my @map_table_formats;
- foreach my $format (sort keys %map_table_formats) {
- push @map_table_formats,
- Text::Tabs::expand("$format\t$map_table_formats{$format}\n");
- }
- @map_table_formats = simple_fold(\@map_table_formats,
- ' ',
- 8,
- $automatic_pod_indent);
+ @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
+ my $ucd_pod = simple_fold(\@ucd_pod,
+ ' ',
+ $indent_info_column,
+ $automatic_pod_indent);
+ $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
+ . "\n"
+ . $ucd_pod;
local $" = "";
# Everything is ready to assemble.
though not all are enabled by default. The omitted ones are the Unihan
properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
deprecated or Unicode-internal properties. (An installation may choose to
-recompile Perl's tables to change this. See L<Unicode regular expression
+recompile Perl's tables to change this. See L<Unicode character
properties that are NOT accepted by Perl>.)
+For most purposes, access to Unicode properties from the Perl core is through
+regular expression matches, as described in the next section.
+For some special purposes, and to access the properties that are not suitable
+for regular expression matching, all the Unicode character properties that
+Perl handles are accessible via the standard L<Unicode::UCD> module, as
+described in the section L</Properties accessible through Unicode::UCD>.
+
Perl also provides some additional extensions and short-cut synonyms
for Unicode properties.
$zero_matches
-=head1 Properties not accessible through \\p{} and \\P{}
-
-A few properties are accessible in Perl via various function calls only.
-These are:
+=head1 Properties accessible through Unicode::UCD
+
+All the Unicode character properties mentioned above (except for those marked
+as for internal use by Perl) are also accessible by
+L<Unicode::UCD/prop_invlist()>.
+
+Due to their nature, not all Unicode character properties are suitable for
+regular expression matches, nor C<prop_invlist()>. The remaining
+non-provisional, non-internal ones are accessible via
+L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
+hasn't included; see L<below for which those are|/Unicode character properties
+that are NOT accepted by Perl>).
+
+For compatibility with other parts of Perl, all the single forms given in the
+table in the L<section above|/Properties accessible through \\p{} and \\P{}>
+are recognized. BUT, there are some ambiguities between some Perl extensions
+and the Unicode properties, all of which are silently resolved in favor of the
+official Unicode property. To avoid surprises, you should only use
+C<prop_invmap()> for forms listed in the table below, which omits the
+non-recommended ones. The affected forms are the Perl single form equivalents
+of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
+C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
+whose short name is C<sc>. The table indicates the current ambiguities in the
+INFO column, beginning with the word C<"NOT">.
+
+The standard Unicode properties listed below are documented in
+L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
+L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
+L<perlunicode/Other Properties>.
+
+The first column in the table is a name for the property; the second column is
+an alternative name, if any, plus possibly some annotations. The alternative
+name is the property's full name, unless that would simply repeat the first
+column, in which case the second column indicates the property's short name
+(if different). The annotations are given only in the entry for the full
+name. If a property is obsolete, etc, the entry will be flagged with the same
+characters used in the table in the L<section above|/Properties accessible
+through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
+
+$ucd_pod
+
+=head1 Properties accessible through other means
+
+Certain properties are accessible also via core function calls. These are:
Lowercase_Mapping lc() and lcfirst()
Titlecase_Mapping ucfirst()
expressions.
And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
-interpolation in double-quoted strings and regular expressions, but both
-usages require a L<use charnames;|charnames> to be specified, which also
-contains related functions viacode(), vianame(), and string_vianame().
+interpolation in double-quoted strings and regular expressions; and functions
+C<charnames::viacode()>, C<charnames::vianame()>, and
+C<charnames::string_vianame()> (which require a C<use charnames ();> to be
+specified).
+
+Finally, most properties related to decomposition are accessible via
+L<Unicode::Normalize>.
-=head1 Unicode regular expression properties that are NOT accepted by Perl
+=head1 Unicode character properties that are NOT accepted by Perl
Perl will generate an error for a few character properties in Unicode when
used in a regular expression. The non-Unihan ones are listed below, with the
C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
(C<\%Config> is available from the Config module).
-=head1 Files in the I<To> directory (for serious hackers only)
-
-All Unicode properties are really mappings (in the mathematical sense) from
-code points to their respective values. As part of its build process,
-Perl constructs tables containing these mappings for all properties that it
-deals with. Some, but not all, of these are written out into files.
-Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
-(C<%Config> is available from the C<Config> module).
-
-Perl reserves the right to change the format and even the existence of any of
-those files without notice, except the ones that were in existence prior to
-release 5.14. If those change, a deprecation cycle will be done first. These
-are:
-
-@map_tables_actually_output
-
-Each of the files in this directory defines several hash entries to help
-reading programs decipher it. One of them looks like this:
-
- \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
-
-where "NAME" is a name to indicate the property. For backwards compatibility,
-this is not necessarily the property's official Unicode name. (The "To" is
-also for backwards compatibility.) The hash entry gives the format of the
-mapping fields of the table, currently one of the following:
-
-@map_table_formats
-
-This format applies only to the entries in the main body of the table.
-Entries defined in hashes or ones that are missing from the list can have a
-different format.
-
-The value that the missing entries have is given by another SwashInfo hash
-entry line; it looks like this:
-
- \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
-
-This example line says that any Unicode code points not explicitly listed in
-the file have the value "NaN" under the property indicated by NAME. If the
-value is the special string C<< <code point> >>, it means that the value for
-any missing code point is the code point itself. This happens, for example,
-in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
-character "A", are missing because the uppercase of "A" is itself.
-
-Finally, if the file contains a hash for special case entries, its name is
-specified by an entry that looks like this:
-
- \$utf8::SwashInfo{'ToNAME'}{'specials_name'} = 'utf8::ToSpecNAME';
-
-
=head1 Other information in the Unicode data base
The Unicode data base is delivered in two different formats. The XML version
= simple_dumper(\%caseless_equivalent_to, ' ' x 4);
chomp $caseless_equivalent_to;
+ my $loose_property_to_file_of
+ = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
+ chomp $loose_property_to_file_of;
+
+ my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
+ chomp $file_to_swash_name;
+
my @heavy = <<END;
$HEADER
$INTERNAL_ONLY_HEADER
-# This file is for the use of utf8_heavy.pl
+# This file is for the use of utf8_heavy.pl and Unicode::UCD
# Maps Unicode (not Perl single-form extensions) property names in loose
# standard form to their corresponding standard names
$caseless_equivalent_to
);
+# Property names to mapping files
+\%utf8::loose_property_to_file_of = (
+$loose_property_to_file_of
+);
+
+# Files to the swash names within them.
+\%utf8::file_to_swash_name = (
+$file_to_swash_name
+);
+
1;
END
push @name, <<END;
+package charnames;
+
# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names. The following
# routines can be used to translate between name and code point and vice versa
foreach my $alias ($table->aliases) {
# Skip non-legal names
- next unless $alias->externally_ok;
+ next unless $alias->ok_as_filename;
next unless $alias->ucd;
$found_ucd = 1; # have at least one legal name
# standardized alias
foreach my $alias ($table->aliases) {
next unless $alias->ucd;
- next unless $alias->externally_ok;
+ next unless $alias->ok_as_filename;
push @{$perlprop_to_aliases{standardize($alias->name)}},
@aliases_list;
}
}
+ # Make a list of all combinations of properties/values that are suppressed.
+ # Each key of %why_suppressed is either a bare property name or has the
+ # form 'property=value'.  @suppressed receives the standardized name of
+ # every alias combination whose ->ucd is false.
+ my @suppressed;
+ foreach my $property_name (keys %why_suppressed) {
+
+     # Split off just the value, if any.  The declaration is deliberately
+     # separate from the conditional assignment: perlsyn documents the
+     # behavior of 'my $x = ... if COND' as undefined.
+     my $value_name;
+     $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
+
+     # The hash may contain properties not in this release of Unicode
+     next unless defined (my $property = property_ref($property_name));
+
+     # Find all combinations
+     foreach my $prop_alias ($property->aliases) {
+         my $prop_alias_name = standardize($prop_alias->name);
+
+         # If no =value, there's just one combination possible for this
+         if (! $value_name) {
+
+             # The property may be suppressed, but there may be a proxy for
+             # it, so it shouldn't be listed as suppressed
+             next if $prop_alias->ucd;
+             push @suppressed, $prop_alias_name;
+         }
+         else {  # Otherwise, one combination per alias of the value's table
+             foreach my $value_alias ($property->table($value_name)->aliases)
+             {
+                 next if $value_alias->ucd;
+
+                 push @suppressed, "$prop_alias_name="
+                                 . standardize($value_alias->name);
+             }
+         }
+     }
+ }
+
+ # Build @algorithm_names, the form of the algorithmic-names data that UCD
+ # consumes unmodified, from the structure below (which was designed for
+ # Name.pm).  Two things change in the conversion: an element for the
+ # Hangul syllables is spliced in at the appropriate place, and every other
+ # name gains the "-<code point>" suffix.
+ my @algorithm_names;
+ my $done_hangul = 0;
+
+ # Walk the existing ranges in order, copying each one.
+ foreach my $range (@code_points_ending_in_code_point) {
+     my $low = $range->{'low'};
+
+     # The Hangul entry goes immediately before the first range that
+     # starts above $SBase; it is inserted only once.
+     if (! $done_hangul && $low > $SBase) {
+         $done_hangul = 1;
+         my %hangul = (
+             low  => $SBase,
+             high => $SBase + $SCount - 1,
+             name => '<hangul syllable>',
+         );
+         push @algorithm_names, \%hangul;
+     }
+
+     # Copy the current entry, with the suffix appended to its name.
+     my %copy = (
+         low  => $low,
+         high => $range->{'high'},
+         name => $range->{'name'} . '-<code point>',
+     );
+     push @algorithm_names, \%copy;
+ }
+
# Serialize these structures for output.
my $loose_to_standard_value
= simple_dumper(\%loose_to_standard_value, ' ' x 4);
chomp $loose_to_standard_value;
+ my $string_property_loose_to_name
+ = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
+ chomp $string_property_loose_to_name;
+
my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
chomp $perlprop_to_aliases;
+ my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
+ chomp $prop_aliases;
+
+ my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
+ chomp $prop_value_aliases;
+
+ my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
+ chomp $suppressed;
+
+ my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
+ chomp $algorithm_names;
+
+ my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
+ chomp $ambiguous_names;
+
+ my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
+ chomp $loose_defaults;
+
my @ucd = <<END;
$HEADER
$INTERNAL_ONLY_HEADER
$loose_to_standard_value
);
+# String property loose names to standard loose name
+\%Unicode::UCD::string_property_loose_to_name = (
+$string_property_loose_to_name
+);
+
# Keys are Perl extensions in loose form; values are each one's list of
# aliases
\%Unicode::UCD::loose_perlprop_to_name = (
$perlprop_to_aliases
);
+# Keys are standard property name; values are each one's aliases
+\%Unicode::UCD::prop_aliases = (
+$prop_aliases
+);
+
+# Keys of top level are standard property name; values are keys to another
+# hash, Each one is one of the property's values, in standard form. The
+# values are that prop-val's aliases. If only one specified, the short and
+# long alias are identical.
+\%Unicode::UCD::prop_value_aliases = (
+$prop_value_aliases
+);
+
+# Ordered (by code point ordinal) list of the ranges of code points whose
+# names are algorithmically determined. Each range entry is an anonymous hash
+# of the start and end points and a template for the names within it.
+\@Unicode::UCD::algorithmic_named_code_points = (
+$algorithm_names
+);
+
+# The properties that as-is have two meanings, and which must be disambiguated
+\%Unicode::UCD::ambiguous_names = (
+$ambiguous_names
+);
+
+# Keys are the prop-val combinations which are the default values for the
+# given property, expressed in standard loose form
+\%Unicode::UCD::loose_defaults = (
+$loose_defaults
+);
+
+# All combinations of names that are suppressed.
+# This is actually for UCD.t, so it knows which properties shouldn't have
+# entries. If it got any bigger, would probably want to put it in its own
+# file to use memory only when it was needed, in testing.
+\@Unicode::UCD::suppressed_properties = (
+$suppressed
+);
+
1;
END
|| ($table == $property->table('N')
&& $property->table('Y')->is_empty));
-
- # Some tables should match everything
- my $expected_full =
- ($is_property)
- ? # All these types of map tables will be full because
- # they will have been populated with defaults
- ($type == $ENUM || $type == $FORCED_BINARY)
-
- : # A match table should match everything if its method
- # shows it should
- ($table->matches_all
-
- # The complement of an empty binary table will match
- # everything
- || $is_complement_of_empty_binary
- )
- ;
-
if ($table->is_empty) {
if ($suppress_if_empty_warn_if_not) {
Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
}
+ # Some tables should match everything
+ # $expected_full is true when this table is expected to contain every
+ # code point; the table's count is checked against it just below.
+ my $expected_full =
+ ($table->fate == $SUPPRESSED)
+ # A suppressed table is never expected to be full, whatever its type.
+ ? 0
+ : ($is_property)
+ ? # All these types of map tables will be full because
+ # they will have been populated with defaults
+ ($type == $ENUM || $type == $FORCED_BINARY)
+
+ : # A match table should match everything if its method
+ # shows it should
+ ($table->matches_all
+
+ # The complement of an empty binary table will match
+ # everything
+ || $is_complement_of_empty_binary
+ )
+ ;
+
my $count = $table->count;
if ($expected_full) {
if ($count != $MAX_UNICODE_CODEPOINTS) {
next TABLE;
}
+
if (! $is_property) {
+ make_ucd_table_pod_entries($table) if $table->property == $perl;
+
# Several things need to be done just once for each related
# group of match tables. Do them on the parent.
if ($table->parent == $table) {
# Don't write out or make references to the $perl property
next if $table == $perl;
+ make_ucd_table_pod_entries($table);
+
# There is a mapping stored of the various synonyms to the
# standardized name of the property for utf8_heavy.pl.
# Also, the pod file contains entries of the form:
my $full_property_name = $property->full_name;
my $property_name = $property->name;
my $standard_property_name = standardize($property_name);
-
+ my $standard_property_full_name
+ = standardize($full_property_name);
+
+ # We also create for Unicode::UCD a list of aliases for
+ # the property. The list starts with the property name;
+ # then its full name.
+ my @property_list;
+ my @standard_list;
+ if ( $property->fate <= $MAP_PROXIED) {
+ @property_list = ($property_name, $full_property_name);
+ @standard_list = ($standard_property_name,
+ $standard_property_full_name);
+ }
# For each synonym ...
for my $i (0 .. @property_aliases - 1) {
my $alias_standard = standardize($alias_name);
+ # Add other aliases to the list of property aliases
+ if ($property->fate <= $MAP_PROXIED
+ && ! grep { $alias_standard eq $_ } @standard_list)
+ {
+ push @property_list, $alias_name;
+ push @standard_list, $alias_standard;
+ }
# For utf8_heavy, set the mapping of the alias to the
# property
- if ($type != $STRING) {
+ if ($type == $STRING) {
+ if ($property->fate <= $MAP_PROXIED) {
+ $string_property_loose_to_name{$alias_standard}
+ = $standard_property_name;
+ }
+ }
+ else {
if (exists ($loose_property_name_of{$alias_standard}))
{
Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
}
}
+ # The list of all possible names is attached to each alias, so
+ # lookup is easy
+ if (@property_list) {
+ push @{$prop_aliases{$standard_list[0]}}, @property_list;
+ }
+
+ if ($property->fate <= $MAP_PROXIED) {
+
+     # Similarly, we create for Unicode::UCD a list of
+     # property-value aliases.
+
+     # Look at each table in the property...  (The loop variable is named
+     # distinctly from the $table used by the surrounding code, so that
+     # one is not shadowed in here.)
+     foreach my $value_table ($property->tables) {
+         my @values_list;
+         my $table_full_name = $value_table->full_name;
+         my $standard_table_full_name
+                             = standardize($table_full_name);
+         my $table_name = $value_table->name;
+         my $standard_table_name = standardize($table_name);
+
+         # The list starts with the table name and its full
+         # name.
+         push @values_list, $table_name, $table_full_name;
+
+         # We add to the table each unique alias that isn't
+         # discouraged from use.
+         foreach my $alias ($value_table->aliases) {
+             next if $alias->status
+                  && $alias->status eq $DISCOURAGED;
+             my $name = $alias->name;
+             my $standard = standardize($name);
+             next if $standard eq $standard_table_name;
+             next if $standard eq $standard_table_full_name;
+             push @values_list, $name;
+         }
+
+         # Here @values_list is a list of all the aliases for
+         # the table.  That is, all the property-values given
+         # by this table.  By agreement with Unicode::UCD,
+         # if the name and full name are identical, and there
+         # are no other names, drop the duplicate entry to save
+         # memory.
+         if (@values_list == 2
+             && $values_list[0] eq $values_list[1])
+         {
+             pop @values_list;
+         }
+
+         # To save memory, unlike the similar list for property
+         # aliases above, only the standard forms have the list.
+         # This forces an extra step of converting from input
+         # name to standard name, but the savings are
+         # considerable.  (There would be only marginal savings
+         # if we did this with the property aliases.)
+         push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
+     }
+ }
# Don't write out a mapping file if not desired.
next if ! $property->to_output_map;
push @property_aliases, map { Alias->new("Is_" . $_->name,
$_->loose_match,
$_->make_re_pod_entry,
- $_->externally_ok,
+ $_->ok_as_filename,
$_->status,
$_->ucd,
)
# others except DAge.txt (as data in an extracted file can be over-ridden by
# the non-extracted). Some other files depend on data derived from an earlier
# file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode. Mostly, it safest to order by first
+# folding requires data from Unicode. Mostly, it is safest to order by first
# version releases in (except the Jamo). DAge.txt is read before the
# extracted ones because of the rarely used feature $compare_versions. In the
# unlikely event that there were ever an extracted file that contained the Age
Input_file->new('SpecialCasing.txt', v2.1.8,
Each_Line_Handler => \&filter_special_casing_line,
Pre_Handler => \&setup_special_casing,
+ Has_Missings_Defaults => $IGNORED,
),
Input_file->new(
'LineBreak.txt', v3.0.0,
: undef,
\&filter_case_folding_line
],
+ Has_Missings_Defaults => $IGNORED,
),
Input_file->new('DCoreProperties.txt', v3.1.0,
# 5.2 changed this file
),
Input_file->new('NameAliases.txt', v5.0.0,
Property => 'Name_Alias',
- Pre_Handler => ($v_version ge v6.0.0)
- ? \&setup_v6_name_alias
+ Pre_Handler => ($v_version le v6.0.0)
+ ? \&setup_early_name_alias
: undef,
+ Each_Line_Handler => ($v_version le v6.0.0)
+ ? \&filter_early_version_name_alias_line
+ : \&filter_later_version_name_alias_line,
),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => 'Validation Tests',
Property => 'Script_Extensions',
Pre_Handler => \&setup_script_extensions,
Each_Line_Handler => \&filter_script_extensions_line,
+ Has_Missings_Defaults => (($v_version le v6.0.0)
+ ? $NO_DEFAULTS
+ : $IGNORED),
+ ),
+ # The two Indic files are actually available starting in v6.0.0, but their
+ # property values are missing from PropValueAliases.txt in that release,
+ # so that further work would have to be done to get them to work properly
+ # for that release.
+ Input_file->new('IndicMatraCategory.txt', v6.1.0,
+ Property => 'Indic_Matra_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Skip => "Provisional; for the analysis and processing of Indic scripts",
+ ),
+ Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
+ Property => 'Indic_Syllabic_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Skip => "Provisional; for the analysis and processing of Indic scripts",
),
);
# The paths are stored with relative names, and with '/' as the
# delimiter; convert to absolute on this machine
my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
- $potential_files{$full} = 1
- if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
+ $potential_files{lc $full} = 1
+ if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
}
}
}
my @unknown_input_files;
- foreach my $file (keys %potential_files) {
- next if grep { lc($file) eq lc($_) } @known_files;
+ foreach my $file (keys %potential_files) { # The keys are stored in lc
+ next if grep { $file eq lc($_) } @known_files;
# Here, the file is unknown to us. Get relative path name
$file = File::Spec->abs2rel($file);