$0 = File::Spec->canonpath($0);
my $make_test_script = 0; # ? Should we output a test script
+my $make_norm_test_script = 0; # ? Should we output a normalization test script
my $write_unchanged_files = 0; # ? Should we update the output files even if
# we don't think they have changed
my $use_directory = ""; # ? Should we chdir somewhere.
{
$make_test_script = 1;
}
+ elsif ($arg eq '-makenormtest')
+ {
+ $make_norm_test_script = 1;
+ }
elsif ($arg eq '-makelist') {
$make_list = 1;
}
-makelist : Rewrite the file list $file_list based on current setup
-annotate : Output an annotation for each character in the table files;
useful for debugging mktables, looking at diffs; but is slow,
- memory intensive; resulting tables are usable but slow and
- very large.
+ memory intensive; resulting tables are usable but are slow and
+ very large (and currently fail the Unicode::UCD.t tests).
-check A B : Executes $0 only if A and B are the same
END
}
if $v_version ge v4.1.0;
push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
if $v_version ge v6.0.0;
+push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
+ if $v_version ge v6.1.0;
# The lists below are hashes, so the key is the item in the list, and the
# value is the reason why it is in the list. This makes generation of
# Enum values for to_output_map() method in the Map_Table package.
my $EXTERNAL_MAP = 1;
my $INTERNAL_MAP = 2;
+my $OUTPUT_ADJUSTED = 3;
# To override computed values for writing the map tables for these properties.
# The default for enum map tables is to write them out, so that the Unicode
# existence is not noted in the comment.
'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+ 'Indic_Matra_Category' => "Provisional",
+ 'Indic_Syllabic_Category' => "Provisional",
+
# Don't suppress ISO_Comment, as otherwise special handling is needed
# to differentiate between it and gc=c, which can be written as 'isc',
# which is the same characters as ISO_Comment's short name.
- 'Name' => "Accessible via 'use charnames;' or Unicode::UCD::prop_invmap()",
+ 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
);
- # The following are suppressed because they were made contributory or
- # deprecated by Unicode before Perl ever thought about supporting them.
- foreach my $property ('Jamo_Short_Name',
- 'Grapheme_Link',
- 'Expands_On_NFC',
- 'Expands_On_NFD',
- 'Expands_On_NFKC',
- 'Expands_On_NFKD'
+ foreach my $property (
+
+ # The following are suppressed because they were made contributory
+ # or deprecated by Unicode before Perl ever thought about
+ # supporting them.
+ 'Jamo_Short_Name',
+ 'Grapheme_Link',
+ 'Expands_On_NFC',
+ 'Expands_On_NFD',
+ 'Expands_On_NFKC',
+ 'Expands_On_NFKD',
+
+ # The following are suppressed because they have been marked
+ # as deprecated for a sufficient amount of time
+ 'Other_Alphabetic',
+ 'Other_Default_Ignorable_Code_Point',
+ 'Other_Grapheme_Extend',
+ 'Other_ID_Continue',
+ 'Other_ID_Start',
+ 'Other_Lowercase',
+ 'Other_Math',
+ 'Other_Uppercase',
) {
$why_suppressed{$property} = $why_deprecated{$property};
}
'ReadMe.txt' => 'Documentation',
'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
- 'IndicMatraCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
- 'IndicSyllabicCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
);
+my %skipped_files; # List of files that we skip
+
### End of externally interesting definitions, except for @input_file_objects
my $HEADER=<<"EOF";
my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
# the comments at the subroutine definition.
my $UNCONDITIONALLY = 2; # Replace without conditions.
-my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
+my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
+ # already there
+my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
# already there
-my $CROAK = 5; # Die with an error if is already there
+my $CROAK = 6; # Die with an error if is already there
# Flags to give property statuses. The phrases are to remind maintainers that
# if the flag is changed, the indefinite article referring to it in the
my $HEX_FORMAT = 'x';
my $RATIONAL_FORMAT = 'r';
my $STRING_FORMAT = 's';
+my $ADJUST_FORMAT = 'a';
my $DECOMP_STRING_FORMAT = 'c';
my $STRING_WHITE_SPACE_LIST = 'sw';
$HEX_FORMAT => 'non-negative hex whole number; a code point',
$RATIONAL_FORMAT => 'rational: an integer or a fraction',
$STRING_FORMAT => 'string',
+ $ADJUST_FORMAT => 'some entries need adjustment',
$DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
$STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
);
# anonymous hash.
my @code_points_ending_in_code_point;
+# To hold Unicode's normalization test suite
+my @normalization_tests;
+
# Boolean: does this Unicode version have the hangul syllables, and are we
# writing out a table for them?
my $has_hangul_syllables = 0;
# contrast to the non_skip element, which is supposed to be used very
# temporarily for debugging. Sets 'optional' to 1. Also, files that we
# pretty much will never look at can be placed in the global
- # %ignored_files instead. Ones used here will be added to that list.
+ # %ignored_files instead. Ones used here will be added to %skipped files
main::set_access('skip', \%skip, 'c');
my %each_line_handler;
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $ignored_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr}
}
return $self;
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
# its name
if ($seen_non_extracted_non_age) {
if ($file =~ /$EXTRACTED/i) {
- Carp::my_carp_bug(join_lines(<<END
+ Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
have subtle problems
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
my $fkey = File::Spec->rel2abs($file);
- my $expecting = delete $potential_files{$fkey};
- $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
+ my $expecting = delete $potential_files{lc($fkey)};
+
Carp::my_carp("Was not expecting '$file'.") if
! $expecting
&& ! defined $handle{$addr};
|| @defaults > 2
|| ($default =~ /^</
&& $default !~ /^<code *point>$/i
- && $default !~ /^<none>$/i))
+ && $default !~ /^<none>$/i
+ && $default !~ /^<script>$/i))
{
$self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
}
elsif ($default =~ /^<code *point>$/i) {
$default = $CODE_POINT;
}
+ elsif ($default =~ /^<script>$/i) {
+
+ # Special case this one. Currently is from
+ # ScriptExtensions.txt, and means for all unlisted
+ # code points, use their Script property values.
+ # For the code points not listed in that file, the
+ # default value is 'Unknown'.
+ $default = "Unknown";
+ }
# Store them as a sub-arrays with both components.
push @{$missings{$addr}}, [ $default, $property ];
# they don't appear in documentation). Enum
main::set_access('status', \%status, 'r');
- my %externally_ok;
+ my %ok_as_filename;
# Similarly, some aliases should not be considered as usable ones for
# external use, such as file names, or we don't want documentation to
# recommend them. Boolean
- main::set_access('externally_ok', \%externally_ok, 'r');
+ main::set_access('ok_as_filename', \%ok_as_filename, 'r');
sub new {
my $class = shift;
$name{$addr} = shift;
$loose_match{$addr} = shift;
$make_re_pod_entry{$addr} = shift;
- $externally_ok{$addr} = shift;
+ $ok_as_filename{$addr} = shift;
$status{$addr} = shift;
$ucd{$addr} = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Null names are never ok externally
- $externally_ok{$addr} = 0 if $name{$addr} eq "";
+ $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
return $self;
}
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
# either a constructor or a method. If called as a method, the result
# will be a new() instance of the calling object, containing the union
# of that object with the other parameter's code points; if called as
- # a constructor, the first parameter gives the class the new object
+ # a constructor, the first parameter gives the class that the new object
# should be, and the second parameter gives the code points to go into
# it.
# In either case, there are two parameters looked at by this routine;
# just a single code point.
#
# If they are ranges, this routine doesn't make any effort to preserve
- # the range values of one input over the other. Therefore this base
- # class should not allow _union to be called from other than
+ # the range values and types of one input over the other. Therefore
+ # this base class should not allow _union to be called from other than
# initialization code, so as to prevent two tables from being added
# together where the range values matter. The general form of this
# routine therefore belongs in a derived class, but it was moved here
# to avoid duplication of code. The failure to overload this in this
# class keeps it safe.
#
+ # It does make the effort during initialization to accept tables with
+ # multiple values for the same code point, and to preserve the order
+ # of these. If there is only one input range or range set, it doesn't
+ # sort (as it should already be sorted to the desired order), and will
+ # accept multiple values per code point. Otherwise it will merge
+ # multiple values into a single one.
my $self;
my @args; # Arguments to pass to the constructor
# Accumulate all records from both lists.
my @records;
+ my $input_count = 0;
for my $arg (@args) {
#local $to_trace = 0 if main::DEBUG;
trace "argument = $arg" if main::DEBUG && $to_trace;
Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
return;
}
+
$arg = [ $arg ] if ! ref $arg;
my $type = ref $arg;
if ($type eq 'ARRAY') {
foreach my $element (@$arg) {
push @records, Range->new($element, $element);
+ $input_count++;
}
}
elsif ($arg->isa('Range')) {
push @records, $arg;
+ $input_count++;
}
elsif ($arg->can('ranges')) {
push @records, $arg->ranges;
+ $input_count++;
}
else {
my $message = "";
# Sort with the range containing the lowest ordinal first, but if
# two ranges start at the same code point, sort with the bigger range
# of the two first, because it takes fewer cycles.
- @records = sort { ($a->start <=> $b->start)
+ if ($input_count > 1) {
+ @records = sort { ($a->start <=> $b->start)
or
# if b is shorter than a, b->end will be
# less than a->end, and we want to select
# a, so want to return -1
($b->end <=> $a->end)
} @records;
+ }
my $new = $class->new(@_);
for my $set (@records) {
my $start = $set->start;
my $end = $set->end;
- my $value = $set->value;
+ my $value = $set->value;
+ my $type = $set->type;
if ($start > $new->max) {
- $new->_add_delete('+', $start, $end, $value);
+ $new->_add_delete('+', $start, $end, $value, Type => $type);
}
elsif ($end > $new->max) {
- $new->_add_delete('+', $new->max +1, $end, $value);
+ $new->_add_delete('+', $new->max +1, $end, $value,
+ Type => $type);
+ }
+ elsif ($input_count == 1) {
+ # Here, overlaps existing range, but is from a single input,
+ # so preserve the multiple values from that input.
+ $new->_add_delete('+', $start, $end, $value, Type => $type,
+ Replace => $MULTIPLE_AFTER);
}
}
# new and old values are identical, the
# replacement is skipped to save cycles
# => $IF_NOT_EQUIVALENT means to replace the existing values
- # with this one if they are not equivalent.
+ # (the default) with this one if they are not equivalent.
# Ranges are equivalent if their types are the
# same, and they are the same string; or if
# both are type 0 ranges, if their Unicode
# style when the pre-existing and replacement
# standard forms are the same, we can move to
# the modern style
- # => $MULTIPLE means that if this range duplicates an
+ # => $MULTIPLE_BEFORE means that if this range duplicates an
# existing one, but has a different value,
# don't replace the existing one, but insert
# this, one so that the same range can occur
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
+ # If this is an exact duplicate, including the
+ # value, the original will be moved to be
+ # first, before any other duplicate ranges
+ # with different values.
+ # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
+ # FIFO, so that this one is inserted after all
+ # others that currently exist. If this is an
+ # exact duplicate, including value, of an
+ # existing range, this one is discarded
+ # (leaving the existing one in its original,
+ # higher priority position
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
return;
}
+ if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
+ Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
+ }
#local $to_trace = 1 if main::DEBUG;
if ($operation eq '-') {
# Here, we have taken care of the case where $replace is $NO.
# Remember that here, r[$i-1]->end < $start <= r[$i]->end
# If inserting a multiple record, this is where it goes, before the
- # first (if any) existing one. This implies an insertion, and no
- # change to any existing ranges. Note that $i can be -1 if this new
- # range doesn't actually duplicate any existing, and comes at the
- # beginning of the list.
- if ($replace == $MULTIPLE) {
+ # first (if any) existing one if inserting LIFO. (If this is to go
+ # afterwards, FIFO, we below move the pointer to there.) These imply
+ # an insertion, and no change to any existing ranges. Note that $i
+ # can be -1 if this new range doesn't actually duplicate any existing,
+ # and comes at the beginning of the list.
+ if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
if ($start != $end) {
Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
return;
}
- # Don't add an exact duplicate, as it isn't really a multiple
+ # If the new code point is within a current range ...
if ($end >= $r->[$i]->start) {
+
+ # Don't add an exact duplicate, as it isn't really a multiple
my $existing_value = $r->[$i]->value;
my $existing_type = $r->[$i]->type;
return if $value eq $existing_value && $type eq $existing_type;
# pre-existing code point, which will again be a single code
# point range. Because 'i' likely will have changed as a
# result of these operations, we can't just continue on, but
- # do this operation recursively as well.
+ # do this operation recursively as well. If we are inserting
+ # LIFO, the pre-existing code point needs to go after the new
+ # one, so use MULTIPLE_AFTER; and vice versa.
if ($r->[$i]->start != $r->[$i]->end) {
$self->_add_delete('-', $start, $end, "");
$self->_add_delete('+', $start, $end, $value, Type => $type);
- return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE);
+ return $self->_add_delete('+',
+ $start, $end,
+ $existing_value,
+ Type => $existing_type,
+ Replace => ($replace == $MULTIPLE_BEFORE)
+ ? $MULTIPLE_AFTER
+ : $MULTIPLE_BEFORE);
+ }
+ }
+
+ # If to place this new record after, move to beyond all existing
+ # ones; but don't add this one if identical to any of them, as it
+ # isn't really a multiple. This leaves the original order, so
+ # that the current request is ignored. The reasoning is that the
+ # previous request that wanted this record to have high priority
+ # should have precedence.
+ if ($replace == $MULTIPLE_AFTER) {
+ while ($i < @$r && $r->[$i]->start == $start) {
+ return if $value eq $r->[$i]->value
+ && $type eq $r->[$i]->type;
+ $i++;
+ }
+ }
+ else {
+ # If instead we are to place this new record before any
+ # existing ones, remove any identical ones that come after it.
+ # This changes the existing order so that the new one is
+ # first, as is being requested.
+ for (my $j = $i + 1;
+ $j < @$r && $r->[$j]->start == $start;
+ $j++)
+ {
+ if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
+ splice @$r, $j, 1;
+ last; # There should only be one instance, so no
+ # need to keep looking
+ }
}
}
return @return;
}
- # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
- # delete, insert, and replace either unconditionally or if not
+ # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
+ # leaves delete, insert, and replace either unconditionally or if not
# equivalent. $i still points to the first potential affected range.
# Now find the highest range affected, which will determine the length
# parameter to splice. (The input range can span multiple existing
$j--; # $j now points to the highest affected range.
trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
- # Here, have taken care of $NO and $MULTIPLE replaces.
+ # Here, have taken care of $NO and $MULTIPLE_foo replaces.
# $j points to the highest affected range. But it can be < $i or even
# -1. These happen only if the insertion is entirely in the gap
# between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
return $self->_union($other)
},
+ '+=' => sub { my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' += '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
+
+ return $self->_union($other)
+ },
'&' => sub { my $self = shift;
my $other = shift;
for my $try_hard (0, 1) {
# Look through all the ranges for a usable code point.
- for my $set ($self->ranges) {
+ for my $set (reverse $self->ranges) {
# Try the edge cases first, starting with the end point of the
# range.
my $self = shift;
my $code_point = shift;
my $value = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ my %args = @_;
+ my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
+ Carp::carp_extra_args(\%args) if main::DEBUG && %args;
return $self->add_map($code_point, $code_point,
- $value, Replace => $MULTIPLE);
+ $value, Replace => $replace);
}
} # End of closure for package Range_Map
my %format;
# The format of the entries of the table. This is calculated from the
# data in the table (or passed in the constructor). This is an enum e.g.,
- # $STRING_FORMAT
+ # $STRING_FORMAT. It is marked protected as it should not be generally
+ # used to override calculations.
main::set_access('format', \%format, 'r', 'p_s');
sub new {
# All arguments are key => value pairs, which you can see below, most
# of which match fields documented above. Otherwise: Re_Pod_Entry,
- # Externally_Ok, and Fuzzy apply to the names of the table, and are
+ # OK_as_Filename, and Fuzzy apply to the names of the table, and are
# documented in the Alias package
return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
my $ucd = delete $args{'UCD'};
my $description = delete $args{'Description'};
- my $externally_ok = delete $args{'Externally_Ok'};
+ my $ok_as_filename = delete $args{'OK_as_Filename'};
my $loose_match = delete $args{'Fuzzy'};
my $note = delete $args{'Note'};
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
# clarity. Other routines rely on the full name being first on the
# list
$self->add_alias($full_name{$addr},
- Externally_Ok => $externally_ok,
+ OK_as_Filename => $ok_as_filename,
Fuzzy => $loose_match,
Re_Pod_Entry => $make_re_pod_entry,
Status => $status{$addr},
# Then comes the other name, if meaningfully different.
if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
$self->add_alias($name{$addr},
- Externally_Ok => $externally_ok,
+ OK_as_Filename => $ok_as_filename,
Fuzzy => $loose_match,
Re_Pod_Entry => $make_re_pod_entry,
Status => $status{$addr},
use overload
fallback => 0,
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
'!=' => \&main::_operator_not_equal,
'==' => \&main::_operator_equal,
;
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
$make_re_pod_entry = $YES unless defined $make_re_pod_entry;
- my $externally_ok = delete $args{'Externally_Ok'};
- $externally_ok = 1 unless defined $externally_ok;
+ my $ok_as_filename = delete $args{'OK_as_Filename'};
+ $ok_as_filename = 1 unless defined $ok_as_filename;
my $status = delete $args{'Status'};
$status = $NORMAL unless defined $status;
- my $ucd = delete $args{'UCD'} // 1;
+ # An internal name does not get documented, unless overridden by the
+ # input.
+ my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
$insert_position,
0,
Alias->new($name, $loose_match, $make_re_pod_entry,
- $externally_ok, $status, $ucd);
+ $ok_as_filename, $status, $ucd);
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
foreach my $alias ($self->aliases()) {
# Don't use an alias that isn't ok to use for an external name.
- next if ! $alias->externally_ok;
+ next if ! $alias->ok_as_filename;
my $name = main::Standardize($alias->name);
trace $self, $name if main::DEBUG && $to_trace;
# Write a representation of the table to its file. It calls several
# functions furnished by sub-classes of this abstract base class to
# handle non-normal ranges, to add stuff before the table, and at its
- # end.
+ # end. If the table is to be written so that adjustments are
+ # required, this does that conversion.
my $self = shift;
+ my $use_adjustments = shift; # ? output in adjusted format or not
my $tab_stops = shift; # The number of tab stops over to put any
# comment.
my $suppress_value = shift; # Optional, if the value associated with
if ($annotate) {
- # if annotating each code point, must print 1 per line.
+ # If annotating each code point, must print 1 per line.
# The variable could point to a subroutine, and we don't want
# to lose that fact, so only set if not set already
$range_size_1 = 1 if ! $range_size_1;
);
}
+ # Values for previous time through the loop. Initialize to
+ # something that won't be adjacent to the first iteration;
+ # only $previous_end matters for that.
+ my $previous_start;
+ my $previous_end = -2;
+ my $previous_value;
+
+ # Values for next time through the portion of the loop that splits
+ # the range. 0 in $next_start means there is no remaining portion
+ # to deal with.
+ my $next_start = 0;
+ my $next_end;
+ my $next_value;
+ my $offset = 0;
+
# Output each range as part of the here document.
RANGE:
for my $set ($range_list{$addr}->ranges) {
next RANGE if defined $suppress_value
&& $value eq $suppress_value;
- # If there is a range and doesn't need a single point range
- # output
- if ($start != $end && ! $range_size_1) {
- push @OUT, sprintf "%04X\t%04X", $start, $end;
- $OUT[-1] .= "\t$value" if $value ne "";
-
- # Add a comment with the size of the range, if requested.
- # Expand Tabs to make sure they all start in the same
- # column, and then unexpand to use mostly tabs.
- if (! $output_range_counts{$addr}) {
- $OUT[-1] .= "\n";
+ { # This bare block encloses the scope where we may need to
+ # split a range (when outputting adjusteds), and each time
+ # through we handle the next portion of the original by
+ # ending the block with a 'redo'. The values to use for
+ # that next time through are set up just below in the
+ # scalars whose names begin with '$next_'.
+
+ if ($use_adjustments) {
+
+ # When converting to use adjustments, we can handle
+ # only single element ranges. Set up so that this
+ # time through the loop, we look at the first element,
+ # and the next time through, we start off with the
+ # remainder. Thus each time through we look at the
+ # first element of the range
+ if ($end != $start) {
+ $next_start = $start + 1;
+ $next_end = $end;
+ $next_value = $value;
+ $end = $start;
+ }
+
+ # The values for some of these tables are stored as
+ # hex strings. Convert those to decimal
+ $value = hex($value)
+ if $self->default_map eq $CODE_POINT
+ && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
+
+ # If this range is adjacent to the previous one, and
+ # the values in each are integers that are also
+ # adjacent (differ by 1), then this range really
+ # extends the previous one that is already in element
+ # $OUT[-1]. So we pop that element, and pretend that
+ # the range starts with whatever it started with.
+ # $offset is incremented by 1 each time so that it
+ # gives the current offset from the first element in
+ # the accumulating range, and we keep in $value the
+ # value of that first element.
+ if ($start == $previous_end + 1
+ && $value =~ /^ -? \d+ $/xa
+ && $previous_value =~ /^ -? \d+ $/xa
+ && ($value == ($previous_value + ++$offset)))
+ {
+ pop @OUT;
+ $start = $previous_start;
+ $value = $previous_value;
+ }
+ else {
+ $offset = 0;
+ }
+
+ # Save the current values for the next time through
+ # the loop.
+ $previous_start = $start;
+ $previous_end = $end;
+ $previous_value = $value;
}
- else {
- $OUT[-1] = Text::Tabs::expand($OUT[-1]);
- my $count = main::clarify_number($end - $start + 1);
- use integer;
-
- my $width = $tab_stops * 8 - 1;
- $OUT[-1] = sprintf("%-*s # [%s]\n",
- $width,
- $OUT[-1],
- $count);
- $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
+
+ # If there is a range and doesn't need a single point range
+ # output
+ if ($start != $end && ! $range_size_1) {
+ push @OUT, sprintf "%04X\t%04X", $start, $end;
+ $OUT[-1] .= "\t$value" if $value ne "";
+
+ # Add a comment with the size of the range, if
+ # requested. Expand Tabs to make sure they all start
+ # in the same column, and then unexpand to use mostly
+ # tabs.
+ if (! $output_range_counts{$addr}) {
+ $OUT[-1] .= "\n";
+ }
+ else {
+ $OUT[-1] = Text::Tabs::expand($OUT[-1]);
+ my $count = main::clarify_number($end - $start + 1);
+ use integer;
+
+ my $width = $tab_stops * 8 - 1;
+ $OUT[-1] = sprintf("%-*s # [%s]\n",
+ $width,
+ $OUT[-1],
+ $count);
+ $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
+ }
}
- next RANGE;
- }
- # Here to output a single code point per line
+ # Here to output a single code point per line.
+ # If not to annotate, use the simple formats
+ elsif (! $annotate) {
- # If not to annotate, use the simple formats
- if (! $annotate) {
+ # Use any passed in subroutine to output.
+ if (ref $range_size_1 eq 'CODE') {
+ for my $i ($start .. $end) {
+ push @OUT, &{$range_size_1}($i, $value);
+ }
+ }
+ else {
- # Use any passed in subroutine to output.
- if (ref $range_size_1 eq 'CODE') {
- for my $i ($start .. $end) {
- push @OUT, &{$range_size_1}($i, $value);
+ # Here, caller is ok with default output.
+ for (my $i = $start; $i <= $end; $i++) {
+ push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
+ }
}
}
else {
- # Here, caller is ok with default output.
+ # Here, wants annotation.
for (my $i = $start; $i <= $end; $i++) {
- push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
- }
- }
- next RANGE;
- }
- # Here, wants annotation.
- for (my $i = $start; $i <= $end; $i++) {
-
- # Get character information if don't have it already
- main::populate_char_info($i)
- if ! defined $viacode[$i];
- my $type = $annotate_char_type[$i];
-
- # Figure out if should output the next code points as part
- # of a range or not. If this is not in an annotation
- # range, then won't output as a range, so returns $i.
- # Otherwise use the end of the annotation range, but no
- # further than the maximum possible end point of the loop.
- my $range_end = main::min($annotate_ranges->value_of($i)
- || $i,
- $end);
-
- # Use a range if it is a range, and either is one of the
- # special annotation ranges, or the range is at most 3
- # long. This last case causes the algorithmically named
- # code points to be output individually in spans of at
- # most 3, as they are the ones whose $type is > 0.
- if ($range_end != $i
- && ( $type < 0 || $range_end - $i > 2))
- {
- # Here is to output a range. We don't allow a
- # caller-specified output format--just use the
- # standard one.
- push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
+ # Get character information if don't have it already
+ main::populate_char_info($i)
+ if ! defined $viacode[$i];
+ my $type = $annotate_char_type[$i];
+
+ # Figure out if should output the next code points
+ # as part of a range or not. If this is not in an
+ # annotation range, then won't output as a range,
+ # so returns $i. Otherwise use the end of the
+ # annotation range, but no further than the
+ # maximum possible end point of the loop.
+ my $range_end = main::min(
+ $annotate_ranges->value_of($i) || $i,
+ $end);
+
+ # Use a range if it is a range, and either is one
+ # of the special annotation ranges, or the range
+ # is at most 3 long. This last case causes the
+ # algorithmically named code points to be output
+ # individually in spans of at most 3, as they are
+ # the ones whose $type is > 0.
+ if ($range_end != $i
+ && ( $type < 0 || $range_end - $i > 2))
+ {
+ # Here is to output a range. We don't allow a
+ # caller-specified output format--just use the
+ # standard one.
+ push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
$range_end,
$value;
- my $range_name = $viacode[$i];
-
- # For the code points which end in their hex value, we
- # eliminate that from the output annotation, and
- # capitalize only the first letter of each word.
- if ($type == $CP_IN_NAME) {
- my $hex = sprintf "%04X", $i;
- $range_name =~ s/-$hex$//;
- my @words = split " ", $range_name;
- for my $word (@words) {
- $word = ucfirst(lc($word)) if $word ne 'CJK';
- }
- $range_name = join " ", @words;
- }
- elsif ($type == $HANGUL_SYLLABLE) {
- $range_name = "Hangul Syllable";
- }
+ my $range_name = $viacode[$i];
+
+ # For the code points which end in their hex
+ # value, we eliminate that from the output
+ # annotation, and capitalize only the first
+ # letter of each word.
+ if ($type == $CP_IN_NAME) {
+ my $hex = sprintf "%04X", $i;
+ $range_name =~ s/-$hex$//;
+ my @words = split " ", $range_name;
+ for my $word (@words) {
+ $word =
+ ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $range_name = join " ", @words;
+ }
+ elsif ($type == $HANGUL_SYLLABLE) {
+ $range_name = "Hangul Syllable";
+ }
- $OUT[-1] .= " $range_name" if $range_name;
+ $OUT[-1] .= " $range_name" if $range_name;
- # Include the number of code points in the range
- my $count = main::clarify_number($range_end - $i + 1);
- $OUT[-1] .= " [$count]\n";
+ # Include the number of code points in the
+ # range
+ my $count =
+ main::clarify_number($range_end - $i + 1);
+ $OUT[-1] .= " [$count]\n";
- # Skip to the end of the range
- $i = $range_end;
- }
- else { # Not in a range.
- my $comment = "";
-
- # When outputting the names of each character, use
- # the character itself if printable
- $comment .= "'" . chr($i) . "' " if $printable[$i];
-
- # To make it more readable, use a minimum indentation
- my $comment_indent;
-
- # Determine the annotation
- if ($format eq $DECOMP_STRING_FORMAT) {
-
- # This is very specialized, with the type of
- # decomposition beginning the line enclosed in
- # <...>, and the code points that the code point
- # decomposes to separated by blanks. Create two
- # strings, one of the printable characters, and
- # one of their official names.
- (my $map = $value) =~ s/ \ * < .*? > \ +//x;
- my $tostr = "";
- my $to_name = "";
- my $to_chr = "";
- foreach my $to (split " ", $map) {
- $to = CORE::hex $to;
- $to_name .= " + " if $to_name;
- $to_chr .= chr($to);
- main::populate_char_info($to)
- if ! defined $viacode[$to];
- $to_name .= $viacode[$to];
+ # Skip to the end of the range
+ $i = $range_end;
}
+ else { # Not in a range.
+ my $comment = "";
+
+ # When outputting the names of each character,
+ # use the character itself if printable
+ $comment .= "'" . chr($i) . "' "
+ if $printable[$i];
+
+ # To make it more readable, use a minimum
+ # indentation
+ my $comment_indent;
+
+ # Determine the annotation
+ if ($format eq $DECOMP_STRING_FORMAT) {
+
+ # This is very specialized, with the type
+ # of decomposition beginning the line
+ # enclosed in <...>, and the code points
+ # that the code point decomposes to
+ # separated by blanks. Create two
+ # strings, one of the printable
+ # characters, and one of their official
+ # names.
+ (my $map = $value) =~ s/ \ * < .*? > \ +//x;
+ my $tostr = "";
+ my $to_name = "";
+ my $to_chr = "";
+ foreach my $to (split " ", $map) {
+ $to = CORE::hex $to;
+ $to_name .= " + " if $to_name;
+ $to_chr .= chr($to);
+ main::populate_char_info($to)
+ if ! defined $viacode[$to];
+ $to_name .= $viacode[$to];
+ }
- $comment .=
+ $comment .=
"=> '$to_chr'; $viacode[$i] => $to_name";
- $comment_indent = 25; # Determined by experiment
- }
- else {
-
- # Assume that any table that has hex format is a
- # mapping of one code point to another.
- if ($format eq $HEX_FORMAT) {
- my $decimal_value = CORE::hex $value;
- main::populate_char_info($decimal_value)
+ $comment_indent = 25; # Determined by
+ # experiment
+ }
+ else {
+
+ # Assume that any table that has hex
+ # format is a mapping of one code point to
+ # another.
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ main::populate_char_info($decimal_value)
if ! defined $viacode[$decimal_value];
- $comment .= "=> '"
- . chr($decimal_value)
- . "'; " if $printable[$decimal_value];
- }
- $comment .= $viacode[$i] if $include_name
- && $viacode[$i];
- if ($format eq $HEX_FORMAT) {
- my $decimal_value = CORE::hex $value;
- $comment .= " => $viacode[$decimal_value]"
- if $viacode[$decimal_value];
- }
+ $comment .= "=> '"
+ . chr($decimal_value)
+ . "'; " if $printable[$decimal_value];
+ }
+ $comment .= $viacode[$i] if $include_name
+ && $viacode[$i];
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ $comment .=
+ " => $viacode[$decimal_value]"
+ if $viacode[$decimal_value];
+ }
- # If including the name, no need to indent, as the
- # name will already be way across the line.
- $comment_indent = ($include_name) ? 0 : 60;
- }
+ # If including the name, no need to
+ # indent, as the name will already be way
+ # across the line.
+ $comment_indent = ($include_name) ? 0 : 60;
+ }
- # Use any passed in routine to output the base part of
- # the line.
- if (ref $range_size_1 eq 'CODE') {
- my $base_part = &{$range_size_1}($i, $value);
- chomp $base_part;
- push @OUT, $base_part;
- }
- else {
- push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ # Use any passed in routine to output the base
+ # part of the line.
+ if (ref $range_size_1 eq 'CODE') {
+ my $base_part=&{$range_size_1}($i, $value);
+ chomp $base_part;
+ push @OUT, $base_part;
+ }
+ else {
+ push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ }
+
+ # And add the annotation.
+ $OUT[-1] = sprintf "%-*s\t# %s",
+ $comment_indent,
+ $OUT[-1],
+ $comment
+ if $comment;
+ $OUT[-1] .= "\n";
+ }
}
+ }
- # And add the annotation.
- $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
- $OUT[-1],
- $comment if $comment;
- $OUT[-1] .= "\n";
+ # If we split the range, set up so the next time through
+ # we get the remainder, and redo.
+ if ($next_start) {
+ $start = $next_start;
+ $end = $next_end;
+ $value = $next_value;
+ $next_start = 0;
+ redo;
}
}
} # End of loop through all the table's ranges
'readable_array');
my %to_output_map;
- # Enum as to whether or not to write out this map table:
+ # Enum as to whether or not to write out this map table, and how:
# 0 don't output
# $EXTERNAL_MAP means its existence is noted in the documentation, and
# it should not be removed nor its format changed. This
# output.
# $INTERNAL_MAP means Perl reserves the right to do anything it wants
# with this file
+ # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
+ # outputting the actual mappings as-is, we adjust things
+ # to create a much more compact table. Only those few
+ # tables where the mapping is convertible at least to an
+ # integer and compacting makes a big difference should
+ # have this. Hence, the default is to not do this
+ # unless the table's default mapping is to $CODE_POINT,
+ # and the range size is not 1.
main::set_access('to_output_map', \%to_output_map, 's');
-
sub new {
my $class = shift;
my $name = shift;
my $default_map = delete $args{'Default_Map'};
my $property = delete $args{'_Property'};
my $full_name = delete $args{'Full_Name'};
+ my $to_output_map = delete $args{'To_Output_Map'};
# Rest of parameters passed on
$anomalous_entries{$addr} = [];
$default_map{$addr} = $default_map;
+ $to_output_map{$addr} = $to_output_map;
$self->initialize($initialize) if defined $initialize;
# Don't want to output binary map tables even for debugging.
return 0 if $type == $BINARY;
- # But do want to output string ones.
- return $EXTERNAL_MAP if $type == $STRING;
+ # But do want to output string ones. All the ones that remain to
+ # be dealt with (i.e. which haven't explicitly been set to external)
+ # are for internal Perl use only. The default for those that map to
+ # $CODE_POINT and haven't been restricted to a single element range
+ # is to use the adjusted form.
+ if ($type == $STRING) {
+ return $INTERNAL_MAP if $self->range_size_1
+ || $default_map{$addr} ne $CODE_POINT;
+ return $OUTPUT_ADJUSTED;
+ }
# Otherwise is an $ENUM, do output it, for Perl's purposes
return $INTERNAL_MAP;
my $return = $self->SUPER::header();
- if ($self->to_output_map == $INTERNAL_MAP) {
+ if ($self->to_output_map >= $INTERNAL_MAP) {
$return .= $INTERNAL_ONLY_HEADER;
}
else {
- my $property_name = $self->property->full_name;
+ my $property_name = $self->property->full_name =~ s/Legacy_//r;
$return .= <<END;
# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
# have our own flag for just this purpose; but it works now to exclude
# Perl generated synonyms from the lists for properties, where the
# name is always the proper Unicode one.
- my @property_aliases = grep { $_->externally_ok } $self->aliases;
+ my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
my $count = $self->count;
my $default_map = $default_map{$addr};
my $comment = "";
my $status = $self->status;
- if ($status) {
+ if ($status && $status ne $PLACEHOLDER) {
my $warn = uc $status_past_participles{$status};
$comment .= <<END;
}
$comment .= "This file returns the $mapping:\n";
+ my $ucd_accessible_name = "";
+ my $full_name = $self->property->full_name;
for my $i (0 .. @property_aliases - 1) {
- $comment .= sprintf("%-8s%s\n",
- " ",
- $property_aliases[$i]->name . '(cp)'
- );
+ my $name = $property_aliases[$i]->name;
+ $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
+ if ($property_aliases[$i]->ucd) {
+ if ($name eq $full_name) {
+ $ucd_accessible_name = $full_name;
+ }
+ elsif (! $ucd_accessible_name) {
+ $ucd_accessible_name = $name;
+ }
+ }
+ }
+ $comment .= "\nwhere 'cp' is $cp.";
+ if ($ucd_accessible_name) {
+ $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
}
- my $full_name = $self->property->full_name;
- $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
# And append any commentary already set from the actual property.
$comment .= "\n\n" . $self->comment if $self->comment;
return unless defined $name;
if (defined $swash_keys{$name}) {
- Carp::my_carp(join_lines(<<END
+ Carp::my_carp(main::join_lines(<<END
Already created a swash name '$name' for $swash_keys{$name}. This means that
the same name desired for $self shouldn't be used. Bad News. This must be
fixed before production use, but proceeding anyway
my $format = $self->format;
- my $return = <<END;
+ my $return = "";
+
+ my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
+ if ($output_adjusted) {
+ if ($specials_name) {
+ $return .= <<END;
+# The mappings in the non-hash portion of this file must be modified to get the
+# correct values by adding the code point ordinal number to each one that is
+# numeric.
+END
+ }
+ else {
+ $return .= <<END;
+# The mappings must be modified to get the correct values by adding the code
+# point ordinal number to each one that is numeric.
+END
+ }
+ }
+
+ $return .= <<END;
+
# The name this swash is to be known by, with the format of the mappings in
# the main body of the table, and what all code points missing from this file
# map to.
\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
END
if ($specials_name) {
- $return .= <<END;
+ $return .= <<END;
\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
END
}
my $default_map = $default_map{$addr};
- $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
+
+ # For $CODE_POINT default maps and using adjustments, instead the default
+ # becomes zero.
+ $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
+ . (($output_adjusted && $default_map eq $CODE_POINT)
+ ? "0"
+ : $default_map)
+ . "';";
if ($default_map eq $CODE_POINT) {
$return .= ' # code point maps to itself';
if $format eq $FLOAT_FORMAT
&& $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
$format = $HEX_FORMAT
- if $format eq $RATIONAL_FORMAT
- && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
+ if ($format eq $RATIONAL_FORMAT
+ && $map !~
+ m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
+ # Assume a leading zero means hex,
+ # even if all digits are 0-9
+ || ($format eq $INTEGER_FORMAT
+ && $map =~ /^0[0-9A-F]/);
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
}
Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
}
- $self->_set_format($format);
+ # If the output is to be adjusted, the format of the table that gets
+ # output is actually 'a' instead of whatever it is stored internally
+ # as.
+ my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
+ if ($output_adjusted) {
+ $format = $ADJUST_FORMAT;
+ }
- # Core Perl has a different definition of mapping ranges than we do,
- # that is applicable mainly to mapping code points, so for tables
- # where it is possible that core Perl could be used to read it,
- # make it range size 1 to prevent possible confusion
- $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
+ $self->_set_format($format);
return $self->SUPER::write(
+ $output_adjusted,
($self->property == $block)
? 7 # block file needs more tab stops
: 3,
'+=' => sub {
my $self = shift;
my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' += '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
return if $self->carp_if_locked;
}
return $self;
},
+ '&=' => sub {
+ my $self = shift;
+ my $other = shift;
+
+ return if $self->carp_if_locked;
+ $self->_set_range_list($self->_range_list & $other);
+ return $self;
+ },
'-' => sub { my $self = shift;
my $other = shift;
my $reversed = shift;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- return $self->SUPER::write(2); # 2 tab stops
+ return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
}
sub set_final_comment {
# \p{}'s
my @global_comments; # List of all the tables' comments that are
# there before this routine was called.
+ my $has_ucd_alias = 0; # If there is an alias that is accessible via
+ # Unicode::UCD. If not, then don't say it is
+ # in the comment
# Get list of all the parent tables that are equivalent to this one
# (including itself).
[$i % @table_aliases];
my $table_alias = $table_alias_object->name;
my $loose_match = $table_alias_object->loose_match;
+ $has_ucd_alias |= $table_alias_object->ucd;
if ($table_alias !~ /\D/) { # Clarify large numbers.
$table_alias = main::clarify_number($table_alias)
my $flag = $property->status
|| $table->status
|| $table_alias_object->status;
- $flags{$flag} = $status_past_participles{$flag} if $flag;
+ if ($flag && $flag ne $PLACEHOLDER) {
+ $flags{$flag} = $status_past_participles{$flag};
+ }
$loose_count++;
$any_of_these = 'any of these'
}
- my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
+ my $comment = "";
+ if ($has_ucd_alias) {
+ $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
+ }
if ($has_unrelated) {
$comment .= <<END;
This file is for tables that are not necessarily related: To conserve
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
'==' => \&main::_operator_equal,
'!=' => \&main::_operator_not_equal,
'=' => sub { return shift },
set_default_map
set_file_path
set_final_comment
+ _set_format
set_range_size_1
set_status
set_to_output_map
{ # Closure
- my $indent_increment = " " x 2;
+ my $indent_increment = " " x (($debugging_build) ? 2 : 0);
my %already_output;
$main::simple_dumper_nesting = 0;
my $item = shift;
my $indent = shift;
- $indent = "" if ! defined $indent;
+ $indent = "" if ! $debugging_build || ! defined $indent;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Indent array elements one level
$output .= &simple_dumper($item->[$i], $next_indent);
+ next if ! $debugging_build;
$output =~ s/\n$//; # Remove any trailing nl so
$output .= " # [$i]\n"; # as to add a comment giving
# the array index
: "$self$other";
}
+sub _operator_dot_equal {
+ # Overloaded '.=' method that is common to all packages.
+
+ my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ $other = "" unless defined $other;
+
+ if ($reversed) {
+ return $other .= "$self";
+ }
+ else {
+ return "$self" . "$other";
+ }
+}
+
sub _operator_equal {
# Generic overloaded '==' routine. To be equal, they must be the exact
# same object
}
}
- # This entry is still missing as of 6.0, perhaps because no short name for
- # it.
- if (-e 'NameAliases.txt') {
- my $aliases = property_ref('Name_Alias');
- if (! defined $aliases) {
- $aliases = Property->new('Name_Alias');
- }
- }
-
# These are used so much, that we set globals for them.
$gc = property_ref('General_Category');
$block = property_ref('Block');
# Perl adds this alias.
$gc->add_alias('Category');
- # For backwards compatibility, these property files have particular names.
- property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
- # utf8.c calls it
- property_ref('Lowercase_Mapping')->set_file('Lower');
- property_ref('Titlecase_Mapping')->set_file('Title');
-
- my $fold = property_ref('Case_Folding');
- $fold->set_file('Fold') if defined $fold;
-
# Unicode::Normalize expects this file with this name and directory.
my $ccc = property_ref('Canonical_Combining_Class');
if (defined $ccc) {
$ccc->set_directory(File::Spec->curdir());
}
- # utf8.c has a different meaning for non range-size-1 for map properties
- # that this program doesn't currently handle; and even if it were changed
- # to do so, some other code may be using them expecting range size 1.
- foreach my $property (qw {
- Case_Folding
- Lowercase_Mapping
- Titlecase_Mapping
- Uppercase_Mapping
- })
- {
- property_ref($property)->set_range_size_1(1);
- }
-
# These two properties aren't actually used in the core, but unfortunately
# the names just above that are in the core interfere with these, so
# choose different names. These aren't a problem unless the map tables
$urs->add_alias('kRSUnicode');
}
}
+
+ # For backwards compatibility with applications that may read the mapping
+ # file directly (it was documented in 5.12 and 5.14 as being thusly
+ # usable), keep it from being adjusted. (range_size_1 is
+ # used to force the traditional format.)
+ if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
+ $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
+ $nfkc_cf->set_range_size_1(1);
+ }
+ if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
+ $bmg->set_to_output_map($EXTERNAL_MAP);
+ $bmg->set_range_size_1(1);
+ }
+
+ property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
+
return;
}
# Process each line of the file ...
while ($file->next_line) {
+ # Fix typo in input file
+ s/CCC133/CCC132/g if $v_version eq v6.1.0;
+
my ($property, @data) = split /\s*;\s*/;
# The ccc property has an extra field at the beginning, which is the
return @return;
}
+sub process_NormalizationsTest {
+
+ # Each line looks like:
+ # source code point; NFC; NFD; NFKC; NFKD
+ # e.g.
+ # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
+
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # Process each line of the file ...
+ while ($file->next_line) {
+
+ next if /^@/;
+
+ my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
+
+ foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
+ $$var = pack "U0U*", map { hex } split " ", $$var;
+ $$var =~ s/(\\)/$1$1/g;
+ }
+
+ push @normalization_tests,
+ "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
+ } # End of looping through the file
+}
+
sub output_perl_charnames_line ($$) {
# Output the entries in Perl_charnames specially, using 5 digits instead
my $input_field_count = $i;
# This routine in addition outputs these extra fields:
+
my $DECOMP_TYPE = $i++; # Decomposition type
# These fields are modifications of ones above, and are usually
# the code point and name on each line. This was actually the hardest
# thing to design around. The code points in those ranges may actually
# have real maps not given by these two lines. These maps will either
- # be algorithmically determinable, or in the extracted files furnished
+ # be algorithmically determinable, or be in the extracted files furnished
# with the UCD. In the event of conflicts between these extracted files,
# and this one, Unicode says that this one prevails. But it shouldn't
# prevail for conflicts that occur in these ranges. The data from the
Range_Size_1 => \&output_perl_charnames_line,
Type => $STRING,
);
- $perl_charname->set_proxy_for('Name', 'Name_Alias');
+ $perl_charname->set_proxy_for('Name');
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
Directory => File::Spec->curdir(),
# body of the table
Map_Type => $COMPUTE_NO_MULTI_CP,
Type => $STRING,
+ To_Output_Map => $INTERNAL_MAP,
);
$Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
$Perl_decomp->add_comment(join_lines(<<END
my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
Default_Map => "",
Perl_Extension => 1,
- File => 'Digit', # Trad. location
Directory => $map_directory,
Type => $STRING,
- Range_Size_1 => 1,
+ To_Output_Map => $OUTPUT_ADJUSTED,
);
$Decimal_Digit->add_comment(join_lines(<<END
This file gives the mapping of all code points which represent a single
-decimal digit [0-9] to their respective digits. For example, the code point
-U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
-that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
-numerals.
+decimal digit [0-9] to their respective digits, but it has ranges of 10 code
+points, and the mapping of each non-initial element of each range is actually
+not to "0", but to the offset that element has from its corresponding DIGIT 0.
+These code points are those that have Numeric_Type=Decimal; not special
+things, like subscripts nor Roman numerals.
END
));
$file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
$file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
+ $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
$fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
}
elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
# Some code points in this file have the pseudo-name
# '<control>', but the official name for such ones is the null
- # string. For charnames.pm, we use the Unicode version 1 name
- $fields[$NAME] = "";
- $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
+ # string.
+ $fields[$NAME] = $fields[$CHARNAME] = "";
# We had better not be in between range lines.
if ($in_range) {
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Flush the buffers.
- foreach my $i (1 .. $last_field) {
+ foreach my $i (0 .. $last_field) {
$file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
}
# http://www.unicode.org/versions/corrigendum8.html
$fields[$BIDI] = "AL";
}
- elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
+ elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
$fields[$CHARNAME] = "";
}
# simple ones are in UnicodeData.txt, which should already have been
# read in to the full property data structures, so as to initialize
# these with the simple ones. Then the SpecialCasing.txt entries
- # overwrite the ones which have different full mappings.
+ # add or overwrite the ones which have different full mappings.
# This routine sees if the simple mappings are to be output, and if
# so, copies what has already been put into the full mapping tables,
# relatively few entries in them that have different full mappings,
# and thus skip the simple mapping tables altogether.
- # New tables with just the simple mappings that are overridden by the
- # full ones are constructed. These are for Unicode::UCD, which
- # requires the simple mappings. The Case_Folding table is a combined
- # table of both the simple and full mappings, with the full ones being
- # in the hash, and the simple ones, even those overridden by the hash,
- # being in the base table. That same mechanism could have been
- # employed here, except that the docs have said that the generated
- # files are usuable directly by programs, so we dare not change the
- # format in any way.
-
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
$uc = property_ref('uc');
# For each of the case change mappings...
- foreach my $case_table ($lc, $tc, $uc) {
- my $case = $case_table->name;
- my $full = property_ref($case);
- unless (defined $full && ! $full->is_empty) {
+ foreach my $full_table ($lc, $tc, $uc) {
+ my $full_name = $full_table->name;
+ unless (defined $full_table && ! $full_table->is_empty) {
Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
}
- # The simple version's name in each mapping merely has an 's' in
- # front of the full one's
- my $simple_name = 's' . $case;
- my $simple = property_ref($simple_name);
- $simple->initialize($full) if $simple->to_output_map();
+ # Create a table in the old-style format and with the original
+ # file name for backwards compatibility with applications that
+ # read it directly. The new tables contain both the simple and
+ # full maps, and the old are missing simple maps when there is a
+ # conflicting full one. Probably it would have been ok to add
+ # those to the legacy version, as was already done in 5.14 to the
+ # case folding one, but this was not done, out of an abundance of
+ # caution. The tables are set up here before we deal with the
+ # full maps so that as we handle those, we can override the simple
+ # maps for them in the legacy table, and merely add them in the
+ # new-style one.
+ my $legacy = Property->new("Legacy_" . $full_table->full_name,
+ File => $full_table->full_name =~
+ s/case_Mapping//r,
+ Range_Size_1 => 1,
+ Format => $HEX_FORMAT,
+ Default_Map => $CODE_POINT,
+ UCD => 0,
+ Initialize => $full_table,
+ To_Output_Map => $EXTERNAL_MAP,
+ );
- my $simple_only = Property->new("_s$case",
- Type => $STRING,
- Default_Map => $CODE_POINT,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
- $simple_only->set_to_output_map($INTERNAL_MAP);
- $simple_only->add_comment(join_lines( <<END
-This file is for UCD.pm so that it can construct simple mappings that would
-otherwise be lost because they are overridden by full mappings.
+ $full_table->add_comment(join_lines( <<END
+This file includes both the simple and full case changing maps. The simple
+ones are in the main body of the table below, and the full ones adding to or
+overriding them are in the hash.
END
));
+ # The simple version's name in each mapping merely has an 's' in
+ # front of the full one's
+ my $simple_name = 's' . $full_name;
+ my $simple = property_ref($simple_name);
+ $simple->initialize($full_table) if $simple->to_output_map();
+
unless ($simple->to_output_map()) {
- $simple_only->set_proxy_for($simple_name);
+ $full_table->set_proxy_for($simple_name);
}
}
return;
}
- $_ = "$fields[0]; lc; $fields[1]";
- $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
- $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
+ my $decimal_code_point = hex $fields[0];
- # Copy any simple case change to the special tables constructed if
- # being overridden by a multi-character case change.
- if ($fields[1] ne $fields[0]
- && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _slc; $value");
- }
- if ($fields[2] ne $fields[0]
- && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _stc; $value");
- }
- if ($fields[3] ne $fields[0]
- && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _suc; $value");
+ # Loop to handle each of the three mappings in the input line, in
+ # order, with $i indicating the current field number.
+ my $i = 0;
+ for my $object ($lc, $tc, $uc) {
+ $i++; # First time through, $i = 0 ... 3rd time = 3
+
+ my $value = $object->value_of($decimal_code_point);
+ $value = ($value eq $CODE_POINT)
+ ? $decimal_code_point
+ : hex $value;
+
+ # If this isn't a multi-character mapping, it should already have
+ # been read in.
+ if ($fields[$i] !~ / /) {
+ if ($value != hex $fields[$i]) {
+ Carp::my_carp("Bad news. UnicodeData.txt thinks "
+ . $object->name
+ . "(0x$fields[0]) is $value"
+ . " and SpecialCasing.txt thinks it is "
+ . hex($fields[$i])
+ . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
+ }
+ }
+ else {
+
+ # The mapping goes into both the legacy table, in which it
+ # replaces the simple one...
+ $file->insert_adjusted_lines("$fields[0]; Legacy_"
+ . $object->full_name
+ . "; $fields[$i]");
+
+ # ... and, the The regular table, in which it is additional,
+ # beyond the simple mapping.
+ $file->insert_adjusted_lines("$fields[0]; "
+ . $object->name
+ . "; "
+ . $CMD_DELIM
+ . "$REPLACE_CMD=$MULTIPLE_BEFORE"
+ . $CMD_DELIM
+ . $fields[$i]);
+ }
}
+ # Everything has been handled by the insert_adjusted_lines()
+ $_ = "";
+
return;
}
}
# Create the map for simple only if are going to output it, for otherwise
# it takes no part in anything we do.
my $to_output_simple;
+ my $non_final_folds;
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
}
+ $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
+ );
+
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
property_ref('Case_Folding')->add_comment(join_lines( <<END
# so that _swash_inversion_hash() is able to construct closures
# without having to worry about F mappings.
if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
- $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
+ $_ = "$range; Case_Folding; "
+ . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
+ if ($type eq 'F') {
+ my @string = split " ", $map;
+ for my $i (0 .. @string - 1 -1) {
+ $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
+ }
+ }
}
else {
$_ = "";
# The Script_Extensions property starts out with a clone of the Script
# property.
- my $sc = property_ref("Script");
- my $scx = Property->new("scx", Full_Name => "Script_Extensions",
- Initialize => $sc,
- Default_Map => $sc->default_map,
- Pre_Declared_Maps => 0,
- Format => $STRING_WHITE_SPACE_LIST,
- );
+ my $scx = property_ref("Script_Extensions");
+ $scx = Property->new("scx", Full_Name => "Script_Extensions")
+ if ! defined $scx;
+ $scx->_set_format($STRING_WHITE_SPACE_LIST);
+ $scx->initialize($script);
+ $scx->set_default_map($script->default_map);
+ $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
$scx->add_comment(join_lines( <<END
The values for code points that appear in one script are just the same as for
the 'Script' property. Likewise the values for those that appear in many
END
));
- # Make the scx's tables and aliases for them the same as sc's
- foreach my $table ($sc->tables) {
+ # Initialize scx's tables and the aliases for them to be the same as sc's
+ foreach my $table ($script->tables) {
my $scx_table = $scx->add_match_table($table->name,
Full_Name => $table->full_name);
foreach my $alias ($table->aliases) {
# 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
my @fields = split /\s*;\s*/;
+
+ # This script was erroneously omitted in this Unicode version.
+ $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
+
my @full_names;
foreach my $short_name (split " ", $fields[1]) {
push @full_names, $script->table($short_name)->full_name;
return;
}
-sub setup_v6_name_alias {
- property_ref('Name_Alias')->add_map(7, 7, "ALERT");
+sub setup_early_name_alias {
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # This has the effect of pretending that the Name_Alias property was
+ # available in all Unicode releases. Strictly speaking, this property
+ # should not be availabe in early releases, but doing this allows
+ # charnames.pm to work on older releases without change. Prior to v5.16
+ # it had these names hard-coded inside it. Unicode 6.1 came along and
+ # created these names, and so they were removed from charnames.
+
+ my $aliases = property_ref('Name_Alias');
+ if (! defined $aliases) {
+ $aliases = Property->new('Name_Alias', Default_Map => "");
+ }
+
+ $file->insert_lines(get_old_name_aliases());
+
+ return;
+}
+
+sub get_old_name_aliases () {
+
+ # The Unicode_1_Name field, contains most of these names. One would
+ # expect, given the field's name, that its values would be fixed across
+ # versions, giving the true Unicode version 1 name for the character.
+ # Sadly, this is not the case. Actually Version 1.1.5 had no names for
+ # any of the controls; Version 2.0 introduced names for the C0 controls,
+ # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
+ # changed some names: it
+ # changed to parenthesized versions like "NEXT LINE" to
+ # "NEXT LINE (NEL)";
+ # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
+ # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
+ # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
+ # This list contains all the names that were defined so that
+ # charnames::vianame(), etc. understand them all EVEN if this version of
+ # Unicode didn't specify them (this could be construed as a bug).
+ # mktables elsewhere gives preference to the Unicode_1_Name field over
+ # these names, so that viacode() will return the correct value for that
+ # version of Unicode, except when that version doesn't define a name,
+ # viacode() will return one anyway (this also could be construed as a
+ # bug). But these potential "bugs" allow for the smooth working of code
+ # on earlier Unicode releases.
+
+ my @return = split /\n/, <<'END';
+0000;NULL;control
+0000;NUL;abbreviation
+0001;START OF HEADING;control
+0001;SOH;abbreviation
+0002;START OF TEXT;control
+0002;STX;abbreviation
+0003;END OF TEXT;control
+0003;ETX;abbreviation
+0004;END OF TRANSMISSION;control
+0004;EOT;abbreviation
+0005;ENQUIRY;control
+0005;ENQ;abbreviation
+0006;ACKNOWLEDGE;control
+0006;ACK;abbreviation
+0007;BELL;control
+0007;BEL;abbreviation
+0008;BACKSPACE;control
+0008;BS;abbreviation
+0009;CHARACTER TABULATION;control
+0009;HORIZONTAL TABULATION;control
+0009;HT;abbreviation
+0009;TAB;abbreviation
+000A;LINE FEED;control
+000A;LINE FEED (LF);control
+000A;NEW LINE;control
+000A;END OF LINE;control
+000A;LF;abbreviation
+000A;NL;abbreviation
+000A;EOL;abbreviation
+000B;LINE TABULATION;control
+000B;VERTICAL TABULATION;control
+000B;VT;abbreviation
+000C;FORM FEED;control
+000C;FORM FEED (FF);control
+000C;FF;abbreviation
+000D;CARRIAGE RETURN;control
+000D;CARRIAGE RETURN (CR);control
+000D;CR;abbreviation
+000E;SHIFT OUT;control
+000E;LOCKING-SHIFT ONE;control
+000E;SO;abbreviation
+000F;SHIFT IN;control
+000F;LOCKING-SHIFT ZERO;control
+000F;SI;abbreviation
+0010;DATA LINK ESCAPE;control
+0010;DLE;abbreviation
+0011;DEVICE CONTROL ONE;control
+0011;DC1;abbreviation
+0012;DEVICE CONTROL TWO;control
+0012;DC2;abbreviation
+0013;DEVICE CONTROL THREE;control
+0013;DC3;abbreviation
+0014;DEVICE CONTROL FOUR;control
+0014;DC4;abbreviation
+0015;NEGATIVE ACKNOWLEDGE;control
+0015;NAK;abbreviation
+0016;SYNCHRONOUS IDLE;control
+0016;SYN;abbreviation
+0017;END OF TRANSMISSION BLOCK;control
+0017;ETB;abbreviation
+0018;CANCEL;control
+0018;CAN;abbreviation
+0019;END OF MEDIUM;control
+0019;EOM;abbreviation
+001A;SUBSTITUTE;control
+001A;SUB;abbreviation
+001B;ESCAPE;control
+001B;ESC;abbreviation
+001C;INFORMATION SEPARATOR FOUR;control
+001C;FILE SEPARATOR;control
+001C;FS;abbreviation
+001D;INFORMATION SEPARATOR THREE;control
+001D;GROUP SEPARATOR;control
+001D;GS;abbreviation
+001E;INFORMATION SEPARATOR TWO;control
+001E;RECORD SEPARATOR;control
+001E;RS;abbreviation
+001F;INFORMATION SEPARATOR ONE;control
+001F;UNIT SEPARATOR;control
+001F;US;abbreviation
+0020;SP;abbreviation
+007F;DELETE;control
+007F;DEL;abbreviation
+0080;PADDING CHARACTER;figment
+0080;PAD;abbreviation
+0081;HIGH OCTET PRESET;figment
+0081;HOP;abbreviation
+0082;BREAK PERMITTED HERE;control
+0082;BPH;abbreviation
+0083;NO BREAK HERE;control
+0083;NBH;abbreviation
+0084;INDEX;control
+0084;IND;abbreviation
+0085;NEXT LINE;control
+0085;NEXT LINE (NEL);control
+0085;NEL;abbreviation
+0086;START OF SELECTED AREA;control
+0086;SSA;abbreviation
+0087;END OF SELECTED AREA;control
+0087;ESA;abbreviation
+0088;CHARACTER TABULATION SET;control
+0088;HORIZONTAL TABULATION SET;control
+0088;HTS;abbreviation
+0089;CHARACTER TABULATION WITH JUSTIFICATION;control
+0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
+0089;HTJ;abbreviation
+008A;LINE TABULATION SET;control
+008A;VERTICAL TABULATION SET;control
+008A;VTS;abbreviation
+008B;PARTIAL LINE FORWARD;control
+008B;PARTIAL LINE DOWN;control
+008B;PLD;abbreviation
+008C;PARTIAL LINE BACKWARD;control
+008C;PARTIAL LINE UP;control
+008C;PLU;abbreviation
+008D;REVERSE LINE FEED;control
+008D;REVERSE INDEX;control
+008D;RI;abbreviation
+008E;SINGLE SHIFT TWO;control
+008E;SINGLE-SHIFT-2;control
+008E;SS2;abbreviation
+008F;SINGLE SHIFT THREE;control
+008F;SINGLE-SHIFT-3;control
+008F;SS3;abbreviation
+0090;DEVICE CONTROL STRING;control
+0090;DCS;abbreviation
+0091;PRIVATE USE ONE;control
+0091;PRIVATE USE-1;control
+0091;PU1;abbreviation
+0092;PRIVATE USE TWO;control
+0092;PRIVATE USE-2;control
+0092;PU2;abbreviation
+0093;SET TRANSMIT STATE;control
+0093;STS;abbreviation
+0094;CANCEL CHARACTER;control
+0094;CCH;abbreviation
+0095;MESSAGE WAITING;control
+0095;MW;abbreviation
+0096;START OF GUARDED AREA;control
+0096;START OF PROTECTED AREA;control
+0096;SPA;abbreviation
+0097;END OF GUARDED AREA;control
+0097;END OF PROTECTED AREA;control
+0097;EPA;abbreviation
+0098;START OF STRING;control
+0098;SOS;abbreviation
+0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
+0099;SGC;abbreviation
+009A;SINGLE CHARACTER INTRODUCER;control
+009A;SCI;abbreviation
+009B;CONTROL SEQUENCE INTRODUCER;control
+009B;CSI;abbreviation
+009C;STRING TERMINATOR;control
+009C;ST;abbreviation
+009D;OPERATING SYSTEM COMMAND;control
+009D;OSC;abbreviation
+009E;PRIVACY MESSAGE;control
+009E;PM;abbreviation
+009F;APPLICATION PROGRAM COMMAND;control
+009F;APC;abbreviation
+00A0;NBSP;abbreviation
+00AD;SHY;abbreviation
+200B;ZWSP;abbreviation
+200C;ZWNJ;abbreviation
+200D;ZWJ;abbreviation
+200E;LRM;abbreviation
+200F;RLM;abbreviation
+202A;LRE;abbreviation
+202B;RLE;abbreviation
+202C;PDF;abbreviation
+202D;LRO;abbreviation
+202E;RLO;abbreviation
+FEFF;BYTE ORDER MARK;alternate
+FEFF;BOM;abbreviation
+FEFF;ZWNBSP;abbreviation
+END
+
+ if ($v_version ge v3.0.0) {
+ push @return, split /\n/, <<'END';
+180B; FVS1; abbreviation
+180C; FVS2; abbreviation
+180D; FVS3; abbreviation
+180E; MVS; abbreviation
+202F; NNBSP; abbreviation
+END
+ }
+
+ if ($v_version ge v3.2.0) {
+ push @return, split /\n/, <<'END';
+034F; CGJ; abbreviation
+205F; MMSP; abbreviation
+2060; WJ; abbreviation
+END
+ # Add in VS1..VS16
+ my $cp = 0xFE00 - 1;
+ for my $i (1..16) {
+ push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+ }
+ }
+ if ($v_version ge v4.0.0) { # Add in VS17..VS256
+ my $cp = 0xE0100 - 17;
+ for my $i (17..256) {
+ push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+ }
+ }
+
+ # ALERT did not come along until 6.0, at which point it became preferred
+ # over BELL, and was never in the Unicode_1_Name field. For the same
+ # reasons, that the other names are made known to all releases by this
+ # function, we make ALERT known too. By inserting it
+ # last in early releases, BELL is preferred over it; and vice-vers in 6.0
+ my $alert = '0007; ALERT; control';
+ if ($v_version lt v6.0.0) {
+ push @return, $alert;
+ }
+ else {
+ unshift @return, $alert;
+ }
+
+ return @return;
+}
+
+sub filter_later_version_name_alias_line {
+
+ # This file has an extra entry per line for the alias type. This is
+ # handled by creating a compound entry: "$alias: $type"; First, split
+ # the line into components.
+ my ($range, $alias, $type, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+
+ # This file contains multiple entries for some components, so tell the
+ # downstream code to allow this in our internal tables; the
+ # $MULTIPLE_AFTER preserves the input ordering.
+ $_ = join ";", $range, $CMD_DELIM
+ . $REPLACE_CMD
+ . '='
+ . $MULTIPLE_AFTER
+ . $CMD_DELIM
+ . "$alias: $type",
+ @remainder;
+ return;
+}
+
+sub filter_early_version_name_alias_line {
+
+ # Early versions did not have the trailing alias type field; implicitly it
+ # was 'correction'. But our synthetic lines we add in this program do
+ # have it, so test for the type field.
+ $_ .= "; correction" if $_ !~ /;.*;/;
+
+ filter_later_version_name_alias_line;
+ return;
}
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
- # 1) Adds the mappings for code points missing from the files which have
+ # 1) Creates properties that are missing from the version of Unicode being
+ # compiled, and which, for whatever reason, are needed for the Perl
+ # core to function properly. These are minimally populated as
+ # necessary.
+ # 2) Adds the mappings for code points missing from the files which have
# defaults specified for them.
- # 2) At this this point all mappings are known, so it computes the type of
+ # 3) At this this point all mappings are known, so it computes the type of
# each property whose type hasn't been determined yet.
- # 3) Calculates all the regular expression match tables based on the
+ # 4) Calculates all the regular expression match tables based on the
# mappings.
- # 3) Calculates and adds the tables which are defined by Unicode, but
- # which aren't derived by them
+ # 5) Calculates and adds the tables which are defined by Unicode, but
+ # which aren't derived by them, and certain derived tables that Perl
+ # uses.
+
+ # GCB and hst are not in early Unicode releases; create dummy ones if
+ # they don't exist, as the core needs tables generated from them.
+ my $gcb = property_ref('Grapheme_Cluster_Break');
+ if (! defined $gcb) {
+ $gcb = Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
+ Status => $PLACEHOLDER,
+ Type => $ENUM,
+ Default_Map => 'Other');
+ }
+ my $hst = property_ref('Hangul_Syllable_Type');
+ if (!defined $hst) {
+ $hst = Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
+ Status => $PLACEHOLDER,
+ Type => $ENUM,
+ Default_Map => 'Not_Applicable');
+ }
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
# need to be finished up.
next if $property == $perl;
+ # Nor do we need to do anything with properties that aren't going to
+ # be output.
+ next if $property->fate == $SUPPRESSED;
+
# Handle the properties that have more than one possible default
if (ref $property->default_map) {
my $default_map = $property->default_map;
Lowercase_Mapping
Titlecase_Mapping
Case_Folding
- } ) {
+ } )
+ {
my $full = property_ref($map);
if ($full->is_empty) {
my $simple = property_ref('Simple_' . $map);
}
}
+ # Create digit and case fold tables with the original file names for
+ # backwards compatibility with applications that read them directly.
+ my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
+ Default_Map => "",
+ Perl_Extension => 1,
+ File => 'Digit', # Trad. location
+ Directory => $map_directory,
+ UCD => 0,
+ Type => $STRING,
+ To_Output_Map => $EXTERNAL_MAP,
+ Range_Size_1 => 1,
+ Initialize => property_ref('Perl_Decimal_Digit'),
+ );
+ $Digit->add_comment(join_lines(<<END
+This file gives the mapping of all code points which represent a single
+decimal digit [0-9] to their respective digits. For example, the code point
+U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
+that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
+numerals.
+END
+ ));
+
+ Property->new('Legacy_Case_Folding',
+ File => "Fold",
+ Directory => $map_directory,
+ Default_Map => $CODE_POINT,
+ UCD => 0,
+ Range_Size_1 => 1,
+ Type => $STRING,
+ To_Output_Map => $EXTERNAL_MAP,
+ Format => $HEX_FORMAT,
+ Initialize => property_ref('cf'),
+ );
+
# The Script_Extensions property started out as a clone of the Script
# property. But processing its data file caused some elements to be
# replaced with different data. (These elements were for the Common and
# identical code points, but their caseless equivalents are not the same,
# one being 'Cased' and the other being 'LC', and so now must be kept as
# separate entities.
- $Title += $lt if defined $lt;
+ if (defined $lt) {
+ $Title += $lt;
+ }
+ else {
+ push @tables_that_may_be_empty, $Title->complete_name;
+ }
# If this Unicode version doesn't have Cased, set up our own. From
# Unicode 5.1: Definition D120: A character C is defined to be cased if
$Posix_Lower->set_caseless_equivalent($Posix_Alpha);
my $Alnum = $perl->add_match_table('Alnum',
- Description => 'Alphabetic and (Decimal) Numeric',
+ Description => 'Alphabetic and (decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
$Alnum->add_alias('XPosixAlnum');
# Perl's traditional space doesn't include Vertical Tab
my $XPerlSpace = $perl->add_match_table('XPerlSpace',
Description => '\s, including beyond ASCII',
- Initialize => $Space - 0x000B,
+ #Initialize => $Space - 0x000B,
+ Initialize => $Space,
);
$XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
my $PerlSpace = $perl->add_match_table('PerlSpace',
- Description => '\s, restricted to ASCII = [ \f\n\r\t]',
+ Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
Initialize => $XPerlSpace & $ASCII,
);
# The 'extended' grapheme cluster came in 5.1. The non-extended
# definition differs too much from the traditional Perl one to use.
- if (defined $gcb && defined $gcb->table('SpacingMark')) {
+ if (defined $gcb->table('SpacingMark')) {
- # Note that assumes HST is defined; it came in an earlier release than
+ # Note that assumes hst is defined; it came in an earlier release than
# GCB. In the line below, two negatives means: yes hangul
$begin += ~ property_ref('Hangul_Syllable_Type')
->table('Not_Applicable')
# We set things up so the Perl core degrades gracefully, possibly with
# placeholders that match nothing.
- if (! defined $gcb) {
- $gcb = Property->new('GCB', Status => $PLACEHOLDER);
- }
- my $hst = property_ref('HST');
- if (!defined $hst) {
- $hst = Property->new('HST', Status => $PLACEHOLDER);
- $hst->add_match_table('Not_Applicable',
- Initialize => $Any,
- Matches_All => 1);
- }
+ my $hst = property_ref('Hangul_Syllable_Type');
# On some releases, here we may not have the needed tables for the
# perl core, in some releases we may.
push @tables_that_may_be_empty, $table->complete_name;
}
- # The HST property predates the GCB one, and has identical tables
+ # The hst property predates the GCB one, and has identical tables
# for some of them, so use it if we can.
- if ($table->is_empty
- && defined $hst
- && defined $hst->table($name))
+ if ($table->is_empty && defined $hst->table($name))
{
$table += $hst->table($name);
}
push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
} else {
$lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
+ $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
}
# Was previously constructed to contain both Name and Unicode_1_Name
}
my $alias_sentence = "";
+ my %abbreviations;
my $alias = property_ref('Name_Alias');
- if (defined $alias) {
- push @composition, 'Name_Alias';
- $alias->reset_each_range;
- while (my ($range) = $alias->each_range) {
- next if $range->value eq "";
- if ($range->start != $range->end) {
- Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
- }
- $perl_charname->add_duplicate($range->start, $range->value);
- }
- $alias_sentence = <<END;
-The Name_Alias property adds duplicate code point entries with a corrected
-name. The original (less correct, but still valid) name will be physically
-last.
+ push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type. Corrections go
+ # before said entry, as they should be returned in preference over the
+ # existing entry. (A correction to a correction should be later in the
+ # Name_Alias table, so it will correctly precede the erroneous correction
+ # in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved temporarily in
+ # a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that. (Most
+ # of them will be the same anyway, in which case we don't add a duplicate)
+
+ $alias->reset_each_range;
+ while (my ($range) = $alias->each_range) {
+ next if $range->value eq "";
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
+ }
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
+ }
+ else {
+ $replace_type = $MULTIPLE_AFTER;
+ }
+
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+ $alias_sentence = <<END;
+The Name_Alias property adds duplicate code point entries that are
+alternatives to the original name. If an addition is a corrected
+name, it will be physically first in the table. The original (less correct,
+but still valid) name will be next; then any alternatives, in no particular
+order; and finally any abbreviations, again in no particular order.
END
+
+ # Now add the Unicode_1 names for the controls. The Unicode_1 names had
+ # precedence before 6.1, so should be first in the file; the other names
+ # have precedence starting in 6.1,
+ my $before_or_after = ($v_version lt v6.1.0)
+ ? $MULTIPLE_BEFORE
+ : $MULTIPLE_AFTER;
+
+ foreach my $range (property_ref('Unicode_1_Name')->ranges) {
+ my $code_point = $range->start;
+ my $unicode_1_value = $range->value;
+ next if $unicode_1_value eq ""; # Skip if name doesn't exist.
+
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+
+ # To handle EBCDIC, we don't hard code in the code points of the
+ # controls; instead realizing that all of them are below 256.
+ last if $code_point > 255;
+
+ # We only add in the controls.
+ next if $gc->value_of($code_point) ne 'Cc';
+
+ # This won't add an exact duplicate.
+ $perl_charname->add_duplicate($code_point, $unicode_1_value,
+ Replace => $before_or_after);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value,
+ Replace => $MULTIPLE_AFTER);
}
+
my $comment;
if (@composition <= 2) { # Always at least 2
$comment = join " and ", @composition;
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
-Unicode_1_Name entries are used only for otherwise nameless code
-points.
+Unicode_1_Name entries are used only for nameless code points in the Name
+property.
$alias_sentence
This file doesn't include the algorithmically determinable names. For those,
use 'unicore/Name.pm'
$unassigned->set_equivalent_to($age_default, Related => 1);
}
+ # See L<perlfunc/quotemeta>
+ my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+
+ # Initialize to what's common in
+ # all Unicode releases.
+ Initialize =>
+ $Space
+ + $gc->table('Control')
+ );
+
+ # In early releases without the proper Unicode properties, just set to \W.
+ if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
+ || ! defined (my $patws = property_ref('Pattern_White_Space'))
+ || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
+ {
+ $quotemeta += ~ $Word;
+ }
+ else {
+ $quotemeta += $patsyn->table('Y')
+ + $patws->table('Y')
+ + $di->table('Y')
+ + ((~ $Word) & $ASCII);
+ }
# Finished creating all the perl properties. All non-internal non-string
# ones have a synonym of 'Is_' prefixed. (Internal properties begin with
Re_Pod_Entry => 0,
UCD => 0,
Status => $alias->status,
- Externally_Ok => 0);
+ OK_as_Filename => 0);
}
}
# No name collision, so ok to add the perl synonym.
my $make_re_pod_entry;
- my $externally_ok;
+ my $ok_as_filename;
my $status = $alias->status;
if ($nominal_property == $block) {
if ($prefix eq "") {
$make_re_pod_entry = 1;
$status = $status || $DISCOURAGED;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
elsif ($prefix eq 'In_') {
$make_re_pod_entry = 0;
$status = $status || $NORMAL;
- $externally_ok = 1;
+ $ok_as_filename = 1;
}
else {
$make_re_pod_entry = 0;
$status = $status || $DISCOURAGED;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
}
elsif ($prefix ne "") {
# card, and we won't use it for an external name
$make_re_pod_entry = 0;
$status = $status || $NORMAL;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
else {
# own pod entry and can be used for an external name.
$make_re_pod_entry = 1;
$status = $status || $NORMAL;
- $externally_ok = 1;
+ $ok_as_filename = 1;
}
# Here, there isn't a perl pre-existing table with the
UCD => 0,
Status => $status,
- Externally_Ok => $externally_ok);
+ OK_as_Filename => $ok_as_filename);
trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
next PREFIX;
}
UCD => 0,
Status => $status,
- Externally_Ok => $externally_ok);
+ OK_as_Filename => $ok_as_filename);
# And it will be related to the actual table, since it is
# based on it.
$added_table->set_equivalent_to($actual, Related => 1);
|| $ucd_pod{$standard}{'perl_extension'} == $perl_extension
|| $output_this == $perl_extension)
{
- Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway.");
+ Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
}
# We modifiy the info column of the one being output to
foreach my $file (keys %ignored_files) {
push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
}
+ foreach my $file (keys %skipped_files) {
+ push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
+ }
# Then, sort each group.
foreach my $group (keys %grouped_by_reason) {
Uppercase_Mapping uc()
Also, Case_Folding is accessible through the C</i> modifier in regular
-expressions.
+expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
+operator.
And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
-interpolation in double-quoted strings and regular expressions, but both
-usages require a L<use charnames;|charnames> to be specified, which also
-contains related functions viacode(), vianame(), and string_vianame().
+interpolation in double-quoted strings and regular expressions; and functions
+C<charnames::viacode()>, C<charnames::vianame()>, and
+C<charnames::string_vianame()> (which require a C<use charnames ();> to be
+specified.
Finally, most properties related to decomposition are accessible via
L<Unicode::Normalize>.
push @name, <<END;
+package charnames;
+
# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names. The following
# routines can be used to translate between name and code point and vice versa
foreach my $alias ($table->aliases) {
# Skip non-legal names
- next unless $alias->externally_ok;
+ next unless $alias->ok_as_filename;
next unless $alias->ucd;
$found_ucd = 1; # have at least one legal name
# standardized alias
foreach my $alias ($table->aliases) {
next unless $alias->ucd;
- next unless $alias->externally_ok;
+ next unless $alias->ok_as_filename;
push @{$perlprop_to_aliases{standardize($alias->name)}},
@aliases_list;
}
# Make a list of all combinations of properties/values that are suppressed.
my @suppressed;
- foreach my $property_name (keys %why_suppressed) {
+ if (! $debug_skip) { # This tends to fail in this debug mode
+ foreach my $property_name (keys %why_suppressed) {
- # Just the value
- my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
+ # Just the value
+ my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
- # The hash may contain properties not in this release of Unicode
- next unless defined (my $property = property_ref($property_name));
+ # The hash may contain properties not in this release of Unicode
+ next unless defined (my $property = property_ref($property_name));
- # Find all combinations
- foreach my $prop_alias ($property->aliases) {
- my $prop_alias_name = standardize($prop_alias->name);
+ # Find all combinations
+ foreach my $prop_alias ($property->aliases) {
+ my $prop_alias_name = standardize($prop_alias->name);
- # If no =value, there's just one combination possibe for this
- if (! $value_name) {
+ # If no =value, there's just one combination possibe for this
+ if (! $value_name) {
- # The property may be suppressed, but there may be a proxy for
- # it, so it shouldn't be listed as suppressed
- next if $prop_alias->ucd;
- push @suppressed, $prop_alias_name;
- }
- else { # Otherwise
- foreach my $value_alias ($property->table($value_name)->aliases)
- {
- next if $value_alias->ucd;
+ # The property may be suppressed, but there may be a proxy
+ # for it, so it shouldn't be listed as suppressed
+ next if $prop_alias->ucd;
+ push @suppressed, $prop_alias_name;
+ }
+ else { # Otherwise
+ foreach my $value_alias
+ ($property->table($value_name)->aliases)
+ {
+ next if $value_alias->ucd;
- push @suppressed, "$prop_alias_name="
- . standardize($value_alias->name);
+ push @suppressed, "$prop_alias_name="
+ . standardize($value_alias->name);
+ }
}
}
}
|| ($table == $property->table('N')
&& $property->table('Y')->is_empty));
-
- # Some tables should match everything
- my $expected_full =
- ($is_property)
- ? # All these types of map tables will be full because
- # they will have been populated with defaults
- ($type == $ENUM || $type == $FORCED_BINARY)
-
- : # A match table should match everything if its method
- # shows it should
- ($table->matches_all
-
- # The complement of an empty binary table will match
- # everything
- || $is_complement_of_empty_binary
- )
- ;
-
if ($table->is_empty) {
if ($suppress_if_empty_warn_if_not) {
Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
}
+ # Some tables should match everything
+ my $expected_full =
+ ($table->fate == $SUPPRESSED)
+ ? 0
+ : ($is_property)
+ ? # All these types of map tables will be full because
+ # they will have been populated with defaults
+ ($type == $ENUM || $type == $FORCED_BINARY)
+
+ : # A match table should match everything if its method
+ # shows it should
+ ($table->matches_all
+
+ # The complement of an empty binary table will match
+ # everything
+ || $is_complement_of_empty_binary
+ )
+ ;
+
my $count = $table->count;
if ($expected_full) {
if ($count != $MAX_UNICODE_CODEPOINTS) {
}
}
}
- elsif ($count == $MAX_UNICODE_CODEPOINTS) {
- if ($table == $property || $table->leader == $table) {
+ elsif ($count == $MAX_UNICODE_CODEPOINTS
+ && ($table == $property || $table->leader == $table)
+ && $table->property->status != $PLACEHOLDER)
+ {
Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
- }
}
if ($table->fate == $SUPPRESSED) {
make_UCD;
make_property_test_script() if $make_test_script;
+ make_normalization_test_script() if $make_norm_test_script;
return;
}
push @property_aliases, map { Alias->new("Is_" . $_->name,
$_->loose_match,
$_->make_re_pod_entry,
- $_->externally_ok,
+ $_->ok_as_filename,
$_->status,
$_->ucd,
)
return;
}
+sub make_normalization_test_script() {
+ print "Making normalization test script\n" if $verbosity >= $PROGRESS;
+
+ my $n_path = 'TestNorm.pl';
+
+ unshift @normalization_tests, <<'END';
+use utf8;
+use Test::More;
+
+sub ord_string { # Convert packed ords to printable string
+ use charnames ();
+ return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
+ unpack "U*", shift) . "'";
+ #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
+}
+
+sub Test_N {
+ my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
+ my $display_source = ord_string($source);
+ my $display_nfc = ord_string($nfc);
+ my $display_nfd = ord_string($nfd);
+ my $display_nfkc = ord_string($nfkc);
+ my $display_nfkd = ord_string($nfkd);
+
+ use Unicode::Normalize;
+ # NFC
+ # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
+ # nfkc == toNFC(nfkc) == toNFC(nfkd)
+ #
+ # NFD
+ # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
+ # nfkd == toNFD(nfkc) == toNFD(nfkd)
+ #
+ # NFKC
+ # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
+ # toNFKC(nfkc) == toNFKC(nfkd)
+ #
+ # NFKD
+ # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
+ # toNFKD(nfkc) == toNFKD(nfkd)
+
+ is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
+ is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
+ is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
+ is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
+ is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
+
+ is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
+ is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
+ is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
+ is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
+ is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
+
+ is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
+ is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
+ is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
+ is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
+ is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
+
+ is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
+ is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
+ is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
+ is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
+ is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
+}
+END
+
+ &write($n_path,
+ 1, # Is utf8;
+ [
+ @normalization_tests,
+ 'done_testing();'
+ ]);
+ return;
+}
+
# This is a list of the input files and how to handle them. The files are
# processed in their order in this list. Some reordering is possible if
# desired, but the v0 files should be first, and the extracted before the
# others except DAge.txt (as data in an extracted file can be over-ridden by
# the non-extracted. Some other files depend on data derived from an earlier
# file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode. Mostly, it safest to order by first
+# folding requires data from Unicode. Mostly, it is safest to order by first
# version releases in (except the Jamo). DAge.txt is read before the
# extracted ones because of the rarely used feature $compare_versions. In the
# unlikely event that there were ever an extracted file that contained the Age
Input_file->new('SpecialCasing.txt', v2.1.8,
Each_Line_Handler => \&filter_special_casing_line,
Pre_Handler => \&setup_special_casing,
+ Has_Missings_Defaults => $IGNORED,
),
Input_file->new(
'LineBreak.txt', v3.0.0,
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
),
- Input_file->new("NormalizationTest.txt", v3.0.1,
- Skip => 'Validation Tests',
+ Input_file->new("NormTest.txt", v3.0.0,
+ Handler => \&process_NormalizationsTest,
+ Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
: undef,
\&filter_case_folding_line
],
+ Has_Missings_Defaults => $IGNORED,
),
Input_file->new('DCoreProperties.txt', v3.1.0,
# 5.2 changed this file
Input_file->new('NamedSequences.txt', v4.1.0,
Handler => \&process_NamedSequences
),
- Input_file->new('NameAliases.txt', v5.0.0,
+ Input_file->new('NameAliases.txt', v0,
Property => 'Name_Alias',
- Pre_Handler => ($v_version ge v6.0.0)
- ? \&setup_v6_name_alias
+ Pre_Handler => ($v_version le v6.0.0)
+ ? \&setup_early_name_alias
: undef,
+ Each_Line_Handler => ($v_version le v6.0.0)
+ ? \&filter_early_version_name_alias_line
+ : \&filter_later_version_name_alias_line,
),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => 'Validation Tests',
Property => 'Script_Extensions',
Pre_Handler => \&setup_script_extensions,
Each_Line_Handler => \&filter_script_extensions_line,
+ Has_Missings_Defaults => (($v_version le v6.0.0)
+ ? $NO_DEFAULTS
+ : $IGNORED),
+ ),
+ # The two Indic files are actually available starting in v6.0.0, but their
+ # property values are missing from PropValueAliases.txt in that release,
+ # so that further work would have to be done to get them to work properly
+ # for that release.
+ Input_file->new('IndicMatraCategory.txt', v6.1.0,
+ Property => 'Indic_Matra_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Skip => "Provisional; for the analysis and processing of Indic scripts",
+ ),
+ Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
+ Property => 'Indic_Syllabic_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Skip => "Provisional; for the analysis and processing of Indic scripts",
),
);
# The paths are stored with relative names, and with '/' as the
# delimiter; convert to absolute on this machine
my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
- $potential_files{$full} = 1
- if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
+ $potential_files{lc $full} = 1
+ if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
}
}
}
my @unknown_input_files;
- foreach my $file (keys %potential_files) {
- next if grep { lc($file) eq lc($_) } @known_files;
+ foreach my $file (keys %potential_files) { # The keys are stored in lc
+ next if grep { $file eq lc($_) } @known_files;
# Here, the file is unknown to us. Get relative path name
$file = File::Spec->abs2rel($file);