# Enum values for to_output_map() method in the Map_Table package.
my $EXTERNAL_MAP = 1;
my $INTERNAL_MAP = 2;
+my $OUTPUT_DELTAS = 3;
# To override computed values for writing the map tables for these properties.
# The default for enum map tables is to write them out, so that the Unicode
# its name
if ($seen_non_extracted_non_age) {
if ($file =~ /$EXTRACTED/i) {
- Carp::my_carp_bug(join_lines(<<END
+ Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
have subtle problems
# either a constructor or a method. If called as a method, the result
# will be a new() instance of the calling object, containing the union
# of that object with the other parameter's code points; if called as
- # a constructor, the first parameter gives the class the new object
+ # a constructor, the first parameter gives the class that the new object
# should be, and the second parameter gives the code points to go into
# it.
# In either case, there are two parameters looked at by this routine;
# just a single code point.
#
# If they are ranges, this routine doesn't make any effort to preserve
- # the range values of one input over the other. Therefore this base
- # class should not allow _union to be called from other than
+ # the range values and types of one input over the other. Therefore
+ # this base class should not allow _union to be called from other than
# initialization code, so as to prevent two tables from being added
# together where the range values matter. The general form of this
# routine therefore belongs in a derived class, but it was moved here
# to avoid duplication of code. The failure to overload this in this
# class keeps it safe.
#
+ # It does make the effort during initialization to accept tables with
+ # multiple values for the same code point, and to preserve the order
+ # of these. If there is only one input range or range set, it doesn't
+ # sort (as it should already be sorted to the desired order), and will
+ # accept multiple values per code point. Otherwise it will merge
+ # multiple values into a single one.
my $self;
my @args; # Arguments to pass to the constructor
# Accumulate all records from both lists.
my @records;
+ my $input_count = 0;
for my $arg (@args) {
#local $to_trace = 0 if main::DEBUG;
trace "argument = $arg" if main::DEBUG && $to_trace;
Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
return;
}
+
$arg = [ $arg ] if ! ref $arg;
my $type = ref $arg;
if ($type eq 'ARRAY') {
foreach my $element (@$arg) {
push @records, Range->new($element, $element);
+ $input_count++;
}
}
elsif ($arg->isa('Range')) {
push @records, $arg;
+ $input_count++;
}
elsif ($arg->can('ranges')) {
push @records, $arg->ranges;
+ $input_count++;
}
else {
my $message = "";
# Sort with the range containing the lowest ordinal first, but if
# two ranges start at the same code point, sort with the bigger range
# of the two first, because it takes fewer cycles.
- @records = sort { ($a->start <=> $b->start)
+ if ($input_count > 1) {
+ @records = sort { ($a->start <=> $b->start)
or
# if b is shorter than a, b->end will be
# less than a->end, and we want to select
# a, so want to return -1
($b->end <=> $a->end)
} @records;
+ }
my $new = $class->new(@_);
for my $set (@records) {
my $start = $set->start;
my $end = $set->end;
- my $value = $set->value;
+ my $value = $set->value;
+ my $type = $set->type;
if ($start > $new->max) {
- $new->_add_delete('+', $start, $end, $value);
+ $new->_add_delete('+', $start, $end, $value, Type => $type);
}
elsif ($end > $new->max) {
- $new->_add_delete('+', $new->max +1, $end, $value);
+ $new->_add_delete('+', $new->max +1, $end, $value,
+ Type => $type);
+ }
+ elsif ($input_count == 1) {
+ # Here, overlaps existing range, but is from a single input,
+ # so preserve the multiple values from that input.
+ $new->_add_delete('+', $start, $end, $value, Type => $type,
+ Replace => $MULTIPLE_AFTER);
}
}
# new and old values are identical, the
# replacement is skipped to save cycles
# => $IF_NOT_EQUIVALENT means to replace the existing values
- # with this one if they are not equivalent.
+ # (the default) with this one if they are not equivalent.
# Ranges are equivalent if their types are the
# same, and they are the same string; or if
# both are type 0 ranges, if their Unicode
for my $try_hard (0, 1) {
# Look through all the ranges for a usable code point.
- for my $set ($self->ranges) {
+ for my $set (reverse $self->ranges) {
# Try the edge cases first, starting with the end point of the
# range.
# Write a representation of the table to its file. It calls several
# functions furnished by sub-classes of this abstract base class to
# handle non-normal ranges, to add stuff before the table, and at its
- # end.
+ # end. If the table is to be written using deltas from the current
+ # code point, this does that conversion.
my $self = shift;
+ my $use_delta_cp = shift; # ? output deltas or not
my $tab_stops = shift; # The number of tab stops over to put any
# comment.
my $suppress_value = shift; # Optional, if the value associated with
if ($annotate) {
- # if annotating each code point, must print 1 per line.
+ # If annotating each code point, must print 1 per line.
# The variable could point to a subroutine, and we don't want
# to lose that fact, so only set if not set already
$range_size_1 = 1 if ! $range_size_1;
);
}
+ # Values for previous time through the loop. Initialize to
+ # something that won't be adjacent to the first iteration;
+ # only $previous_end matters for that.
+ my $previous_start;
+ my $previous_end = -2;
+ my $previous_value;
+
+ # Values for next time through the portion of the loop that splits
+ # the range. 0 in $next_start means there is no remaining portion
+ # to deal with.
+ my $next_start = 0;
+ my $next_end;
+ my $next_value;
+
# Output each range as part of the here document.
RANGE:
for my $set ($range_list{$addr}->ranges) {
next RANGE if defined $suppress_value
&& $value eq $suppress_value;
- # If there is a range and doesn't need a single point range
- # output
- if ($start != $end && ! $range_size_1) {
- push @OUT, sprintf "%04X\t%04X", $start, $end;
- $OUT[-1] .= "\t$value" if $value ne "";
-
- # Add a comment with the size of the range, if requested.
- # Expand Tabs to make sure they all start in the same
- # column, and then unexpand to use mostly tabs.
- if (! $output_range_counts{$addr}) {
- $OUT[-1] .= "\n";
+ { # This bare block encloses the scope where we may need to
+ # split a range (when outputting deltas), and each time
+ # through we handle the next portion of the original by
+ # ending the block with a 'redo'. The values to use for
+ # that next time through are set up just below in the
+ # scalars whose names begin with '$next_'.
+
+ if ($use_delta_cp) {
+
+ # When converting to deltas, we can handle only single
+ # element ranges. Set up so that this time through
+ # the loop, we look at the first element, and the next
+ # time through, we start off with the remainder. Thus
+ # each time through we look at the first element of
+ # the range
+ if ($end != $start) {
+ $next_start = $start + 1;
+ $next_end = $end;
+ $next_value = $value;
+ $end = $start;
+ }
+
+ # The values for these tables is stored as hex
+ # strings. Get the delta by subtracting the code
+ # point.
+ $value = hex($value) - $start;
+
+ # If this range is adjacent to the previous one, and
+ # the values in each are the same, then this range
+ # really extends the previous one that is already in
+ # element $OUT[-1]. So we pop that element, and
+ # pretend that the range starts with whatever it
+ # started with.
+ if ($start == $previous_end + 1
+ && $value == $previous_value)
+ {
+ pop @OUT;
+ $start = $previous_start;
+ }
+
+ # Save the current values for the next time through
+ # the loop.
+ $previous_start = $start;
+ $previous_end = $end;
+ $previous_value = $value;
}
- else {
- $OUT[-1] = Text::Tabs::expand($OUT[-1]);
- my $count = main::clarify_number($end - $start + 1);
- use integer;
-
- my $width = $tab_stops * 8 - 1;
- $OUT[-1] = sprintf("%-*s # [%s]\n",
- $width,
- $OUT[-1],
- $count);
- $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
+
+ # If there is a range and doesn't need a single point range
+ # output
+ if ($start != $end && ! $range_size_1) {
+ push @OUT, sprintf "%04X\t%04X", $start, $end;
+ $OUT[-1] .= "\t$value" if $value ne "";
+
+ # Add a comment with the size of the range, if
+ # requested. Expand Tabs to make sure they all start
+ # in the same column, and then unexpand to use mostly
+ # tabs.
+ if (! $output_range_counts{$addr}) {
+ $OUT[-1] .= "\n";
+ }
+ else {
+ $OUT[-1] = Text::Tabs::expand($OUT[-1]);
+ my $count = main::clarify_number($end - $start + 1);
+ use integer;
+
+ my $width = $tab_stops * 8 - 1;
+ $OUT[-1] = sprintf("%-*s # [%s]\n",
+ $width,
+ $OUT[-1],
+ $count);
+ $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
+ }
}
- next RANGE;
- }
- # Here to output a single code point per line
+ # Here to output a single code point per line.
+ # If not to annotate, use the simple formats
+ elsif (! $annotate) {
- # If not to annotate, use the simple formats
- if (! $annotate) {
+ # Use any passed in subroutine to output.
+ if (ref $range_size_1 eq 'CODE') {
+ for my $i ($start .. $end) {
+ push @OUT, &{$range_size_1}($i, $value);
+ }
+ }
+ else {
- # Use any passed in subroutine to output.
- if (ref $range_size_1 eq 'CODE') {
- for my $i ($start .. $end) {
- push @OUT, &{$range_size_1}($i, $value);
+ # Here, caller is ok with default output.
+ for (my $i = $start; $i <= $end; $i++) {
+ push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
+ }
}
}
else {
- # Here, caller is ok with default output.
+ # Here, wants annotation.
for (my $i = $start; $i <= $end; $i++) {
- push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
- }
- }
- next RANGE;
- }
- # Here, wants annotation.
- for (my $i = $start; $i <= $end; $i++) {
-
- # Get character information if don't have it already
- main::populate_char_info($i)
- if ! defined $viacode[$i];
- my $type = $annotate_char_type[$i];
-
- # Figure out if should output the next code points as part
- # of a range or not. If this is not in an annotation
- # range, then won't output as a range, so returns $i.
- # Otherwise use the end of the annotation range, but no
- # further than the maximum possible end point of the loop.
- my $range_end = main::min($annotate_ranges->value_of($i)
- || $i,
- $end);
-
- # Use a range if it is a range, and either is one of the
- # special annotation ranges, or the range is at most 3
- # long. This last case causes the algorithmically named
- # code points to be output individually in spans of at
- # most 3, as they are the ones whose $type is > 0.
- if ($range_end != $i
- && ( $type < 0 || $range_end - $i > 2))
- {
- # Here is to output a range. We don't allow a
- # caller-specified output format--just use the
- # standard one.
- push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
+ # Get character information if don't have it already
+ main::populate_char_info($i)
+ if ! defined $viacode[$i];
+ my $type = $annotate_char_type[$i];
+
+ # Figure out if should output the next code points
+ # as part of a range or not. If this is not in an
+ # annotation range, then won't output as a range,
+ # so returns $i. Otherwise use the end of the
+ # annotation range, but no further than the
+ # maximum possible end point of the loop.
+ my $range_end = main::min(
+ $annotate_ranges->value_of($i) || $i,
+ $end);
+
+ # Use a range if it is a range, and either is one
+ # of the special annotation ranges, or the range
+ # is at most 3 long. This last case causes the
+ # algorithmically named code points to be output
+ # individually in spans of at most 3, as they are
+ # the ones whose $type is > 0.
+ if ($range_end != $i
+ && ( $type < 0 || $range_end - $i > 2))
+ {
+ # Here is to output a range. We don't allow a
+ # caller-specified output format--just use the
+ # standard one.
+ push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
$range_end,
$value;
- my $range_name = $viacode[$i];
-
- # For the code points which end in their hex value, we
- # eliminate that from the output annotation, and
- # capitalize only the first letter of each word.
- if ($type == $CP_IN_NAME) {
- my $hex = sprintf "%04X", $i;
- $range_name =~ s/-$hex$//;
- my @words = split " ", $range_name;
- for my $word (@words) {
- $word = ucfirst(lc($word)) if $word ne 'CJK';
- }
- $range_name = join " ", @words;
- }
- elsif ($type == $HANGUL_SYLLABLE) {
- $range_name = "Hangul Syllable";
- }
+ my $range_name = $viacode[$i];
+
+ # For the code points which end in their hex
+ # value, we eliminate that from the output
+ # annotation, and capitalize only the first
+ # letter of each word.
+ if ($type == $CP_IN_NAME) {
+ my $hex = sprintf "%04X", $i;
+ $range_name =~ s/-$hex$//;
+ my @words = split " ", $range_name;
+ for my $word (@words) {
+ $word =
+ ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $range_name = join " ", @words;
+ }
+ elsif ($type == $HANGUL_SYLLABLE) {
+ $range_name = "Hangul Syllable";
+ }
- $OUT[-1] .= " $range_name" if $range_name;
+ $OUT[-1] .= " $range_name" if $range_name;
- # Include the number of code points in the range
- my $count = main::clarify_number($range_end - $i + 1);
- $OUT[-1] .= " [$count]\n";
+ # Include the number of code points in the
+ # range
+ my $count =
+ main::clarify_number($range_end - $i + 1);
+ $OUT[-1] .= " [$count]\n";
- # Skip to the end of the range
- $i = $range_end;
- }
- else { # Not in a range.
- my $comment = "";
-
- # When outputting the names of each character, use
- # the character itself if printable
- $comment .= "'" . chr($i) . "' " if $printable[$i];
-
- # To make it more readable, use a minimum indentation
- my $comment_indent;
-
- # Determine the annotation
- if ($format eq $DECOMP_STRING_FORMAT) {
-
- # This is very specialized, with the type of
- # decomposition beginning the line enclosed in
- # <...>, and the code points that the code point
- # decomposes to separated by blanks. Create two
- # strings, one of the printable characters, and
- # one of their official names.
- (my $map = $value) =~ s/ \ * < .*? > \ +//x;
- my $tostr = "";
- my $to_name = "";
- my $to_chr = "";
- foreach my $to (split " ", $map) {
- $to = CORE::hex $to;
- $to_name .= " + " if $to_name;
- $to_chr .= chr($to);
- main::populate_char_info($to)
- if ! defined $viacode[$to];
- $to_name .= $viacode[$to];
+ # Skip to the end of the range
+ $i = $range_end;
}
+ else { # Not in a range.
+ my $comment = "";
+
+ # When outputting the names of each character,
+ # use the character itself if printable
+ $comment .= "'" . chr($i) . "' "
+ if $printable[$i];
+
+ # To make it more readable, use a minimum
+ # indentation
+ my $comment_indent;
+
+ # Determine the annotation
+ if ($format eq $DECOMP_STRING_FORMAT) {
+
+ # This is very specialized, with the type
+ # of decomposition beginning the line
+ # enclosed in <...>, and the code points
+ # that the code point decomposes to
+ # separated by blanks. Create two
+ # strings, one of the printable
+ # characters, and one of their official
+ # names.
+ (my $map = $value) =~ s/ \ * < .*? > \ +//x;
+ my $tostr = "";
+ my $to_name = "";
+ my $to_chr = "";
+ foreach my $to (split " ", $map) {
+ $to = CORE::hex $to;
+ $to_name .= " + " if $to_name;
+ $to_chr .= chr($to);
+ main::populate_char_info($to)
+ if ! defined $viacode[$to];
+ $to_name .= $viacode[$to];
+ }
- $comment .=
+ $comment .=
"=> '$to_chr'; $viacode[$i] => $to_name";
- $comment_indent = 25; # Determined by experiment
- }
- else {
-
- # Assume that any table that has hex format is a
- # mapping of one code point to another.
- if ($format eq $HEX_FORMAT) {
- my $decimal_value = CORE::hex $value;
- main::populate_char_info($decimal_value)
+ $comment_indent = 25; # Determined by
+ # experiment
+ }
+ else {
+
+ # Assume that any table that has hex
+ # format is a mapping of one code point to
+ # another.
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ main::populate_char_info($decimal_value)
if ! defined $viacode[$decimal_value];
- $comment .= "=> '"
- . chr($decimal_value)
- . "'; " if $printable[$decimal_value];
- }
- $comment .= $viacode[$i] if $include_name
- && $viacode[$i];
- if ($format eq $HEX_FORMAT) {
- my $decimal_value = CORE::hex $value;
- $comment .= " => $viacode[$decimal_value]"
- if $viacode[$decimal_value];
- }
+ $comment .= "=> '"
+ . chr($decimal_value)
+ . "'; " if $printable[$decimal_value];
+ }
+ $comment .= $viacode[$i] if $include_name
+ && $viacode[$i];
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ $comment .=
+ " => $viacode[$decimal_value]"
+ if $viacode[$decimal_value];
+ }
- # If including the name, no need to indent, as the
- # name will already be way across the line.
- $comment_indent = ($include_name) ? 0 : 60;
- }
+ # If including the name, no need to
+ # indent, as the name will already be way
+ # across the line.
+ $comment_indent = ($include_name) ? 0 : 60;
+ }
- # Use any passed in routine to output the base part of
- # the line.
- if (ref $range_size_1 eq 'CODE') {
- my $base_part = &{$range_size_1}($i, $value);
- chomp $base_part;
- push @OUT, $base_part;
- }
- else {
- push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ # Use any passed in routine to output the base
+ # part of the line.
+ if (ref $range_size_1 eq 'CODE') {
+ my $base_part=&{$range_size_1}($i, $value);
+ chomp $base_part;
+ push @OUT, $base_part;
+ }
+ else {
+ push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ }
+
+ # And add the annotation.
+ $OUT[-1] = sprintf "%-*s\t# %s",
+ $comment_indent,
+ $OUT[-1],
+ $comment
+ if $comment;
+ $OUT[-1] .= "\n";
+ }
}
+ }
- # And add the annotation.
- $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
- $OUT[-1],
- $comment if $comment;
- $OUT[-1] .= "\n";
+ # If we split the range, set up so the next time through
+ # we get the remainder, and redo.
+ if ($next_start) {
+ $start = $next_start;
+ $end = $next_end;
+ $value = $next_value;
+ $next_start = 0;
+ redo;
}
}
} # End of loop through all the table's ranges
'readable_array');
my %to_output_map;
- # Enum as to whether or not to write out this map table:
+ # Enum as to whether or not to write out this map table, and how:
# 0 don't output
# $EXTERNAL_MAP means its existence is noted in the documentation, and
# it should not be removed nor its format changed. This
# output.
# $INTERNAL_MAP means Perl reserves the right to do anything it wants
# with this file
+ # $OUTPUT_DELTAS means that it is an $INTERNAL_MAP, and instead of
+ # outputting the actual mappings, we output the delta:
+ # (mapping - code point). Doing this creates much more
+ # compact tables. The default is false unless the
+ # table's default mapping is to $CODE_POINT, and the
+ # range size is not 1.
main::set_access('to_output_map', \%to_output_map, 's');
-
sub new {
my $class = shift;
my $name = shift;
my $default_map = delete $args{'Default_Map'};
my $property = delete $args{'_Property'};
my $full_name = delete $args{'Full_Name'};
+ my $to_output_map = delete $args{'To_Output_Map'};
# Rest of parameters passed on
$anomalous_entries{$addr} = [];
$default_map{$addr} = $default_map;
+ $to_output_map{$addr} = $to_output_map;
$self->initialize($initialize) if defined $initialize;
# Don't want to output binary map tables even for debugging.
return 0 if $type == $BINARY;
- # But do want to output string ones.
- return $EXTERNAL_MAP if $type == $STRING;
+ # But do want to output string ones. All the ones that remain to
+ # be dealt with (i.e. which haven't explicitly been set to external)
+ # are for internal Perl use only.
+ return $INTERNAL_MAP if $type == $STRING;
# Otherwise is an $ENUM, do output it, for Perl's purposes
return $INTERNAL_MAP;
my $return = $self->SUPER::header();
- if ($self->to_output_map == $INTERNAL_MAP) {
+ if ($self->to_output_map >= $INTERNAL_MAP) {
$return .= $INTERNAL_ONLY_HEADER;
}
else {
return unless defined $name;
if (defined $swash_keys{$name}) {
- Carp::my_carp(join_lines(<<END
+ Carp::my_carp(main::join_lines(<<END
Already created a swash name '$name' for $swash_keys{$name}. This means that
the same name desired for $self shouldn't be used. Bad News. This must be
fixed before production use, but proceeding anyway
my $format = $self->format;
- my $return = <<END;
+ my $return = "";
+
+ my $output_deltas = ($self->to_output_map == $OUTPUT_DELTAS);
+ if ($output_deltas) {
+ if ($specials_name) {
+ $return .= <<END;
+# The mappings in the non-hash portion of this file must be modified to get the
+# correct values by adding the code point ordinal number to each.
+END
+ }
+ else {
+ $return .= <<END;
+# The mappings must be modified to get the correct values by adding the code
+# point ordinal number to each.
+END
+ }
+ }
+
+ $return .= <<END;
+
# The name this swash is to be known by, with the format of the mappings in
# the main body of the table, and what all code points missing from this file
# map to.
\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
END
if ($specials_name) {
- $return .= <<END;
+ $return .= <<END;
\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
END
}
my $default_map = $default_map{$addr};
- $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
+
+ # For $CODE_POINT default maps and using deltas, instead the default
+ # becomes zero.
+ $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
+ . (($output_deltas && $default_map eq $CODE_POINT)
+ ? "0"
+ : $default_map)
+ . "';";
if ($default_map eq $CODE_POINT) {
$return .= ' # code point maps to itself';
if $format eq $FLOAT_FORMAT
&& $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
$format = $HEX_FORMAT
- if $format eq $RATIONAL_FORMAT
- && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
+ if ($format eq $RATIONAL_FORMAT
+ && $map !~
+ m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
+ # Assume a leading zero means hex,
+ # even if all digits are 0-9
+ || ($format eq $INTEGER_FORMAT
+ && $map =~ /^0/);
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
}
Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
}
+ # If the output is a delta instead of the actual value, the format of
+ # the table that gets output is actually 'i' instead of whatever it is
+ # stored internally as.
+ my $output_deltas = ($self->to_output_map == $OUTPUT_DELTAS);
+ if ($output_deltas) {
+ $format = 'i';
+ }
+
$self->_set_format($format);
# Core Perl has a different definition of mapping ranges than we do,
$self->set_range_size_1(1) if $format eq $HEX_FORMAT;
return $self->SUPER::write(
+ $output_deltas,
($self->property == $block)
? 7 # block file needs more tab stops
: 3,
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- return $self->SUPER::write(2); # 2 tab stops
+ return $self->SUPER::write(0, 2); # No deltas; 2 tab stops
}
sub set_final_comment {
}
}
- # This entry is still missing as of 6.0, perhaps because no short name for
- # it.
- if (-e 'NameAliases.txt') {
- my $aliases = property_ref('Name_Alias');
- if (! defined $aliases) {
- $aliases = Property->new('Name_Alias');
- }
- }
-
# These are used so much, that we set globals for them.
$gc = property_ref('General_Category');
$block = property_ref('Block');
# Perl adds this alias.
$gc->add_alias('Category');
- # For backwards compatibility, these property files have particular names.
- property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
- # utf8.c calls it
- property_ref('Lowercase_Mapping')->set_file('Lower');
- property_ref('Titlecase_Mapping')->set_file('Title');
-
- my $fold = property_ref('Case_Folding');
- $fold->set_file('Fold') if defined $fold;
-
# Unicode::Normalize expects this file with this name and directory.
my $ccc = property_ref('Canonical_Combining_Class');
if (defined $ccc) {
$urs->add_alias('kRSUnicode');
}
}
+
+ # For backwards compatibility with applications that may read the mapping
+ # file directly (it was documented in 5.12 and 5.14 as being thusly
+ # usable), keep it from being compacted to use deltas. (range_size_1 is
+ # used to force the traditional format.)
+ if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
+ $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
+ $nfkc_cf->set_range_size_1(1);
+ }
+ if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
+ $bmg->set_to_output_map($EXTERNAL_MAP);
+ $bmg->set_range_size_1(1);
+ }
+
return;
}
my $input_field_count = $i;
# This routine in addition outputs these extra fields:
+
my $DECOMP_TYPE = $i++; # Decomposition type
# These fields are modifications of ones above, and are usually
Range_Size_1 => \&output_perl_charnames_line,
Type => $STRING,
);
- $perl_charname->set_proxy_for('Name', 'Name_Alias');
+ $perl_charname->set_proxy_for('Name');
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
Directory => File::Spec->curdir(),
# body of the table
Map_Type => $COMPUTE_NO_MULTI_CP,
Type => $STRING,
+ To_Output_Map => $INTERNAL_MAP,
);
$Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
$Perl_decomp->add_comment(join_lines(<<END
my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
Default_Map => "",
Perl_Extension => 1,
- File => 'Digit', # Trad. location
Directory => $map_directory,
Type => $STRING,
Range_Size_1 => 1,
# simple ones are in UnicodeData.txt, which should already have been
# read in to the full property data structures, so as to initialize
# these with the simple ones. Then the SpecialCasing.txt entries
- # overwrite the ones which have different full mappings.
+ # add or overwrite the ones which have different full mappings.
# This routine sees if the simple mappings are to be output, and if
# so, copies what has already been put into the full mapping tables,
# relatively few entries in them that have different full mappings,
# and thus skip the simple mapping tables altogether.
- # New tables with just the simple mappings that are overridden by the
- # full ones are constructed. These are for Unicode::UCD, which
- # requires the simple mappings. The Case_Folding table is a combined
- # table of both the simple and full mappings, with the full ones being
- # in the hash, and the simple ones, even those overridden by the hash,
- # being in the base table. That same mechanism could have been
- # employed here, except that the docs have said that the generated
- # files are usuable directly by programs, so we dare not change the
- # format in any way.
-
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
$uc = property_ref('uc');
# For each of the case change mappings...
- foreach my $case_table ($lc, $tc, $uc) {
- my $case = $case_table->name;
- my $full = property_ref($case);
- unless (defined $full && ! $full->is_empty) {
+ foreach my $full_table ($lc, $tc, $uc) {
+ my $full_name = $full_table->name;
+ unless (defined $full_table && ! $full_table->is_empty) {
Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
}
- # The simple version's name in each mapping merely has an 's' in
- # front of the full one's
- my $simple_name = 's' . $case;
- my $simple = property_ref($simple_name);
- $simple->initialize($full) if $simple->to_output_map();
+ # Create a table in the old-style format and with the original
+ # file name for backwards compatibility with applications that
+ # read it directly. The new tables contain both the simple and
+ # full maps, and the old are missing simple maps when there is a
+ # conflicting full one. Probably it would have been ok to add
+ # those to the legacy version, as was already done in 5.14 to the
+ # case folding one, but this was not done, out of an abundance of
+ # caution. The tables are set up here before we deal with the
+ # full maps so that as we handle those, we can override the simple
+ # maps for them in the legacy table, and merely add them in the
+ # new-style one.
+ my $legacy = Property->new("Legacy_" . $full_table->full_name,
+ File => $full_table->full_name =~
+ s/case_Mapping//r,
+ Range_Size_1 => 1,
+ Format => $HEX_FORMAT,
+ Default_Map => $CODE_POINT,
+ UCD => 0,
+ Initialize => $full_table,
+ To_Output_Map => $EXTERNAL_MAP,
+ );
- my $simple_only = Property->new("_s$case",
- Type => $STRING,
- Default_Map => $CODE_POINT,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
- $simple_only->set_to_output_map($INTERNAL_MAP);
- $simple_only->add_comment(join_lines( <<END
-This file is for UCD.pm so that it can construct simple mappings that would
-otherwise be lost because they are overridden by full mappings.
+ $full_table->add_comment(join_lines( <<END
+This file includes both the simple and full case changing maps. The simple
+ones are in the main body of the table below, and the full ones adding to or
+overriding them are in the hash.
END
));
+ # The simple version's name in each mapping merely has an 's' in
+ # front of the full one's
+ my $simple_name = 's' . $full_name;
+ my $simple = property_ref($simple_name);
+ $simple->initialize($full_table) if $simple->to_output_map();
+
unless ($simple->to_output_map()) {
- $simple_only->set_proxy_for($simple_name);
+ $full_table->set_proxy_for($simple_name);
}
}
return;
}
- $_ = "$fields[0]; lc; $fields[1]";
- $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
- $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
+ my $decimal_code_point = hex $fields[0];
- # Copy any simple case change to the special tables constructed if
- # being overridden by a multi-character case change.
- if ($fields[1] ne $fields[0]
- && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _slc; $value");
- }
- if ($fields[2] ne $fields[0]
- && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _stc; $value");
- }
- if ($fields[3] ne $fields[0]
- && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _suc; $value");
+ # Loop to handle each of the three mappings in the input line, in
+ # order, with $i indicating the current field number.
+ my $i = 0;
+ for my $object ($lc, $tc, $uc) {
+ $i++; # First time through, $i = 0 ... 3rd time = 3
+
+ my $value = $object->value_of($decimal_code_point);
+ $value = ($value eq $CODE_POINT)
+ ? $decimal_code_point
+ : hex $value;
+
+ # If this isn't a multi-character mapping, it should already have
+ # been read in.
+ if ($fields[$i] !~ / /) {
+ if ($value != hex $fields[$i]) {
+ Carp::my_carp("Bad news. UnicodeData.txt thinks "
+ . $object->name
+ . "(0x$fields[0]) is $value"
+ . " and SpecialCasing.txt thinks it is "
+ . hex $fields[$i]
+ . ". Good luck. Proceeding anyway.");
+ }
+ }
+ else {
+
+ # The mapping goes into both the legacy table, in which it
+ # replaces the simple one...
+ $file->insert_adjusted_lines("$fields[0]; Legacy_"
+ . $object->full_name
+ . "; $fields[$i]");
+
+ # ... and, the The regular table, in which it is additional,
+ # beyond the simple mapping.
+ $file->insert_adjusted_lines("$fields[0]; "
+ . $object->name
+ . "; "
+ . $CMD_DELIM
+ . "$REPLACE_CMD=$MULTIPLE_BEFORE"
+ . $CMD_DELIM
+ . $fields[$i]);
+ }
}
+ # Everything has been handled by the insert_adjusted_lines()
+ $_ = "";
+
return;
}
}
# Create the map for simple only if are going to output it, for otherwise
# it takes no part in anything we do.
my $to_output_simple;
+ my $non_final_folds;
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
}
+ $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
+ );
+
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
property_ref('Case_Folding')->add_comment(join_lines( <<END
if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
$_ = "$range; Case_Folding; "
. "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
+ if ($type eq 'F') {
+ my @string = split " ", $map;
+ for my $i (0 .. @string - 1 -1) {
+ $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
+ }
+ }
}
else {
$_ = "";
my $scx = property_ref("Script_Extensions");
$scx = Property->new("scx", Full_Name => "Script_Extensions")
- if ! defined $scx;
+ if ! defined $scx;
$scx->_set_format($STRING_WHITE_SPACE_LIST);
$scx->initialize($script);
$scx->set_default_map($script->default_map);
return;
}
-sub setup_v6_name_alias {
- property_ref('Name_Alias')->add_map(7, 7, "ALERT: control");
+sub setup_early_name_alias {
+ my $aliases = property_ref('Name_Alias');
+ $aliases = Property->new('Name_Alias') if ! defined $aliases;
+
+ # Before 6.0, this wasn't a problem, and after it, this alias is part of
+ # the Unicode-delivered file.
+ $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
+ return;
+}
+
+sub filter_later_version_name_alias_line {
+
+ # This file has an extra entry per line for the alias type. This is
+ # handled by creating a compound entry: "$alias: $type"; First, split
+ # the line into components.
+ my ($range, $alias, $type, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+
+ # This file contains multiple entries for some components, so tell the
+ # downstream code to allow this in our internal tables; the
+ # $MULTIPLE_AFTER preserves the input ordering.
+ $_ = join ";", $range, $CMD_DELIM
+ . $REPLACE_CMD
+ . '='
+ . $MULTIPLE_AFTER
+ . $CMD_DELIM
+ . "$alias: $type",
+ @remainder;
+ return;
}
sub filter_early_version_name_alias_line {
- $_ .= ": correction";
+
+ # Early versions did not have the trailing alias type field; implicitly it
+ # was 'correction'
+ $_ .= "; correction";
+ filter_later_version_name_alias_line;
return;
}
# 3) Calculates all the regular expression match tables based on the
# mappings.
# 3) Calculates and adds the tables which are defined by Unicode, but
- # which aren't derived by them
+ # which aren't derived by them, and certain derived tables that Perl
+ # uses.
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
Lowercase_Mapping
Titlecase_Mapping
Case_Folding
- } ) {
+ } )
+ {
my $full = property_ref($map);
if ($full->is_empty) {
my $simple = property_ref('Simple_' . $map);
}
}
+ # Create digit and case fold tables with the original file names for
+ # backwards compatibility with applications that read them directly.
+ my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
+ Default_Map => "",
+ Perl_Extension => 1,
+ File => 'Digit', # Trad. location
+ Directory => $map_directory,
+ UCD => 0,
+ Type => $STRING,
+ To_Output_Map => $EXTERNAL_MAP,
+ Range_Size_1 => 1,
+ Initialize => property_ref('Perl_Decimal_Digit'),
+ );
+ $Digit->add_comment(join_lines(<<END
+This file gives the mapping of all code points which represent a single
+decimal digit [0-9] to their respective digits. For example, the code point
+U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
+that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
+numerals.
+END
+ ));
+
+ Property->new('Legacy_Case_Folding',
+ File => "Fold",
+ Directory => $map_directory,
+ Default_Map => $CODE_POINT,
+ UCD => 0,
+ Range_Size_1 => 1,
+ Type => $STRING,
+ To_Output_Map => $EXTERNAL_MAP,
+ Format => $HEX_FORMAT,
+ Initialize => property_ref('cf'),
+ );
+
# The Script_Extensions property started out as a clone of the Script
# property. But processing its data file caused some elements to be
# replaced with different data. (These elements were for the Common and
my $alias = property_ref('Name_Alias');
if (defined $alias) {
push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+ my $unicode_1 = property_ref('Unicode_1_Name');
+ my %abbreviations;
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type.
+ # Corrections go before said entry, as they should be returned in
+ # preference over the existing entry. (A correction to a correction
+ # should be later in the Name_Alias table, so it will correctly
+ # precede the erroneous correction in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved
+ # temporarily in a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that.
+ # (Most of them will be the same anyway, in which case we don't add a
+ # duplicate)
+
$alias->reset_each_range;
while (my ($range) = $alias->each_range) {
next if $range->value eq "";
- if ($range->start != $range->end) {
- Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
}
- $perl_charname->add_duplicate($range->start,
- $range->value =~ s/:.*//r);
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
+ }
+ elsif ($type eq 'control') {
+ my $unicode_1_value = $unicode_1->value_of($code_point);
+ next if $unicode_1_value eq $value;
+ $replace_type = $MULTIPLE_AFTER;
+ }
+ else {
+ $replace_type = $MULTIPLE_AFTER;
+ }
+
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
}
$alias_sentence = <<END;
-The Name_Alias property adds duplicate code point entries with a corrected
-name. The original (less correct, but still valid) name will be physically
-last.
+The Name_Alias property adds duplicate code point entries that are
+alternatives to the original name. If an addition is a corrected
+name, it will be physically first in the table. The original (less correct,
+but still valid) name will be next; then any alternatives, in no particular
+order; and finally any abbreviations, again in no particular order.
END
}
+
my $comment;
if (@composition <= 2) { # Always at least 2
$comment = join " and ", @composition;
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
-Unicode_1_Name entries are used only for otherwise nameless code
-points.
+Unicode_1_Name entries are used only for nameless code points in the Name
+property.
$alias_sentence
This file doesn't include the algorithmically determinable names. For those,
use 'unicore/Name.pm'
|| $ucd_pod{$standard}{'perl_extension'} == $perl_extension
|| $output_this == $perl_extension)
{
- Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway.");
+ Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
}
# We modifiy the info column of the one being output to
# others except DAge.txt (as data in an extracted file can be over-ridden by
# the non-extracted. Some other files depend on data derived from an earlier
# file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode. Mostly, it safest to order by first
+# folding requires data from Unicode. Mostly, it is safest to order by first
# version releases in (except the Jamo). DAge.txt is read before the
# extracted ones because of the rarely used feature $compare_versions. In the
# unlikely event that there were ever an extracted file that contained the Age
),
Input_file->new('NameAliases.txt', v5.0.0,
Property => 'Name_Alias',
- Pre_Handler => ($v_version ge v6.0.0)
- ? \&setup_v6_name_alias
+ Pre_Handler => ($v_version le v6.0.0)
+ ? \&setup_early_name_alias
: undef,
- Each_Line_Handler =>
- \&filter_early_version_name_alias_line,
+ Each_Line_Handler => ($v_version le v6.0.0)
+ ? \&filter_early_version_name_alias_line
+ : \&filter_later_version_name_alias_line,
),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => 'Validation Tests',