-makelist : Rewrite the file list $file_list based on current setup
-annotate : Output an annotation for each character in the table files;
useful for debugging mktables, looking at diffs; but is slow,
- memory intensive; resulting tables are usable but slow and
- very large.
+ memory intensive; resulting tables are usable but are slow and
+ very large (and currently fail the Unicode::UCD.t tests).
-check A B : Executes $0 only if A and B are the same
END
}
if $v_version ge v4.1.0;
push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
if $v_version ge v6.0.0;
+push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
+ if $v_version ge v6.1.0;
+push @tables_that_may_be_empty, '_stc';
# The lists below are hashes, so the key is the item in the list, and the
# value is the reason why it is in the list. This makes generation of
# its name
if ($seen_non_extracted_non_age) {
if ($file =~ /$EXTRACTED/i) {
- Carp::my_carp_bug(join_lines(<<END
+ Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
have subtle problems
for my $try_hard (0, 1) {
# Look through all the ranges for a usable code point.
- for my $set ($self->ranges) {
+ for my $set (reverse $self->ranges) {
# Try the edge cases first, starting with the end point of the
# range.
return unless defined $name;
if (defined $swash_keys{$name}) {
- Carp::my_carp(join_lines(<<END
+ Carp::my_carp(main::join_lines(<<END
Already created a swash name '$name' for $swash_keys{$name}. This means that
the same name desired for $self shouldn't be used. Bad News. This must be
fixed before production use, but proceeding anyway
if $format eq $FLOAT_FORMAT
&& $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
$format = $HEX_FORMAT
- if $format eq $RATIONAL_FORMAT
- && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
+ if ($format eq $RATIONAL_FORMAT
+ && $map !~
+ m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
+ # Assume a leading zero means hex,
+ # even if all digits are 0-9
+ || ($format eq $INTEGER_FORMAT
+ && $map =~ /^0/);
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
}
}
}
- # This entry is still missing as of 6.0, perhaps because no short name for
- # it.
- if (-e 'NameAliases.txt') {
- my $aliases = property_ref('Name_Alias');
- if (! defined $aliases) {
- $aliases = Property->new('Name_Alias');
- }
- }
-
# These are used so much, that we set globals for them.
$gc = property_ref('General_Category');
$block = property_ref('Block');
Range_Size_1 => \&output_perl_charnames_line,
Type => $STRING,
);
- $perl_charname->set_proxy_for('Name', 'Name_Alias');
+ $perl_charname->set_proxy_for('Name');
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
Directory => File::Spec->curdir(),
return;
}
- $_ = "$fields[0]; lc; $fields[1]";
- $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
- $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
+ my $decimal_code_point = hex $fields[0];
- # Copy any simple case change to the special tables constructed if
- # being overridden by a multi-character case change.
- if ($fields[1] ne $fields[0]
- && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _slc; $value");
- }
- if ($fields[2] ne $fields[0]
- && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _stc; $value");
- }
- if ($fields[3] ne $fields[0]
- && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
- {
- $file->insert_adjusted_lines("$fields[0]; _suc; $value");
+ # Loop to handle each of the three mappings in the input line, in
+ # order, with $i indicating the current field number.
+ my $i = 0;
+ for my $object ($lc, $tc, $uc) {
+ $i++; # First time through, $i = 0 ... 3rd time = 3
+
+ my $value = $object->value_of($decimal_code_point);
+ $value = ($value eq $CODE_POINT)
+ ? $decimal_code_point
+ : hex $value;
+
+ # If this isn't a multi-character mapping, it should already have
+ # been read in.
+ if ($fields[$i] !~ / /) {
+ if ($value != hex $fields[$i]) {
+ Carp::my_carp("Bad news. UnicodeData.txt thinks "
+ . $object->name
+ . "(0x$fields[0]) is $value"
+ . " and SpecialCasing.txt thinks it is "
+ . hex $fields[$i]
+ . ". Good luck. Proceeding anyway.");
+ }
+ }
+ else {
+ $file->insert_adjusted_lines("$fields[0]; "
+ . $object->full_name
+ . "; $fields[$i]");
+
+ # Copy any simple case change to the special tables
+ # constructed if being overridden by a multi-character case
+ # change.
+ if ($value != $decimal_code_point) {
+ $file->insert_adjusted_lines(sprintf("%s; _s%s; %04X",
+ $fields[0],
+ $object->name,
+ $value));
+ }
+ }
}
+ # Everything has been handled by the insert_adjusted_lines()
+ $_ = "";
+
return;
}
}
# Create the map for simple only if are going to output it, for otherwise
# it takes no part in anything we do.
my $to_output_simple;
+ my $non_final_folds;
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
}
+ $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Description => "Code points that particpate in a multi-char fold not in the final position",
+ );
+
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
property_ref('Case_Folding')->add_comment(join_lines( <<END
if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
$_ = "$range; Case_Folding; "
. "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
+ if ($type eq 'F') {
+ my @string = split " ", $map;
+ for my $i (0 .. @string - 1 -1) {
+ $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
+ }
+ }
}
else {
$_ = "";
END
));
- # Make the scx's tables and aliases for them the same as sc's
+ # Initialize scx's tables and the aliases for them to be the same as sc's
foreach my $table ($script->tables) {
my $scx_table = $scx->add_match_table($table->name,
Full_Name => $table->full_name);
return;
}
-sub setup_v6_name_alias {
- property_ref('Name_Alias')->add_map(7, 7, "ALERT");
+sub setup_early_name_alias {
+ my $aliases = property_ref('Name_Alias');
+ $aliases = Property->new('Name_Alias') if ! defined $aliases;
+
+ # Before 6.0, this wasn't a problem, and after it, this alias is part of
+ # the Unicode-delivered file.
+ $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
+ return;
+}
+
+sub filter_later_version_name_alias_line {
+
+ # This file has an extra entry per line for the alias type. This is
+ # handled by creating a compound entry: "$alias: $type"; First, split
+ # the line into components.
+ my ($range, $alias, $type, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+
+ # This file contains multiple entries for some components, so tell the
+ # downstream code to allow this in our internal tables; the
+ # $MULTIPLE_AFTER preserves the input ordering.
+ $_ = join ";", $range, $CMD_DELIM
+ . $REPLACE_CMD
+ . '='
+ . $MULTIPLE_AFTER
+ . $CMD_DELIM
+ . "$alias: $type",
+ @remainder;
+ return;
+}
+
+sub filter_early_version_name_alias_line {
+
+ # Early versions did not have the trailing alias type field; implicitly it
+ # was 'correction'
+ $_ .= "; correction";
+ filter_later_version_name_alias_line;
+ return;
}
sub finish_Unicode() {
my $alias = property_ref('Name_Alias');
if (defined $alias) {
push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+ my $unicode_1 = property_ref('Unicode_1_Name');
+ my %abbreviations;
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type.
+ # Corrections go before said entry, as they should be returned in
+ # preference over the existing entry. (A correction to a correction
+ # should be later in the Name_Alias table, so it will correctly
+ # precede the erroneous correction in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved
+ # temporarily in a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that.
+ # (Most of them will be the same anyway, in which case we don't add a
+ # duplicate)
+
$alias->reset_each_range;
while (my ($range) = $alias->each_range) {
next if $range->value eq "";
- if ($range->start != $range->end) {
- Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
}
- $perl_charname->add_duplicate($range->start, $range->value);
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
+ }
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
+ }
+ elsif ($type eq 'control') {
+ my $unicode_1_value = $unicode_1->value_of($code_point);
+ next if $unicode_1_value eq $value;
+ $replace_type = $MULTIPLE_AFTER;
+ }
+ else {
+ $replace_type = $MULTIPLE_AFTER;
+ }
+
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
}
$alias_sentence = <<END;
-The Name_Alias property adds duplicate code point entries with a corrected
-name. The original (less correct, but still valid) name will be physically
-last.
+The Name_Alias property adds duplicate code point entries that are
+alternatives to the original name. If an addition is a corrected
+name, it will be physically first in the table. The original (less correct,
+but still valid) name will be next; then any alternatives, in no particular
+order; and finally any abbreviations, again in no particular order.
END
}
+
my $comment;
if (@composition <= 2) { # Always at least 2
$comment = join " and ", @composition;
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
-Unicode_1_Name entries are used only for otherwise nameless code
-points.
+Unicode_1_Name entries are used only for nameless code points in the Name
+property.
$alias_sentence
This file doesn't include the algorithmically determinable names. For those,
use 'unicore/Name.pm'
# others except DAge.txt (as data in an extracted file can be over-ridden by
# the non-extracted. Some other files depend on data derived from an earlier
# file, like UnicodeData requires data from Jamo, and the case changing and
-# folding requires data from Unicode. Mostly, it safest to order by first
+# folding requires data from Unicode. Mostly, it is safest to order by first
# version releases in (except the Jamo). DAge.txt is read before the
# extracted ones because of the rarely used feature $compare_versions. In the
# unlikely event that there were ever an extracted file that contained the Age
),
Input_file->new('NameAliases.txt', v5.0.0,
Property => 'Name_Alias',
- Pre_Handler => ($v_version ge v6.0.0)
- ? \&setup_v6_name_alias
+ Pre_Handler => ($v_version le v6.0.0)
+ ? \&setup_early_name_alias
: undef,
+ Each_Line_Handler => ($v_version le v6.0.0)
+ ? \&filter_early_version_name_alias_line
+ : \&filter_later_version_name_alias_line,
),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => 'Validation Tests',