$0 = File::Spec->canonpath($0);
my $make_test_script = 0; # ? Should we output a test script
+my $make_norm_test_script = 0; # ? Should we output a normalization test script
my $write_unchanged_files = 0; # ? Should we update the output files even if
# we don't think they have changed
my $use_directory = ""; # ? Should we chdir somewhere.
{
$make_test_script = 1;
}
+ elsif ($arg eq '-makenormtest')
+ {
+ $make_norm_test_script = 1;
+ }
elsif ($arg eq '-makelist') {
$make_list = 1;
}
# anonymous hash.
my @code_points_ending_in_code_point;
+# To hold Unicode's normalization test suite
+my @normalization_tests;
+
# Boolean: does this Unicode version have the hangul syllables, and are we
# writing out a table for them?
my $has_hangul_syllables = 0;
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
;
sub _operator_stringify {
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
+ # If this is an exact duplicate, including the
+ # value, the original will be moved to be
+ # first, before any other duplicate ranges
+ # with different values.
# => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
# FIFO, so that this one is inserted after all
- # others that currently exist.
+ # others that currently exist. If this is an
+ # exact duplicate, including value, of an
+ # existing range, this one is discarded
+ # (leaving the existing one in its original,
+ # higher priority position
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
return;
}
+ if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
+ Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
+ }
#local $to_trace = 1 if main::DEBUG;
if ($operation eq '-') {
# If to place this new record after, move to beyond all existing
# ones; but don't add this one if identical to any of them, as it
- # isn't really a multiple
+ # isn't really a multiple. This leaves the original order, so
+ # that the current request is ignored. The reasoning is that the
+ # previous request that wanted this record to have high priority
+ # should have precedence.
if ($replace == $MULTIPLE_AFTER) {
while ($i < @$r && $r->[$i]->start == $start) {
return if $value eq $r->[$i]->value
$i++;
}
}
+ else {
+ # If instead we are to place this new record before any
+ # existing ones, remove any identical ones that come after it.
+ # This changes the existing order so that the new one is
+ # first, as is being requested.
+ for (my $j = $i + 1;
+ $j < @$r && $r->[$j]->start == $start;
+ $j++)
+ {
+ if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
+ splice @$r, $j, 1;
+ last; # There should only be one instance, so no
+ # need to keep looking
+ }
+ }
+ }
trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
my @return = splice @$r,
return $self->_union($other)
},
+ '+=' => sub { my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' += '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
+
+ return $self->_union($other)
+ },
'&' => sub { my $self = shift;
my $other = shift;
use overload
fallback => 0,
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
'!=' => \&main::_operator_not_equal,
'==' => \&main::_operator_equal,
;
my $comment = "";
my $status = $self->status;
- if ($status) {
+ if ($status && $status ne $PLACEHOLDER) {
my $warn = uc $status_past_participles{$status};
$comment .= <<END;
'+=' => sub {
my $self = shift;
my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' += '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
return if $self->carp_if_locked;
}
return $self;
},
+ '&=' => sub {
+ my $self = shift;
+ my $other = shift;
+
+ return if $self->carp_if_locked;
+ $self->_set_range_list($self->_range_list & $other);
+ return $self;
+ },
'-' => sub { my $self = shift;
my $other = shift;
my $reversed = shift;
my $flag = $property->status
|| $table->status
|| $table_alias_object->status;
- $flags{$flag} = $status_past_participles{$flag} if $flag;
+ if ($flag && $flag ne $PLACEHOLDER) {
+ $flags{$flag} = $status_past_participles{$flag};
+ }
$loose_count++;
fallback => 0,
qw("") => "_operator_stringify",
"." => \&main::_operator_dot,
+ ".=" => \&main::_operator_dot_equal,
'==' => \&main::_operator_equal,
'!=' => \&main::_operator_not_equal,
'=' => sub { return shift },
: "$self$other";
}
+sub _operator_dot_equal {
+ # Overloaded '.=' method that is common to all packages.
+
+ my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ $other = "" unless defined $other;
+
+ if ($reversed) {
+ return $other .= "$self";
+ }
+ else {
+ return "$self" . "$other";
+ }
+}
+
sub _operator_equal {
# Generic overloaded '==' routine. To be equal, they must be the exact
# same object
# Process each line of the file ...
while ($file->next_line) {
+ # Fix typo in input file
+ s/CCC133/CCC132/g if $v_version eq v6.1.0;
+
my ($property, @data) = split /\s*;\s*/;
# The ccc property has an extra field at the beginning, which is the
return @return;
}
+sub process_NormalizationsTest {
+
+ # Each line looks like:
+ # source code point; NFC; NFD; NFKC; NFKD
+ # e.g.
+ # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
+
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # Process each line of the file ...
+ while ($file->next_line) {
+
+ next if /^@/;
+
+ my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
+
+ foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
+ $$var = pack "U0U*", map { hex } split " ", $$var;
+ $$var =~ s/(\\)/$1$1/g;
+ }
+
+ push @normalization_tests,
+ "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
+ } # End of looping through the file
+}
+
sub output_perl_charnames_line ($$) {
# Output the entries in Perl_charnames specially, using 5 digits instead
$file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
$file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
+ $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
$fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
}
elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Flush the buffers.
- foreach my $i (1 .. $last_field) {
+ foreach my $i (0 .. $last_field) {
$file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
}
# http://www.unicode.org/versions/corrigendum8.html
$fields[$BIDI] = "AL";
}
- elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
+ elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
$fields[$CHARNAME] = "";
}
}
sub setup_early_name_alias {
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # This has the effect of pretending that the Name_Alias property was
+ # available in all Unicode releases. Strictly speaking, this property
+ # should not be availabe in early releases, but doing this allows
+ # charnames.pm to work on older releases without change. Prior to v5.16
+ # it had these names hard-coded inside it. Unicode 6.1 came along and
+ # created these names, and so they were removed from charnames.
+
my $aliases = property_ref('Name_Alias');
- $aliases = Property->new('Name_Alias') if ! defined $aliases;
+ if (! defined $aliases) {
+ $aliases = Property->new('Name_Alias', Default_Map => "");
+ }
+
+ $file->insert_lines(get_old_name_aliases());
- # Before 6.0, this wasn't a problem, and after it, this alias is part of
- # the Unicode-delivered file.
- $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
return;
}
+sub get_old_name_aliases () {
+
+ # The Unicode_1_Name field, contains most of these names. One would
+ # expect, given the field's name, that its values would be fixed across
+ # versions, giving the true Unicode version 1 name for the character.
+ # Sadly, this is not the case. Actually Version 1.1.5 had no names for
+ # any of the controls; Version 2.0 introduced names for the C0 controls,
+ # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
+ # changed some names: it
+ # changed to parenthesized versions like "NEXT LINE" to
+ # "NEXT LINE (NEL)";
+ # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
+ # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
+ # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
+ # This list contains all the names that were defined so that
+ # charnames::vianame(), etc. understand them all EVEN if this version of
+ # Unicode didn't specify them (this could be construed as a bug).
+ # mktables elsewhere gives preference to the Unicode_1_Name field over
+ # these names, so that viacode() will return the correct value for that
+ # version of Unicode, except when that version doesn't define a name,
+ # viacode() will return one anyway (this also could be construed as a
+ # bug). But these potential "bugs" allow for the smooth working of code
+ # on earlier Unicode releases.
+
+ my @return = split /\n/, <<'END';
+0000;NULL;control
+0000;NUL;abbreviation
+0001;START OF HEADING;control
+0001;SOH;abbreviation
+0002;START OF TEXT;control
+0002;STX;abbreviation
+0003;END OF TEXT;control
+0003;ETX;abbreviation
+0004;END OF TRANSMISSION;control
+0004;EOT;abbreviation
+0005;ENQUIRY;control
+0005;ENQ;abbreviation
+0006;ACKNOWLEDGE;control
+0006;ACK;abbreviation
+0007;BELL;control
+0007;BEL;abbreviation
+0008;BACKSPACE;control
+0008;BS;abbreviation
+0009;CHARACTER TABULATION;control
+0009;HORIZONTAL TABULATION;control
+0009;HT;abbreviation
+0009;TAB;abbreviation
+000A;LINE FEED;control
+000A;LINE FEED (LF);control
+000A;NEW LINE;control
+000A;END OF LINE;control
+000A;LF;abbreviation
+000A;NL;abbreviation
+000A;EOL;abbreviation
+000B;LINE TABULATION;control
+000B;VERTICAL TABULATION;control
+000B;VT;abbreviation
+000C;FORM FEED;control
+000C;FORM FEED (FF);control
+000C;FF;abbreviation
+000D;CARRIAGE RETURN;control
+000D;CARRIAGE RETURN (CR);control
+000D;CR;abbreviation
+000E;SHIFT OUT;control
+000E;LOCKING-SHIFT ONE;control
+000E;SO;abbreviation
+000F;SHIFT IN;control
+000F;LOCKING-SHIFT ZERO;control
+000F;SI;abbreviation
+0010;DATA LINK ESCAPE;control
+0010;DLE;abbreviation
+0011;DEVICE CONTROL ONE;control
+0011;DC1;abbreviation
+0012;DEVICE CONTROL TWO;control
+0012;DC2;abbreviation
+0013;DEVICE CONTROL THREE;control
+0013;DC3;abbreviation
+0014;DEVICE CONTROL FOUR;control
+0014;DC4;abbreviation
+0015;NEGATIVE ACKNOWLEDGE;control
+0015;NAK;abbreviation
+0016;SYNCHRONOUS IDLE;control
+0016;SYN;abbreviation
+0017;END OF TRANSMISSION BLOCK;control
+0017;ETB;abbreviation
+0018;CANCEL;control
+0018;CAN;abbreviation
+0019;END OF MEDIUM;control
+0019;EOM;abbreviation
+001A;SUBSTITUTE;control
+001A;SUB;abbreviation
+001B;ESCAPE;control
+001B;ESC;abbreviation
+001C;INFORMATION SEPARATOR FOUR;control
+001C;FILE SEPARATOR;control
+001C;FS;abbreviation
+001D;INFORMATION SEPARATOR THREE;control
+001D;GROUP SEPARATOR;control
+001D;GS;abbreviation
+001E;INFORMATION SEPARATOR TWO;control
+001E;RECORD SEPARATOR;control
+001E;RS;abbreviation
+001F;INFORMATION SEPARATOR ONE;control
+001F;UNIT SEPARATOR;control
+001F;US;abbreviation
+0020;SP;abbreviation
+007F;DELETE;control
+007F;DEL;abbreviation
+0080;PADDING CHARACTER;figment
+0080;PAD;abbreviation
+0081;HIGH OCTET PRESET;figment
+0081;HOP;abbreviation
+0082;BREAK PERMITTED HERE;control
+0082;BPH;abbreviation
+0083;NO BREAK HERE;control
+0083;NBH;abbreviation
+0084;INDEX;control
+0084;IND;abbreviation
+0085;NEXT LINE;control
+0085;NEXT LINE (NEL);control
+0085;NEL;abbreviation
+0086;START OF SELECTED AREA;control
+0086;SSA;abbreviation
+0087;END OF SELECTED AREA;control
+0087;ESA;abbreviation
+0088;CHARACTER TABULATION SET;control
+0088;HORIZONTAL TABULATION SET;control
+0088;HTS;abbreviation
+0089;CHARACTER TABULATION WITH JUSTIFICATION;control
+0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
+0089;HTJ;abbreviation
+008A;LINE TABULATION SET;control
+008A;VERTICAL TABULATION SET;control
+008A;VTS;abbreviation
+008B;PARTIAL LINE FORWARD;control
+008B;PARTIAL LINE DOWN;control
+008B;PLD;abbreviation
+008C;PARTIAL LINE BACKWARD;control
+008C;PARTIAL LINE UP;control
+008C;PLU;abbreviation
+008D;REVERSE LINE FEED;control
+008D;REVERSE INDEX;control
+008D;RI;abbreviation
+008E;SINGLE SHIFT TWO;control
+008E;SINGLE-SHIFT-2;control
+008E;SS2;abbreviation
+008F;SINGLE SHIFT THREE;control
+008F;SINGLE-SHIFT-3;control
+008F;SS3;abbreviation
+0090;DEVICE CONTROL STRING;control
+0090;DCS;abbreviation
+0091;PRIVATE USE ONE;control
+0091;PRIVATE USE-1;control
+0091;PU1;abbreviation
+0092;PRIVATE USE TWO;control
+0092;PRIVATE USE-2;control
+0092;PU2;abbreviation
+0093;SET TRANSMIT STATE;control
+0093;STS;abbreviation
+0094;CANCEL CHARACTER;control
+0094;CCH;abbreviation
+0095;MESSAGE WAITING;control
+0095;MW;abbreviation
+0096;START OF GUARDED AREA;control
+0096;START OF PROTECTED AREA;control
+0096;SPA;abbreviation
+0097;END OF GUARDED AREA;control
+0097;END OF PROTECTED AREA;control
+0097;EPA;abbreviation
+0098;START OF STRING;control
+0098;SOS;abbreviation
+0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
+0099;SGC;abbreviation
+009A;SINGLE CHARACTER INTRODUCER;control
+009A;SCI;abbreviation
+009B;CONTROL SEQUENCE INTRODUCER;control
+009B;CSI;abbreviation
+009C;STRING TERMINATOR;control
+009C;ST;abbreviation
+009D;OPERATING SYSTEM COMMAND;control
+009D;OSC;abbreviation
+009E;PRIVACY MESSAGE;control
+009E;PM;abbreviation
+009F;APPLICATION PROGRAM COMMAND;control
+009F;APC;abbreviation
+00A0;NBSP;abbreviation
+00AD;SHY;abbreviation
+200B;ZWSP;abbreviation
+200C;ZWNJ;abbreviation
+200D;ZWJ;abbreviation
+200E;LRM;abbreviation
+200F;RLM;abbreviation
+202A;LRE;abbreviation
+202B;RLE;abbreviation
+202C;PDF;abbreviation
+202D;LRO;abbreviation
+202E;RLO;abbreviation
+FEFF;BYTE ORDER MARK;alternate
+FEFF;BOM;abbreviation
+FEFF;ZWNBSP;abbreviation
+END
+
+ if ($v_version ge v3.0.0) {
+ push @return, split /\n/, <<'END';
+180B; FVS1; abbreviation
+180C; FVS2; abbreviation
+180D; FVS3; abbreviation
+180E; MVS; abbreviation
+202F; NNBSP; abbreviation
+END
+ }
+
+ if ($v_version ge v3.2.0) {
+ push @return, split /\n/, <<'END';
+034F; CGJ; abbreviation
+205F; MMSP; abbreviation
+2060; WJ; abbreviation
+END
+ # Add in VS1..VS16
+ my $cp = 0xFE00 - 1;
+ for my $i (1..16) {
+ push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+ }
+ }
+ if ($v_version ge v4.0.0) { # Add in VS17..VS256
+ my $cp = 0xE0100 - 17;
+ for my $i (17..256) {
+ push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+ }
+ }
+
+ # ALERT did not come along until 6.0, at which point it became preferred
+ # over BELL, and was never in the Unicode_1_Name field. For the same
+ # reasons, that the other names are made known to all releases by this
+ # function, we make ALERT known too. By inserting it
+ # last in early releases, BELL is preferred over it; and vice-vers in 6.0
+ my $alert = '0007; ALERT; control';
+ if ($v_version lt v6.0.0) {
+ push @return, $alert;
+ }
+ else {
+ unshift @return, $alert;
+ }
+
+ return @return;
+}
+
sub filter_later_version_name_alias_line {
# This file has an extra entry per line for the alias type. This is
sub filter_early_version_name_alias_line {
# Early versions did not have the trailing alias type field; implicitly it
- # was 'correction'
- $_ .= "; correction";
+ # was 'correction'. But our synthetic lines we add in this program do
+ # have it, so test for the type field.
+ $_ .= "; correction" if $_ !~ /;.*;/;
+
filter_later_version_name_alias_line;
return;
}
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
- # 1) Adds the mappings for code points missing from the files which have
+ # 1) Creates properties that are missing from the version of Unicode being
+ # compiled, and which, for whatever reason, are needed for the Perl
+ # core to function properly. These are minimally populated as
+ # necessary.
+ # 2) Adds the mappings for code points missing from the files which have
# defaults specified for them.
- # 2) At this this point all mappings are known, so it computes the type of
+ # 3) At this this point all mappings are known, so it computes the type of
# each property whose type hasn't been determined yet.
- # 3) Calculates all the regular expression match tables based on the
+ # 4) Calculates all the regular expression match tables based on the
# mappings.
- # 3) Calculates and adds the tables which are defined by Unicode, but
+ # 5) Calculates and adds the tables which are defined by Unicode, but
# which aren't derived by them, and certain derived tables that Perl
# uses.
+ # GCB and hst are not in early Unicode releases; create dummy ones if
+ # they don't exist, as the core needs tables generated from them.
+ my $gcb = property_ref('Grapheme_Cluster_Break');
+ if (! defined $gcb) {
+ $gcb = Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
+ Status => $PLACEHOLDER,
+ Type => $ENUM,
+ Default_Map => 'Other');
+ }
+ my $hst = property_ref('Hangul_Syllable_Type');
+ if (!defined $hst) {
+ $hst = Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
+ Status => $PLACEHOLDER,
+ Type => $ENUM,
+ Default_Map => 'Not_Applicable');
+ }
+
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
# default is a reference to a data structure, and requires data from other
# identical code points, but their caseless equivalents are not the same,
# one being 'Cased' and the other being 'LC', and so now must be kept as
# separate entities.
- $Title += $lt if defined $lt;
+ if (defined $lt) {
+ $Title += $lt;
+ }
+ else {
+ push @tables_that_may_be_empty, $Title->complete_name;
+ }
# If this Unicode version doesn't have Cased, set up our own. From
# Unicode 5.1: Definition D120: A character C is defined to be cased if
# Perl's traditional space doesn't include Vertical Tab
my $XPerlSpace = $perl->add_match_table('XPerlSpace',
Description => '\s, including beyond ASCII',
- Initialize => $Space - 0x000B,
+ #Initialize => $Space - 0x000B,
+ Initialize => $Space,
);
$XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
my $PerlSpace = $perl->add_match_table('PerlSpace',
- Description => '\s, restricted to ASCII = [ \f\n\r\t]',
+ Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
Initialize => $XPerlSpace & $ASCII,
);
# The 'extended' grapheme cluster came in 5.1. The non-extended
# definition differs too much from the traditional Perl one to use.
- if (defined $gcb && defined $gcb->table('SpacingMark')) {
+ if (defined $gcb->table('SpacingMark')) {
- # Note that assumes HST is defined; it came in an earlier release than
+ # Note that assumes hst is defined; it came in an earlier release than
# GCB. In the line below, two negatives means: yes hangul
$begin += ~ property_ref('Hangul_Syllable_Type')
->table('Not_Applicable')
# We set things up so the Perl core degrades gracefully, possibly with
# placeholders that match nothing.
- if (! defined $gcb) {
- $gcb = Property->new('GCB', Status => $PLACEHOLDER);
- }
- my $hst = property_ref('HST');
- if (!defined $hst) {
- $hst = Property->new('HST', Status => $PLACEHOLDER);
- $hst->add_match_table('Not_Applicable',
- Initialize => $Any,
- Matches_All => 1);
- }
+ my $hst = property_ref('Hangul_Syllable_Type');
# On some releases, here we may not have the needed tables for the
# perl core, in some releases we may.
push @tables_that_may_be_empty, $table->complete_name;
}
- # The HST property predates the GCB one, and has identical tables
+ # The hst property predates the GCB one, and has identical tables
# for some of them, so use it if we can.
- if ($table->is_empty
- && defined $hst
- && defined $hst->table($name))
+ if ($table->is_empty && defined $hst->table($name))
{
$table += $hst->table($name);
}
push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
} else {
$lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
+ $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
}
# Was previously constructed to contain both Name and Unicode_1_Name
}
my $alias_sentence = "";
+ my %abbreviations;
my $alias = property_ref('Name_Alias');
- if (defined $alias) {
- push @composition, 'Name_Alias';
- $perl_charname->set_proxy_for('Name_Alias');
- my %abbreviations;
-
- # Add each entry in Name_Alias to Perl_Charnames. Where these go with
- # respect to any existing entry depends on the entry type.
- # Corrections go before said entry, as they should be returned in
- # preference over the existing entry. (A correction to a correction
- # should be later in the Name_Alias table, so it will correctly
- # precede the erroneous correction in Perl_Charnames.)
- #
- # Abbreviations go after everything else, so they are saved
- # temporarily in a hash for later.
- #
- # Controls are currently added afterwards. This is because Perl has
- # previously used the Unicode1 name, and so should still use that.
- # (Most of them will be the same anyway, in which case we don't add a
- # duplicate)
-
- $alias->reset_each_range;
- while (my ($range) = $alias->each_range) {
- next if $range->value eq "";
- my $code_point = $range->start;
- if ($code_point != $range->end) {
- Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
- }
- my ($value, $type) = split ': ', $range->value;
- my $replace_type;
- if ($type eq 'correction') {
- $replace_type = $MULTIPLE_BEFORE;
- }
- elsif ($type eq 'abbreviation') {
-
- # Save for later
- $abbreviations{$value} = $code_point;
- next;
- }
- elsif ($type eq 'control') {
- $replace_type = $MULTIPLE_AFTER;
- }
- else {
- $replace_type = $MULTIPLE_AFTER;
- }
-
- # Actually add; before or after current entry(ies) as determined
- # above.
- $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ push @composition, 'Name_Alias';
+ $perl_charname->set_proxy_for('Name_Alias');
+
+ # Add each entry in Name_Alias to Perl_Charnames. Where these go with
+ # respect to any existing entry depends on the entry type. Corrections go
+ # before said entry, as they should be returned in preference over the
+ # existing entry. (A correction to a correction should be later in the
+ # Name_Alias table, so it will correctly precede the erroneous correction
+ # in Perl_Charnames.)
+ #
+ # Abbreviations go after everything else, so they are saved temporarily in
+ # a hash for later.
+ #
+ # Controls are currently added afterwards. This is because Perl has
+ # previously used the Unicode1 name, and so should still use that. (Most
+ # of them will be the same anyway, in which case we don't add a duplicate)
+
+ $alias->reset_each_range;
+ while (my ($range) = $alias->each_range) {
+ next if $range->value eq "";
+ my $code_point = $range->start;
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+ my ($value, $type) = split ': ', $range->value;
+ my $replace_type;
+ if ($type eq 'correction') {
+ $replace_type = $MULTIPLE_BEFORE;
+ }
+ elsif ($type eq 'abbreviation') {
+
+ # Save for later
+ $abbreviations{$value} = $code_point;
+ next;
}
-
- # Now add the Unicode_1 names for the controls. These come after the
- # official names, as they are only recommended (by TR18; unclear as of
- # this writing if that recommendation will be withdrawn, but if it is,
- # we want to add them anyway for backwards compatibility). Only a few
- # differ from the official names.
- foreach my $range (property_ref('Unicode_1_Name')->ranges) {
- my $code_point = $range->start;
- my $unicode_1_value = $range->value;
- next if $unicode_1_value eq ""; # Skip if name doesn't exist.
-
- if ($code_point != $range->end) {
- Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
- }
-
- # To handle EBCDIC, we don't hard code in the code points of the
- # controls; instead realizing that all of them are below 256.
- last if $code_point > 255;
-
- # We only add in the controls.
- next if $gc->value_of($code_point) ne 'Cc';
-
- # This won't add an exact duplicate.
- $perl_charname->add_duplicate($code_point, $unicode_1_value,
- Replace => $MULTIPLE_AFTER);
+ else {
+ $replace_type = $MULTIPLE_AFTER;
}
- # Now that have everything added, add in abbreviations after
- # everything else.
- foreach my $value (keys %abbreviations) {
- $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
- }
- $alias_sentence = <<END;
+ # Actually add; before or after current entry(ies) as determined
+ # above.
+
+ $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+ }
+ $alias_sentence = <<END;
The Name_Alias property adds duplicate code point entries that are
alternatives to the original name. If an addition is a corrected
name, it will be physically first in the table. The original (less correct,
but still valid) name will be next; then any alternatives, in no particular
order; and finally any abbreviations, again in no particular order.
END
+
+ # Now add the Unicode_1 names for the controls. The Unicode_1 names had
+ # precedence before 6.1, so should be first in the file; the other names
+ # have precedence starting in 6.1,
+ my $before_or_after = ($v_version lt v6.1.0)
+ ? $MULTIPLE_BEFORE
+ : $MULTIPLE_AFTER;
+
+ foreach my $range (property_ref('Unicode_1_Name')->ranges) {
+ my $code_point = $range->start;
+ my $unicode_1_value = $range->value;
+ next if $unicode_1_value eq ""; # Skip if name doesn't exist.
+
+ if ($code_point != $range->end) {
+ Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
+ }
+
+ # To handle EBCDIC, we don't hard code in the code points of the
+ # controls; instead realizing that all of them are below 256.
+ last if $code_point > 255;
+
+ # We only add in the controls.
+ next if $gc->value_of($code_point) ne 'Cc';
+
+ # This won't add an exact duplicate.
+ $perl_charname->add_duplicate($code_point, $unicode_1_value,
+ Replace => $before_or_after);
+ }
+
+ # Now that have everything added, add in abbreviations after
+ # everything else.
+ foreach my $value (keys %abbreviations) {
+ $perl_charname->add_duplicate($abbreviations{$value}, $value,
+ Replace => $MULTIPLE_AFTER);
}
my $comment;
Uppercase_Mapping uc()
Also, Case_Folding is accessible through the C</i> modifier in regular
-expressions.
+expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
+operator.
And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
interpolation in double-quoted strings and regular expressions; and functions
# Make a list of all combinations of properties/values that are suppressed.
my @suppressed;
- foreach my $property_name (keys %why_suppressed) {
+ if (! $debug_skip) { # This tends to fail in this debug mode
+ foreach my $property_name (keys %why_suppressed) {
- # Just the value
- my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
+ # Just the value
+ my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
- # The hash may contain properties not in this release of Unicode
- next unless defined (my $property = property_ref($property_name));
+ # The hash may contain properties not in this release of Unicode
+ next unless defined (my $property = property_ref($property_name));
- # Find all combinations
- foreach my $prop_alias ($property->aliases) {
- my $prop_alias_name = standardize($prop_alias->name);
+ # Find all combinations
+ foreach my $prop_alias ($property->aliases) {
+ my $prop_alias_name = standardize($prop_alias->name);
- # If no =value, there's just one combination possibe for this
- if (! $value_name) {
+ # If no =value, there's just one combination possibe for this
+ if (! $value_name) {
- # The property may be suppressed, but there may be a proxy for
- # it, so it shouldn't be listed as suppressed
- next if $prop_alias->ucd;
- push @suppressed, $prop_alias_name;
- }
- else { # Otherwise
- foreach my $value_alias ($property->table($value_name)->aliases)
- {
- next if $value_alias->ucd;
+ # The property may be suppressed, but there may be a proxy
+ # for it, so it shouldn't be listed as suppressed
+ next if $prop_alias->ucd;
+ push @suppressed, $prop_alias_name;
+ }
+ else { # Otherwise
+ foreach my $value_alias
+ ($property->table($value_name)->aliases)
+ {
+ next if $value_alias->ucd;
- push @suppressed, "$prop_alias_name="
- . standardize($value_alias->name);
+ push @suppressed, "$prop_alias_name="
+ . standardize($value_alias->name);
+ }
}
}
}
}
}
}
- elsif ($count == $MAX_UNICODE_CODEPOINTS) {
- if ($table == $property || $table->leader == $table) {
+ elsif ($count == $MAX_UNICODE_CODEPOINTS
+ && ($table == $property || $table->leader == $table)
+ && $table->property->status != $PLACEHOLDER)
+ {
Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
- }
}
if ($table->fate == $SUPPRESSED) {
make_UCD;
make_property_test_script() if $make_test_script;
+ make_normalization_test_script() if $make_norm_test_script;
return;
}
return;
}
+sub make_normalization_test_script() {
+ print "Making normalization test script\n" if $verbosity >= $PROGRESS;
+
+ my $n_path = 'TestNorm.pl';
+
+ unshift @normalization_tests, <<'END';
+use utf8;
+use Test::More;
+
+sub ord_string { # Convert packed ords to printable string
+ use charnames ();
+ return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
+ unpack "U*", shift) . "'";
+ #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
+}
+
+sub Test_N {
+ my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
+ my $display_source = ord_string($source);
+ my $display_nfc = ord_string($nfc);
+ my $display_nfd = ord_string($nfd);
+ my $display_nfkc = ord_string($nfkc);
+ my $display_nfkd = ord_string($nfkd);
+
+ use Unicode::Normalize;
+ # NFC
+ # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
+ # nfkc == toNFC(nfkc) == toNFC(nfkd)
+ #
+ # NFD
+ # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
+ # nfkd == toNFD(nfkc) == toNFD(nfkd)
+ #
+ # NFKC
+ # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
+ # toNFKC(nfkc) == toNFKC(nfkd)
+ #
+ # NFKD
+ # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
+ # toNFKD(nfkc) == toNFKD(nfkd)
+
+ is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
+ is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
+ is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
+ is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
+ is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
+
+ is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
+ is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
+ is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
+ is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
+ is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
+
+ is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
+ is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
+ is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
+ is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
+ is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
+
+ is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
+ is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
+ is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
+ is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
+ is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
+}
+END
+
+ &write($n_path,
+ 1, # Is utf8;
+ [
+ @normalization_tests,
+ 'done_testing();'
+ ]);
+ return;
+}
+
# This is a list of the input files and how to handle them. The files are
# processed in their order in this list. Some reordering is possible if
# desired, but the v0 files should be first, and the extracted before the
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
),
- Input_file->new("NormalizationTest.txt", v3.0.1,
- Skip => 'Validation Tests',
+ Input_file->new("NormTest.txt", v3.0.0,
+ Handler => \&process_NormalizationsTest,
+ Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Input_file->new('NamedSequences.txt', v4.1.0,
Handler => \&process_NamedSequences
),
- Input_file->new('NameAliases.txt', v5.0.0,
+ Input_file->new('NameAliases.txt', v0,
Property => 'Name_Alias',
Pre_Handler => ($v_version le v6.0.0)
? \&setup_early_name_alias