#
# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
# from the Unicode database files (lib/unicore/.../*.txt), It also generates
-# a pod file and a .t file
+# a pod file and .t files, depending on option parameters.
#
# The structure of this file is:
# First these introductory comments; then
# the small actual loop to process the input files and finish up; then
# a __DATA__ section, for the .t tests
#
-# This program works on all releases of Unicode through at least 6.0. The
-# outputs have been scrutinized most intently for release 5.1. The others
-# have been checked for somewhat more than just sanity. It can handle all
-# existing Unicode character properties in those releases.
+# This program works on all releases of Unicode so far. The outputs have been
+# scrutinized most intently for release 5.1. The others have been checked for
+# somewhat more than just sanity. It can handle all non-provisional Unicode
+# character properties in those releases.
#
# This program is mostly about Unicode character (or code point) properties.
# A property describes some attribute or quality of a code point, like if it
# into some corresponding value. In the case of it being lowercase or not,
# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
# property maps each Unicode code point to a single value, called a "property
-# value". (Hence each Unicode property is a true mathematical function with
-# exactly one value per code point.)
+# value". (Some more recently defined properties map a code point to a set
+# of values.)
#
# When using a property in a regular expression, what is desired isn't the
# mapping of the code point to its property's value, but the reverse (or the
# are for mappings that don't fit into the normal scheme of things. Mappings
# that require a hash entry to communicate with utf8.c are one example;
# another example is mappings for charnames.pm to use which indicate a name
-# that is algorithmically determinable from its code point (and vice-versa).
+# that is algorithmically determinable from its code point (and the reverse).
# These are used to significantly compact these tables, instead of listing
# each one of the tens of thousands individually.
#
#
# Actually, there are two types of range lists, "Range_Map" is the one
# associated with map tables, and "Range_List" with match tables.
-# Again, this is so that methods can be defined on one and not the other so as
-# to prevent operating on them in incorrect ways.
+# Again, this is so that methods can be defined on one and not the others so
+# as to prevent operating on them in incorrect ways.
#
# Eventually, most tables are written out to files to be read by utf8_heavy.pl
# in the perl core. All tables could in theory be written, but some are
# takes every code point and maps it to Y or N (but having ranges cuts the
# number of entries in that table way down), and two match tables, one
# which has a list of all the code points that map to Y, and one for all the
-# code points that map to N. (For each of these, a third table is also
+# code points that map to N. (For each binary property, a third table is also
# generated for the pseudo Perl property. It contains the identical code
-# points as the Y table, but can be written, not in the compound form, but in
-# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
-# properties have several possible values, some have many, and properties like
-# Name have a different value for every named code point. Those will not,
-# unless the controlling lists are changed, have their match tables written
-# out. But all the ones which can be used in regular expression \p{} and \P{}
-# constructs will. Prior to 5.14, generally a property would have either its
-# map table or its match tables written but not both. Again, what gets
-# written is controlled by lists which can easily be changed. Starting in
-# 5.14, advantage was taken of this, and all the map tables needed to
-# reconstruct the Unicode db are now written out, while suppressing the
-# Unicode .txt files that contain the data. Our tables are much more compact
-# than the .txt files, so a significant space savings was achieved.
-
-# Properties have a 'Type', like binary, or string, or enum depending on how
-# many match tables there are and the content of the maps. This 'Type' is
+# points as the Y table, but can be written in regular expressions, not in the
+# compound form, but in a "single" form like \p{IsUppercase}.) Many
+# properties are binary, but some properties have several possible values,
+# some have many, and properties like Name have a different value for every
+# named code point. Those will not, unless the controlling lists are changed,
+# have their match tables written out. But all the ones which can be used in
+# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
+# a property would have either its map table or its match tables written but
+# not both. Again, what gets written is controlled by lists which can easily
+# be changed. Starting in 5.14, advantage was taken of this, and all the map
+# tables needed to reconstruct the Unicode db are now written out, while
+# suppressing the Unicode .txt files that contain the data. Our tables are
+# much more compact than the .txt files, so a significant space savings was
+# achieved. Also, tables are not written out that are trivially derivable
+# from tables that do get written. So, there typically is no file containing
+# the code points not matched by a binary property (the table for \P{} versus
+# lowercase \p{}), since you just need to invert the True table to get the
+# False table.
+
+# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
+# how many match tables there are and the content of the maps. This 'Type' is
# different than a range 'Type', so don't get confused by the two concepts
# having the same name.
#
# As stated earlier, this program will work on any release of Unicode so far.
# Most obvious problems in earlier data have NOT been corrected except when
# necessary to make Perl or this program work reasonably, and to keep out
-# potential security issues. For example, no
-# folding information was given in early releases, so this program substitutes
-# lower case instead, just so that a regular expression with the /i option
-# will do something that actually gives the right results in many cases.
-# There are also a couple other corrections for version 1.1.5, commented at
-# the point they are made. As an example of corrections that weren't made
-# (but could be) is this statement from DerivedAge.txt: "The supplementary
-# private use code points and the non-character code points were assigned in
-# version 2.0, but not specifically listed in the UCD until versions 3.0 and
-# 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information
-# on Unicode version glitches is further down in these introductory comments.
+# potential security issues. For example, no folding information was given in
+# early releases, so this program substitutes lower case instead, just so that
+# a regular expression with the /i option will do something that actually
+# gives the right results in many cases. There are also a couple other
+# corrections for version 1.1.5, commented at the point they are made. As an
+# example of corrections that weren't made (but could be) is this statement
+# from DerivedAge.txt: "The supplementary private use code points and the
+# non-character code points were assigned in version 2.0, but not specifically
+# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
+# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
+# further down in these introductory comments.
#
-# This program works on all non-provisional properties as of 6.0, though the
-# files for some are suppressed from apparent lack of demand for them. You
-# can change which are output by changing lists in this program.
+# This program works on all non-provisional properties as of the current
+# Unicode release, though the files for some are suppressed for various
+# reasons. You can change which are output by changing lists in this program.
#
# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
# loose matchings rules (from Unicode TR18):
# recognized, and that loose matching of property names be used,
# whereby the case distinctions, whitespace, hyphens, and underbar
# are ignored.
+#
# The program still allows Fuzzy to override its determination of if loose
# matching should be used, but it isn't currently used, as it is no longer
# needed; the calculations it makes are good enough.
# values. That is, they list code points and say what the mapping
# is under the given property. Some files give the mappings for
# just one property; and some for many. This program goes through
-# each file and populates the properties from them. Some properties
-# are listed in more than one file, and Unicode has set up a
-# precedence as to which has priority if there is a conflict. Thus
-# the order of processing matters, and this program handles the
-# conflict possibility by processing the overriding input files
-# last, so that if necessary they replace earlier values.
+# each file and populates the properties and their map tables from
+# them. Some properties are listed in more than one file, and
+# Unicode has set up a precedence as to which has priority if there
+# is a conflict. Thus the order of processing matters, and this
+# program handles the conflict possibility by processing the
+# overriding input files last, so that if necessary they replace
+# earlier values.
# After this is all done, the program creates the property mappings not
# furnished by Unicode, but derivable from what it does give.
# The tables of code points that match each property value in each
# can't just take the intersection of two map tables, for example, as that
# is nonsensical.
#
+# What about 'fate' and 'status'? The concept of a table's fate was created
+# late when it became clear that something more was needed. The difference
+# between this and 'status' is unclean, and could be improved if someone
+# wanted to spend the effort.
+#
# DEBUGGING
#
# This program is written so it will run under miniperl. Occasionally changes
#
# local $to_trace = 1 if main::DEBUG;
#
-# can be added to enable tracing in its lexical scope or until you insert
-# another line:
+# can be added to enable tracing in its lexical scope (plus dynamic) or until
+# you insert another line:
#
# local $to_trace = 0 if main::DEBUG;
#
-# then use a line like "trace $a, @b, %c, ...;
+# To actually trace, use a line like "trace $a, @b, %c, ...;
#
# Some of the more complex subroutines already have trace statements in them.
# Permanent trace statements should be like:
# my $debug_skip = 0;
#
# to 1, and every file whose object is in @input_file_objects and doesn't have
-# a, 'non_skip => 1,' in its constructor will be skipped.
+# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping
+# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
#
# To compare the output tables, it may be useful to specify the -annotate
# flag. This causes the tables to expand so there is one entry for each
# ones. The program should warn you if its name will clash with others on
# restrictive file systems, like DOS. If so, figure out a better name, and
# add lines to the README.perl file giving that. If the file is a character
-# property, it should be in the format that Unicode has by default
+# property, it should be in the format that Unicode has implicitly
# standardized for such files for the more recently introduced ones.
# If so, the Input_file constructor for @input_file_objects can just be the
# file name and release it first appeared in. If not, then it should be
#
# Here are some observations about some of the issues in early versions:
#
-# The number of code points in \p{alpha} halved in 2.1.9. It turns out that
-# the reason is that the CJK block starting at 4E00 was removed from PropList,
-# and was not put back in until 3.1.0
+# Prior to version 3.0, there were 3-character decompositions. These are not
+# handled by Unicode::Normalize, nor will it compile when presented a version
+# that has them. However, you can trivially get it to compile by simply
+# ignoring those decompositions, by changing the croak to a carp. At the time
+# of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
+#
+# croak("Weird Canonical Decomposition of U+$h");
+#
+# Simply change to a carp. It will compile, but will not know about any
+# three-character decompositions.
+
+# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
+# that the reason is that the CJK block starting at 4E00 was removed from
+# PropList, and was not put back in until 3.1.0. The Perl extension (the
+# single property name \p{alpha}) has the correct values. But the compound
+# form is simply not generated until 3.1, as it can be argued that prior to
+# this release, this was not an official property. The comments for
+# filter_old_style_proplist() give more details.
#
# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
# reclassified it correctly.
#
# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
-# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
-# was left with no code points, as all the ones that mapped to 202 stayed
-# mapped to 202. Thus if your program used the numeric name for the class,
-# it would not have been affected, but if it used the mnemonic, it would have
-# been.
+# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
+# became 202, and ATBL was left with no code points, as all the ones that
+# mapped to 202 stayed mapped to 202. Thus if your program used the numeric
+# name for the class, it would not have been affected, but if it used the
+# mnemonic, it would have been.
#
# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
# points which eventually came to have this script property value, instead
# tries to do the best it can for earlier releases. It is done in
# process_PropertyAliases()
#
+# In version 2.1.2, the entry in UnicodeData.txt:
+# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
+# should instead be
+# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
+# Without this change, there are casing problems for this character.
+#
##############################################################################
my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
# before normal completion.
my $debug_skip = 0;
+
+# Normally these are suppressed.
+my $write_Unicode_deprecated_tables = 0;
+
# Set to 1 to enable tracing.
our $to_trace = 0;
}
}
+if ($write_Unicode_deprecated_tables) {
+ foreach my $property (keys %why_suppressed) {
+ delete $why_suppressed{$property} if $property =~
+ / ^ Other | Grapheme /x;
+ }
+}
+
if ($v_version ge 4.0.0) {
$why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
if ($v_version ge 6.0.0) {
Word_Break => 'Other',
);
-# Below are files that Unicode furnishes, but this program ignores, and why
+# Below are files that Unicode furnishes, but this program ignores, and why.
+# NormalizationCorrections.txt requires some more explanation. It documents
+# the cumulative fixes to erroneous normalizations in earlier Unicode
+# versions. Its main purpose is so that someone running on an earlier version
+# can use this file to override what got published in that earlier release.
+# It would be easy for mktables to read and handle this file. But all the
+# corrections in it should already be in the other files for the release it
+# is. To get it to actually mean something useful, someone would have to be
+# using an earlier Unicode release, and copy it to the files for that release
+# and recompile. So far there has been no demand to do that, so this hasn't
+# been implemented.
my %ignored_files = (
'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
'Index.txt' => 'Alphabetical index of Unicode characters',
my $MAP_PROXIED = 1; # The map table for the property isn't written out,
# but there is a file written that can be used to
# reconstruct this table
-my $SUPPRESSED = 3; # The file for this table is not written out.
-my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is
+my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
# for Perl's internal use only
-my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a
- # Unicode version that doesn't have it, but we need it
- # to be defined, if empty, to have things work.
- # Implies no pod entry generated
+my $SUPPRESSED = 3; # The file for this table is not written out, and as a
+ # result, we don't bother to do many computations on
+ # it.
+my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the
+ # computations anyway, as the values are needed for
+ # things to work. This happens when we have Perl
+ # extensions that depend on Unicode tables that
+ # wouldn't normally be in a given Unicode version.
# The format of the values of the tables:
my $EMPTY_FORMAT = "";
containing_range($i)->end;
}
elsif ($gc-> table('Control')->contains($i)) {
- $viacode[$i] = 'Control';
+ $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
$annotate_char_type[$i] = $CONTROL_TYPE;
$printable[$i] = 0;
- $end = 0x81 if $i == 0x80; # Hard-code this one known case
}
elsif ($gc-> table('Unassigned')->contains($i)) {
$annotate_char_type[$i] = $UNASSIGNED_TYPE;
my %property;
# name of property this file is for. defaults to none, meaning not
# applicable, or is otherwise determinable, for example, from each line.
- main::set_access('property', \%property, qw{ c });
+ main::set_access('property', \%property, qw{ c r });
my %optional;
# If this is true, the file is optional. If not present, no warning is
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
- if (! $type{$addr}) {
- $standard_form{$addr} = main::standardize($value);
- }
-
return $self;
}
}
sub standard_form {
- # The standard form is the value itself if the standard form is
- # undefined (that is if the value is special)
+ # Calculate the standard form only if needed, and cache the result.
+ # The standard form is the value itself if the type is special.
+ # This represents a considerable CPU and memory saving - at the time
+ # of writing there are 368676 non-special objects, but the standard
+ # form is only requested for 22047 of them - ie about 6%.
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $self; };
return $standard_form{$addr} if defined $standard_form{$addr};
- return $value{$addr};
+
+ my $value = $value{$addr};
+ return $value if $type{$addr};
+ return $standard_form{$addr} = main::standardize($value);
}
sub dump {
return $self->_intersect($other, 0);
},
+ '&=' => sub { my $self = shift;
+ my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' &= '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
+
+ return $self->_intersect($other, 0);
+ },
'~' => "_invert",
'-' => "_subtract",
;
'&=' => sub {
my $self = shift;
my $other = shift;
+ my $reversed = shift;
+
+ if ($reversed) {
+ Carp::my_carp_bug("Bad news. Can't cope with '"
+ . ref($other)
+ . ' &= '
+ . ref($self)
+ . "'. undef returned.");
+ return;
+ }
return if $self->carp_if_locked;
$self->_set_range_list($self->_range_list & $other);
Property->new('JSN', Full_Name => 'Jamo_Short_Name');
}
+ # These two properties must be defined in all releases so we can generate
+ # the tables from them to make regex \X work, but suppress their output so
+    # they aren't application-visible prior to releases where they should be
+ if (! defined property_ref('GCB')) {
+ Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
+ Fate => $PLACEHOLDER);
+ }
+ if (! defined property_ref('hst')) {
+ Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
+ Fate => $PLACEHOLDER);
+ }
+
# These are used so much, that we set globals for them.
$gc = property_ref('General_Category');
$block = property_ref('Block');
# This first set is in the original old-style proplist.
push @return, split /\n/, <<'END';
-Alpha ; Alphabetic
Bidi_C ; Bidi_Control
Dash ; Dash
Dia ; Diacritic
}
if (-e 'DCoreProperties.txt') {
push @return, split /\n/, <<'END';
+Alpha ; Alphabetic
IDS ; ID_Start
XIDC ; XID_Continue
XIDS ; XID_Start
$file->insert_lines(get_old_property_value_aliases());
}
+ if ($v_version lt 4.0.0) {
+ $file->insert_lines(split /\n/, <<'END'
+hst; L ; Leading_Jamo
+hst; LV ; LV_Syllable
+hst; LVT ; LVT_Syllable
+hst; NA ; Not_Applicable
+hst; T ; Trailing_Jamo
+hst; V ; Vowel_Jamo
+END
+ );
+ }
+ if ($v_version lt 4.1.0) {
+ $file->insert_lines(split /\n/, <<'END'
+GCB; CN ; Control
+GCB; CR ; CR
+GCB; EX ; Extend
+GCB; L ; L
+GCB; LF ; LF
+GCB; LV ; LV
+GCB; LVT ; LVT
+GCB; T ; T
+GCB; V ; V
+GCB; XX ; Other
+END
+ );
+ }
+
+
# Add any explicit cjk values
$file->insert_lines(@cjk_property_values);
sub filter_v6_ucd {
- # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
- # accepted that yet to allow for some deprecation cycles.
+ # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
+ # it wasn't accepted, to allow for some deprecation cycles. This
+ # function is not called after 5.16
return if $_ !~ /^(?:0007|1F514|070F);/;
# PropList.txt has been in Unicode since version 2.0. Until 3.1, it
# was in a completely different syntax. Ken Whistler of Unicode says
# that it was something he used as an aid for his own purposes, but
- # was never an official part of the standard. However, comments in
- # DAge.txt indicate that non-character code points were available in
- # the UCD as of 3.1. It is unclear to me (khw) how they could be
- # there except through this file (but on the other hand, they first
- # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
- # not. But the claim is that it was published as an aid to others who
- # might want some more information than was given in the official UCD
- # of the time. Many of the properties in it were incorporated into
- # the later PropList.txt, but some were not. This program uses this
- # early file to generate property tables that are otherwise not
- # accessible in the early UCD's, and most were probably not really
- # official at that time, so one could argue that it should be ignored,
- # and you can easily modify things to skip this. And there are bugs
- # in this file in various versions. (For example, the 2.1.9 version
- # removes from Alphabetic the CJK range starting at 4E00, and they
- # weren't added back in until 3.1.0.) Many of this file's properties
- # were later sanctioned, so this code generates tables for those
- # properties that aren't otherwise in the UCD of the time but
- # eventually did become official, and throws away the rest. Here is a
- # list of all the ones that are thrown away:
+ # was never an official part of the standard. Many of the properties
+ # in it were incorporated into the later PropList.txt, but some were
+ # not. This program uses this early file to generate property tables
+ # that are otherwise not accessible in the early UCD's. It does this
+ # for the ones that eventually became official, and don't appear to be
+ # too different in their contents from the later official version, and
+ # throws away the rest. It could be argued that the ones it generates
+ # were probably not really official at that time, so should be
+ # ignored. You can easily modify things to skip all of them by
+ # changing this function to just set $_ to "", and return; and to skip
+    # certain of them by simply removing their declarations from
+ # get_old_property_aliases().
+ #
+ # Here is a list of all the ones that are thrown away:
+ # Alphabetic The definitions for this are very
+ # defective, so better to not mislead
+ # people into thinking it works.
+ # Instead the Perl extension of the
+ # same name is constructed from first
+ # principles.
# Bidi=* duplicates UnicodeData.txt
# Combining never made into official property;
# is \P{ccc=0}
return;
}
+sub generate_hst {
+
+    # Populates the Hangul Syllable Type property from first principles,
+    # for Unicode releases that don't furnish HangulSyllableType.txt.
+    # Inserts "range ; value" lines into the given Input_file object.
+
+    my $file= shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    # These few ranges are hard-coded in.
+    $file->insert_lines(split /\n/, <<'END'
+1100..1159 ; L
+115F ; L
+1160..11A2 ; V
+11A8..11F9 ; T
+END
+);
+
+    # The Hangul syllables in version 1 are completely different from what
+    # came after, so just ignore them there; but note that the LV and LVT
+    # tables will then legitimately be empty.
+    if ($v_version lt v2.0.0) {
+        my $property = property_ref($file->property);
+        push @tables_that_may_be_empty, $property->table('LV')->complete_name;
+        push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
+        return;
+    }
+
+    # The algorithmically derived syllables are almost all LVT ones, so
+    # initialize the whole range with that.  ($SBase, $SCount, and $TCount
+    # are presumably the standard Hangul syllable-composition constants
+    # defined elsewhere in this file -- TODO confirm.)
+    $file->insert_lines(sprintf "%04X..%04X; LVT\n",
+                                $SBase, $SBase + $SCount -1);
+
+    # Those ones that aren't LVT are LV, and they occur at intervals of
+    # $TCount code points, starting with the first code point, at $SBase.
+    for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
+        $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
+    }
+
+    return;
+}
+
+sub generate_GCB {
+
+    # Populates the Grapheme Cluster Break property from first principles,
+    # for Unicode releases that don't furnish GraphemeBreakProperty.txt.
+    # Inserts "range; value" lines into the given Input_file object.
+
+    my $file= shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    # All these definitions are from
+    # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
+    # from http://www.unicode.org/reports/tr29/tr29-4.html
+
+    foreach my $range ($gc->ranges) {
+
+        # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
+        # and gc=Cf
+        if ($range->value =~ / ^ M [en] $ /x) {
+            $file->insert_lines(sprintf "%04X..%04X; Extend",
+                                $range->start, $range->end);
+        }
+        elsif ($range->value =~ / ^ C [cf] $ /x) {
+            $file->insert_lines(sprintf "%04X..%04X; Control",
+                                $range->start, $range->end);
+        }
+    }
+    $file->insert_lines("2028; Control"); # Line Separator
+    $file->insert_lines("2029; Control"); # Paragraph Separator
+
+    $file->insert_lines("000D; CR");
+    $file->insert_lines("000A; LF");
+
+    # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
+    # NOTE(review): '40000' looks out of place in this list of combining
+    # marks; being unassigned, it is skipped by the 'Cn' check below in
+    # every release -- verify it against the tr29-3 source list.
+    foreach my $code_point ( qw{
+                               40000
+                               09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
+                               0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
+                               }
+    ) {
+        my $category = $gc->value_of(hex $code_point);
+        next if ! defined $category || $category eq 'Cn'; # But not if
+                                                          # unassigned in this
+                                                          # release
+        $file->insert_lines("$code_point; Extend");
+    }
+
+    # Hangul syllables: use the Hangul_Syllable_Type ranges if this release
+    # has them; otherwise generate_hst() inserts equivalent ranges directly
+    # into this file (its L/V/T/LV/LVT values double as GCB values).
+    my $hst = property_ref('Hangul_Syllable_Type');
+    if ($hst->count > 0) {
+        foreach my $range ($hst->ranges) {
+            $file->insert_lines(sprintf "%04X..%04X; %s",
+                                $range->start, $range->end, $range->value);
+        }
+    }
+    else {
+        generate_hst($file);
+    }
+
+    return;
+}
+
sub setup_early_name_alias {
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
}
}
- # GCB and hst are not in early Unicode releases; create dummy ones if
- # they don't exist, as the core needs tables generated from them.
- my $gcb = property_ref('Grapheme_Cluster_Break');
- if (! defined $gcb) {
- $gcb = Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
- Status => $PLACEHOLDER,
- Type => $ENUM,
- Default_Map => 'Other');
- }
- my $hst = property_ref('Hangul_Syllable_Type');
- if (!defined $hst) {
- $hst = Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
- Status => $PLACEHOLDER,
- Type => $ENUM,
- Default_Map => 'Not_Applicable');
- }
-
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
# default is a reference to a data structure, and requires data from other
return;
}
+sub pre_3_dot_1_Nl () {
+
+    # Return a range list for gc=nl for Unicode versions prior to 3.1, which
+    # is when Unicode's version of it became fully usable. These code points
+    # were determined by inspection and experimentation. gc=nl is important
+    # for certain Perl-extension properties that should be available in all
+    # releases.
+
+    my $Nl = Range_List->new();
+    # Start from the official table when this release has one; otherwise
+    # hard-code the ranges known to belong in these early releases.
+    if (defined (my $official = $gc->table('Nl'))) {
+        $Nl += $official;
+    }
+    else {
+        $Nl->add_range(0x2160, 0x2182);     # Roman numerals
+        $Nl->add_range(0x3007, 0x3007);
+        $Nl->add_range(0x3021, 0x3029);
+    }
+    # NOTE(review): U+FE20..FE23 are the combining half marks, which look
+    # out of place in a letter-number list -- confirm this range is intended.
+    $Nl->add_range(0xFE20, 0xFE23);
+    $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
+                                                            # these were added
+    return $Nl;
+}
+
sub compile_perl() {
# Create perl-defined tables. Almost all are part of the pseudo-property
# named 'perl' internally to this program. Many of these are recommended
# Get the best available case definitions. Early Unicode versions didn't
# have Uppercase and Lowercase defined, so use the general category
- # instead for them.
+ # instead for them, modified by hard-coding in the code points each is
+ # missing.
my $Lower = $perl->add_match_table('Lower');
my $Unicode_Lower = property_ref('Lowercase');
if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
}
else {
- $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
- Related => 1);
+ $Lower += $gc->table('Lowercase_Letter');
+
+ # There are quite a few code points in Lower, that aren't in gc=lc,
+ # and not all are in all releases.
+ foreach my $code_point ( 0x00AA,
+ 0x00BA,
+ 0x02B0 .. 0x02B8,
+ 0x02C0 .. 0x02C1,
+ 0x02E0 .. 0x02E4,
+ 0x0345,
+ 0x037A,
+ 0x1D2C .. 0x1D6A,
+ 0x1D78,
+ 0x1D9B .. 0x1DBF,
+ 0x2071,
+ 0x207F,
+ 0x2090 .. 0x209C,
+ 0x2170 .. 0x217F,
+ 0x24D0 .. 0x24E9,
+ 0x2C7C .. 0x2C7D,
+ 0xA770,
+ 0xA7F8 .. 0xA7F9,
+ ) {
+ # Don't include the code point unless it is assigned in this
+ # release
+ my $category = $gc->value_of(hex $code_point);
+ next if ! defined $category || $category eq 'Cn';
+
+ $Lower += $code_point;
+ }
}
$Lower->add_alias('XPosixLower');
my $Posix_Lower = $perl->add_match_table("PosixLower",
$Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
}
else {
- $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
- Related => 1);
+
+ # Unlike Lower, there are only two ranges in Upper that aren't in
+ # gc=Lu, and all code points were assigned in all releases.
+ $Upper += $gc->table('Uppercase_Letter');
+ $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
+ $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
}
$Upper->add_alias('XPosixUpper');
my $Posix_Upper = $perl->add_match_table("PosixUpper",
}
else {
- # For early releases, we don't get it exactly right. The below
- # includes more than it should, which in 5.2 terms is: L + Nl +
- # Other_Alphabetic. Other_Alphabetic contains many characters from
- # Mn and Mc. It's better to match more than we should, than less than
- # we should.
+ # The Alphabetic property doesn't exist for early releases, so
+ # generate it. The actual definition, in 5.2 terms is:
+ #
+ # gc=L + gc=Nl + Other_Alphabetic
+ #
+ # Other_Alphabetic is also not defined in these early releases, but it
+ # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
+ # those last two as well, then subtract the relatively few of them that
+ # shouldn't have been added. (The gc=So range is the circled capital
+ # Latin characters. Early releases mistakenly didn't also include the
+ # lower-case versions of these characters, and so we don't either, to
+ # maintain consistency with those releases that first had this
+    # property.)
$Alpha->initialize($gc->table('Letter')
- + $gc->table('Mn')
- + $gc->table('Mc'));
- $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
+ + pre_3_dot_1_Nl()
+ + $gc->table('Mn')
+ + $gc->table('Mc')
+ );
+ $Alpha->add_range(0x24D0, 0x24E9); # gc=So
+ foreach my $range ( [ 0x0300, 0x0344 ],
+ [ 0x0346, 0x034E ],
+ [ 0x0360, 0x0362 ],
+ [ 0x0483, 0x0486 ],
+ [ 0x0591, 0x05AF ],
+ [ 0x06DF, 0x06E0 ],
+ [ 0x06EA, 0x06EC ],
+ [ 0x0740, 0x074A ],
+ 0x093C,
+ 0x094D,
+ [ 0x0951, 0x0954 ],
+ 0x09BC,
+ 0x09CD,
+ 0x0A3C,
+ 0x0A4D,
+ 0x0ABC,
+ 0x0ACD,
+ 0x0B3C,
+ 0x0B4D,
+ 0x0BCD,
+ 0x0C4D,
+ 0x0CCD,
+ 0x0D4D,
+ 0x0DCA,
+ [ 0x0E47, 0x0E4C ],
+ 0x0E4E,
+ [ 0x0EC8, 0x0ECC ],
+ [ 0x0F18, 0x0F19 ],
+ 0x0F35,
+ 0x0F37,
+ 0x0F39,
+ [ 0x0F3E, 0x0F3F ],
+ [ 0x0F82, 0x0F84 ],
+ [ 0x0F86, 0x0F87 ],
+ 0x0FC6,
+ 0x1037,
+ 0x1039,
+ [ 0x17C9, 0x17D3 ],
+ [ 0x20D0, 0x20DC ],
+ 0x20E1,
+ [ 0x302A, 0x302F ],
+ [ 0x3099, 0x309A ],
+ [ 0xFE20, 0xFE23 ],
+ [ 0x1D165, 0x1D169 ],
+ [ 0x1D16D, 0x1D172 ],
+ [ 0x1D17B, 0x1D182 ],
+ [ 0x1D185, 0x1D18B ],
+ [ 0x1D1AA, 0x1D1AD ],
+ ) {
+ if (ref $range) {
+ $Alpha->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $Alpha->delete_range($range, $range);
+ }
+ }
$Alpha->add_description('Alphabetic');
+ $Alpha->add_alias('Alphabetic');
}
$Alpha->add_alias('XPosixAlpha');
my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
}
else {
$PosixXDigit->initialize($Xdigit & $ASCII);
+ $PosixXDigit->add_alias('AHex');
+ $PosixXDigit->add_alias('Ascii_Hex_Digit');
}
$PosixXDigit->add_description('[0-9A-Fa-f]');
$CanonDCIJ = $CanonDCIJ & $Assigned;
}
- # For backward compatibility, Perl has its own definition for IDStart
+ # For backward compatibility, Perl has its own definition for IDStart.
# It is regular XID_Start plus the underscore, but all characters must be
# Word characters as well
my $XID_Start = property_ref('XID_Start');
- my $perl_xid_start = $perl->add_match_table('_Perl_IDStart',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => ord('_')
- );
+ my $perl_xids = $perl->add_match_table('_Perl_IDStart',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => ord('_')
+ );
if (defined $XID_Start
|| defined ($XID_Start = property_ref('ID_Start')))
{
- $perl_xid_start += $XID_Start->table('Y');
+ $perl_xids += $XID_Start->table('Y');
}
else {
# For Unicode versions that don't have the property, construct our own
- # from first principles. The actual definition is: Letters + letter
- # numbers (Nl), minus Pattern_Syntax and Pattern_White_Space code
- # points, plus stability extensions. PatSyn and PatWS are not defined
- # in releases that don't have XIDS defined, so are irrelevant.
- $perl_xid_start += $gc->table('Letter');
- my $nl = $gc->table('Letter_Number');
- $perl_xid_start += $nl if defined $nl;
+ # from first principles. The actual definition is:
+ # Letters
+ # + letter numbers (Nl)
+ # - Pattern_Syntax
+ # - Pattern_White_Space
+ # + stability extensions
+ # - NFKC modifications
+ #
+ # What we do in the code below is to include the identical code points
+ # that are in the first release that had Unicode's version of this
+ # property, essentially extrapolating backwards. There were no
+ # stability extensions until v4.1, so none are included; likewise in
+ # no Unicode version so far does subtracting PatSyn and PatWS make any
+ # difference, so those also are ignored.
+ $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
+
+ # We do subtract the NFKC modifications that are in the first version
+ # that had this property. We don't bother to test if they are in the
+ # version in question, because if they aren't, the operation is a
+ # no-op. The NFKC modifications are discussed in
+ # http://www.unicode.org/reports/tr31/#NFKC_Modifications
+ foreach my $range ( 0x037A,
+ 0x0E33,
+ 0x0EB3,
+ [ 0xFC5E, 0xFC63 ],
+ [ 0xFDFA, 0xFE70 ],
+ [ 0xFE72, 0xFE76 ],
+ 0xFE78,
+ 0xFE7A,
+ 0xFE7C,
+ 0xFE7E,
+ [ 0xFF9E, 0xFF9F ],
+ ) {
+ if (ref $range) {
+ $perl_xids->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $perl_xids->delete_range($range, $range);
+ }
+ }
}
- $perl_xid_start &= $Word;
+ $perl_xids &= $Word;
+
+ my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ my $XIDC = property_ref('XID_Continue');
+ if (defined $XIDC
+ || defined ($XIDC = property_ref('ID_Continue')))
+ {
+ $perl_xidc += $XIDC->table('Y');
+ }
+ else {
+ # Similarly, we construct our own XIDC if necessary for early Unicode
+ # versions. The definition is:
+ # everything in XIDS
+ # + Gc=Mn
+ # + Gc=Mc
+ # + Gc=Nd
+ # + Gc=Pc
+ # - Pattern_Syntax
+ # - Pattern_White_Space
+ # + stability extensions
+ # - NFKC modifications
+ #
+ # The same thing applies to this as with XIDS for the PatSyn, PatWS,
+ # and stability extensions. There is a somewhat different set of NFKC
+ # mods to remove (and add in this case). The ones below make this
+ # have identical code points as in the first release that defined it.
+ $perl_xidc += $perl_xids
+ + $gc->table('L')
+ + $gc->table('Mn')
+ + $gc->table('Mc')
+ + $gc->table('Nd')
+ + 0x00B7
+ ;
+ if (defined (my $pc = $gc->table('Pc'))) {
+ $perl_xidc += $pc;
+ }
+ else { # 1.1.5 didn't have Pc, but these should have been in it
+ $perl_xidc += 0xFF3F;
+ $perl_xidc->add_range(0x203F, 0x2040);
+ $perl_xidc->add_range(0xFE33, 0xFE34);
+ $perl_xidc->add_range(0xFE4D, 0xFE4F);
+ }
+
+ # Subtract the NFKC mods
+ foreach my $range ( 0x037A,
+ [ 0xFC5E, 0xFC63 ],
+ [ 0xFDFA, 0xFE1F ],
+ 0xFE70,
+ [ 0xFE72, 0xFE76 ],
+ 0xFE78,
+ 0xFE7A,
+ 0xFE7C,
+ 0xFE7E,
+ ) {
+ if (ref $range) {
+ $perl_xidc->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $perl_xidc->delete_range($range, $range);
+ }
+ }
+ }
+
+ $perl_xidc &= $Word;
+
+ # These two tables are for the 'extended' grapheme cluster, which came in
+ # 5.1; create empty ones if not already present. The non-extended
+ # definition differs from the extended (see
+ # http://www.unicode.org/reports/tr29/) only by these two tables, so we
+ # get the older definition automatically when they are empty.
my $gcb = property_ref('Grapheme_Cluster_Break');
+ my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
+ $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
+ }
+ else {
+ push @tables_that_may_be_empty, $perl_prepend->complete_name;
+ }
+
+
# These are used in Unicode's definition of \X
my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
Fate => $INTERNAL_ONLY);
my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
Fate => $INTERNAL_ONLY);
- # The 'extended' grapheme cluster came in 5.1. The non-extended
- # definition differs too much from the traditional Perl one to use.
- if (defined $gcb->table('Control')) {
-
- # Note that assumes hst is defined; it came in an earlier release than
- # GCB. In the line below, two negatives means: yes hangul
- $begin += ~ property_ref('Hangul_Syllable_Type')
- ->table('Not_Applicable')
- + ~ ($gcb->table('Control')
- + $gcb->table('CR')
- + $gcb->table('LF'));
- $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
-
- $extend += $gcb->table('Extend');
- my $comment = 'For use in \X; matches: Extend';
- if (defined $gcb->table('SpacingMark')) {
- $extend += $gcb->table('SpacingMark');
- $comment .= ' | SpacingMark';
- }
- $extend->add_comment($comment);
-
- if (!defined $gcb->table('Prepend')) {
- my $table = $gcb->add_match_table('Prepend');
- push @tables_that_may_be_empty, $table->complete_name;
- }
- }
- else { # Old definition, used on early releases.
- $extend += $gc->table('Mark')
- + 0x200C # ZWNJ
- + 0x200D; # ZWJ
- $begin += ~ $extend;
-
- # Here we may have a release that has the regular grapheme cluster
- # defined, or a release that doesn't have anything defined.
- # We set things up so the Perl core degrades gracefully, possibly with
- # placeholders that match nothing.
-
- my $hst = property_ref('Hangul_Syllable_Type');
-
- # On some releases, here we may not have the needed tables for the
- # perl core, in some releases we may.
- foreach my $name (qw{ L LV LVT T V prepend }) {
- my $table = $gcb->table($name);
- if (! defined $table) {
- $table = $gcb->add_match_table($name);
- push @tables_that_may_be_empty, $table->complete_name;
- }
+ # In the line below, two negatives means: yes hangul
+ $begin += ~ property_ref('Hangul_Syllable_Type')
+ ->table('Not_Applicable')
+ + ~ ($gcb->table('Control')
+ + $gcb->table('CR')
+ + $gcb->table('LF'));
+ $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
- # The hst property predates the GCB one, and has identical tables
- # for some of them, so use it if we can.
- if ($table->is_empty && defined $hst->table($name))
- {
- $table += $hst->table($name);
- }
- }
+ $extend += $gcb->table('Extend');
+ if (defined (my $sm = $gcb->table('SpacingMark'))) {
+ $extend += $sm;
}
+ $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
- # More GCB. If we found some hangul syllables, populate a combined
- # table.
+ # More GCB. Populate a combined hangul syllables table
my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
Perl_Extension => 1,
Fate => $INTERNAL_ONLY);
- my $LV = $gcb->table('LV');
- if ($LV->is_empty) {
- push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
- } else {
- $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
+ foreach my $gcb_name (qw{ L V T LV LVT }) {
+
+ # The perl internal extension's name is the gcb table name prepended
+ # with an '_X_'
+ my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => $gcb->table($gcb_name),
+ );
+ # Version 1 had mostly different Hangul syllables that were removed
+ # from later versions, so some of the tables may not apply.
+ if ($v_version lt v2.0) {
+ push @tables_that_may_be_empty, $perl_table->complete_name;
+ }
}
+ my $perl_na = $perl->add_match_table('_X_HST_Not_Applicable',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => property_ref('HST')->table('NA'),
+ );
+ $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
+ $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
- # Was earlier constructed to contain both Name and Unicode_1_Name
- my @composition = ('Name', 'Unicode_1_Name');
+ my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
if (@named_sequences) {
push @composition, 'Named_Sequence';
my $alias_sentence = "";
my %abbreviations;
my $alias = property_ref('Name_Alias');
- push @composition, 'Name_Alias';
$perl_charname->set_proxy_for('Name_Alias');
# Add each entry in Name_Alias to Perl_Charnames. Where these go with
# Abbreviations go after everything else, so they are saved temporarily in
# a hash for later.
#
- # Controls are currently added afterwards. This is because Perl has
- # previously used the Unicode1 name, and so should still use that. (Most
- # of them will be the same anyway, in which case we don't add a duplicate)
+ # Everything else is added afterwards, which preserves the input
+ # ordering.
- $alias->reset_each_range;
- while (my ($range) = $alias->each_range) {
+ foreach my $range ($alias->ranges) {
next if $range->value eq "";
my $code_point = $range->start;
if ($code_point != $range->end) {
# We only add in the controls.
next if $gc->value_of($code_point) ne 'Cc';
+ # We reject this Unicode1 name for later Perls, as it is used for
+ # another code point
+ next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
+
# This won't add an exact duplicate.
$perl_charname->add_duplicate($code_point, $unicode_1_value,
Replace => $before_or_after);
# First, gather all the info that applies to this table as a whole.
- push @zero_match_tables, $table if $count == 0;
+ push @zero_match_tables, $table if $count == 0
+ # Don't mention special tables
+ # as being zero length
+ && $table->fate == $ORDINARY;
my $table_property = $table->property;
}
elsif ($count == $MAX_UNICODE_CODEPOINTS
&& ($table == $property || $table->leader == $table)
- && $table->property->status != $PLACEHOLDER)
+ && $table->property->status ne $PLACEHOLDER)
{
Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
}
- if ($table->fate == $SUPPRESSED) {
+ if ($table->fate >= $SUPPRESSED) {
if (! $is_property) {
my @children = $table->children;
foreach my $child (@children) {
- if ($child->fate != $SUPPRESSED) {
+ if ($child->fate < $SUPPRESSED) {
Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
}
}
# And for 5.14 Perls with 6.0,
# have to also make changes
- : ($v_version ge v6.0.0)
+ : ($v_version ge v6.0.0
+ && $^V lt v5.17.0)
? \&filter_v6_ucd
: undef),
? \&filter_old_style_normalization_lines
: undef),
),
- Input_file->new('HangulSyllableType.txt', v4.0.0,
+ Input_file->new('HangulSyllableType.txt', v0,
Has_Missings_Defaults => $NOT_IGNORED,
- Property => 'Hangul_Syllable_Type'),
+ Property => 'Hangul_Syllable_Type',
+ Pre_Handler => ($v_version lt v4.0.0)
+ ? \&generate_hst
+ : undef,
+ ),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
- Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
+ Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
Property => 'Grapheme_Cluster_Break',
Has_Missings_Defaults => $NOT_IGNORED,
+ Pre_Handler => ($v_version lt v4.1.0)
+ ? \&generate_GCB
+ : undef,
),
Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
Handler => \&process_GCB_test,