if $v_version ge v6.0.0;
push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
if $v_version ge v6.1.0;
+push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
+ if $v_version ge v6.2.0;
# The lists below are hashes, so the key is the item in the list, and the
# value is the reason why it is in the list. This makes generation of
$why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
}
if ($v_version ge v6.0.0) {
- $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
+ $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
$why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
}
# The input files don't list every code point. Those not listed are to be
# defaulted to some value. Below are hard-coded what those values are for
# non-binary properties as of 5.1. Starting in 5.0, there are
-# machine-parsable comment lines in the files the give the defaults; so this
+# machine-parsable comment lines in the files that give the defaults; so this
# list shouldn't have to be extended. The claim is that all missing entries
# for binary properties will default to 'N'. Unicode tried to change that in
# 5.2, but the beta period produced enough protest that they backed off.
'ReadMe.txt' => 'Documentation',
'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
+ 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
+ 'USourceData.pdf' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
## This 'require' doesn't necessarily work in miniperl, and even if it does,
## the native perl version of it (which is what would operate under miniperl)
## is extremely slow, as it does a string eval every call.
-#my $has_fast_scalar_util = $\18 !~ /miniperl/
+#my $has_fast_scalar_util = $^X !~ /miniperl/
# && defined eval "require Scalar::Util";
#
#sub objaddr($) {
# basically be a while(next_line()) {...} loop.
#
# You can also set up handlers to
-# 1) call before the first line is read for pre processing
+# 1) call before the first line is read, for pre processing
# 2) call to adjust each line of the input before the main handler gets them
# 3) call upon EOF before the main handler exits its loop
-# 4) call at the end for post processing
+# 4) call at the end, for post processing
#
# $_ is used to store the input line, and is to be filtered by the
# each_line_handler()s. So, if the format of the line is not in the desired
our $addr;
+ # Max is initialized to a negative value that isn't adjacent to 0, for
+ # simpler tests
+ my $max_init = -2;
+
main::setup_package();
my %ranges;
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
- # Max is initialized to a negative value that isn't adjacent to 0,
- # for simpler tests
- $max{$addr} = -2;
+ $max{$addr} = $max_init;
$_search_ranges_cache{$addr} = 0;
$ranges{$addr} = [];
# otherwise recalculate it. This is done too rarely to worry about
# performance.
if ($operation eq '-' && @return) {
- $max{$addr} = $r->[-1]->end;
+ if (@$r) {
+ $max{$addr} = $r->[-1]->end;
+ }
+ else { # Now empty
+ $max{$addr} = $max_init;
+ }
}
return @return;
}
# These are used to indicate, for example, that the mapping for a
# range has a non-default type.
#
- # This loops through the file, calling it's next_line() method, and
+ # This loops through the file, calling its next_line() method, and
# then taking the map and adding it to the property's table.
# Complications arise because any number of properties can be in the
# file, in any order, interspersed in any way. The first time a
# Create the map for simple only if are going to output it, for otherwise
# it takes no part in anything we do.
my $to_output_simple;
- my $non_final_folds;
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
}
- $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
- );
-
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
property_ref('Case_Folding')->add_comment(join_lines( <<END
if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
$_ = "$range; Case_Folding; "
. "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
- if ($type eq 'F') {
- my @string = split " ", $map;
- for my $i (0 .. @string - 1 -1) {
- $non_final_folds->add_range(hex $string[$i], hex $string[$i]);
- }
- }
}
else {
$_ = "";
else {
$Word += ord('_'); # Make sure this is a $Word
}
+ my $JC = property_ref('Join_Control'); # Wasn't in release 1
+ if (defined $JC) {
+ $Word += $JC->table('Y');
+ }
+ else {
+ $Word += 0x200C + 0x200D;
+ }
# This is a Perl extension, so the name doesn't begin with Posix.
my $PerlWord = $perl->add_match_table('PerlWord',
# No Posix equivalent for vertical space
my $Space = $perl->add_match_table('Space',
- Description => '\s including beyond ASCII plus vertical tab',
+ Description => '\s including beyond ASCII and vertical tab',
Initialize => $Blank + $VertSpace,
);
$Space->add_alias('XPosixSpace');
- $perl->add_match_table("PosixSpace",
+ my $posix_space = $perl->add_match_table("PosixSpace",
Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
Initialize => $Space & $ASCII,
);
- # Perl's traditional space doesn't include Vertical Tab
+ # Perl's traditional space doesn't include Vertical Tab prior to v5.18
my $XPerlSpace = $perl->add_match_table('XPerlSpace',
Description => '\s, including beyond ASCII',
#Initialize => $Space - 0x000B,
$Graph->add_alias('XPosixGraph');
$perl->add_match_table("PosixGraph",
Description =>
- '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
+ '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
Initialize => $Graph & $ASCII,
);
$print->add_alias('XPosixPrint');
$perl->add_match_table("PosixPrint",
Description =>
- '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
+ '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
Initialize => $print & $ASCII,
);
Perl_Extension => 1
);
$perl->add_match_table('PosixPunct', Perl_Extension => 1,
- Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
+ Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
Initialize => $ASCII & $XPosixPunct,
);
}
$PosixXDigit->add_description('[0-9A-Fa-f]');
+ my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
+ Description => "Code points that particpate in some fold",
+ );
+ #
+ foreach my $range (property_ref('Case_Folding')->ranges) {
+ $any_folds->add_range($range->start, $range->end);
+ foreach my $hex_code_point (split " ", $range->value) {
+ my $code_point = hex $hex_code_point;
+ $any_folds->add_range($code_point, $code_point);
+ }
+ }
+
my $dt = property_ref('Decomposition_Type');
$dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
$perl_xidc &= $Word;
- # These two tables are for the 'extended' grapheme cluster, which came in
- # 5.1; create empty ones if not already present. The non-extended
- # definition differs from the extended (see
+ my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
+ );
+
+ my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => $perl_xidc
+ + 0x0020 # SPACE
+ + 0x0028 # (
+ + 0x0029 # )
+ + 0x002D # -
+ + 0x00A0 # NBSP
+ );
+
+ # These two tables are for matching \X, which is based on the 'extended'
+ # grapheme cluster, which came in 5.1; create empty ones if not already
+ # present. The straight 'grapheme cluster' (non-extended) is used prior
+ # to 5.1, and differs from the extended (see
# http://www.unicode.org/reports/tr29/) only by these two tables, so we
# get the older definition automatically when they are empty.
my $gcb = property_ref('Grapheme_Cluster_Break');
push @tables_that_may_be_empty, $perl_prepend->complete_name;
}
+ # All the tables with _X_ in their names are used in defining \X handling,
+ # and are based on the Unicode GCB property. Basically, \X matches:
+ # CR LF
+ # | Prepend* Begin Extend*
+ # | .
+ # Begin is: ( Special_Begin | ! Control )
+ # Begin is also: ( Regular_Begin | Special_Begin )
+ # where Regular_Begin is defined as ( ! Control - Special_Begin )
+ # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
+ # Extend is: ( Grapheme_Extend | Spacing_Mark )
+ # Control is: [ GCB_Control | CR | LF ]
+ # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
- # These are used in Unicode's definition of \X
- my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
-
- # In the line below, two negatives means: yes hangul
- $begin += ~ property_ref('Hangul_Syllable_Type')
- ->table('Not_Applicable')
- + ~ ($gcb->table('Control')
- + $gcb->table('CR')
- + $gcb->table('LF'));
- $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
-
- $extend += $gcb->table('Extend');
- if (defined (my $sm = $gcb->table('SpacingMark'))) {
- $extend += $sm;
- }
- $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
-
- # More GCB. Populate a combined hangul syllables table
- my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
foreach my $gcb_name (qw{ L V T LV LVT }) {
# The perl internal extension's name is the gcb table name prepended
push @tables_that_may_be_empty, $perl_table->complete_name;
}
}
- $perl->add_match_table('_X_HST_Not_Applicable',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => property_ref('HST')->table('NA'),
- );
+
+ # More GCB. Populate a combined hangul syllables table
+ my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
$lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
+ $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
+
+ my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if ($v_version ge v6.2) {
+ $ri += $gcb->table('RI');
+ }
+ else {
+ push @tables_that_may_be_empty, $ri->full_name;
+ }
+
+ my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => $lv_lvt_v
+ + $gcb->table('L')
+ + $gcb->table('T')
+ + $ri
+ );
+ $specials_begin->add_comment(join_lines( <<END
+For use in \\X; matches first (perhaps only) character of potential
+multi-character sequences that can begin an extended grapheme cluster. They
+need special handling because of their complicated nature.
+END
+ ));
+ my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => ~ $gcb->table('Control')
+ - $specials_begin
+ - $gcb->table('CR')
+ - $gcb->table('LF')
+ );
+ $regular_begin->add_comment(join_lines( <<END
+For use in \\X; matches first character of anything that can begin an extended
+grapheme cluster, except those that require special handling.
+END
+ ));
+
+ my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Initialize => $gcb->table('Extend')
+ );
+ if (defined (my $sm = $gcb->table('SpacingMark'))) {
+ $extend += $sm;
+ }
+ $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
+
+ # End of GCB \X processing
my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
}
# Now that have everything added, add in abbreviations after
- # everything else.
- foreach my $value (keys %abbreviations) {
+ # everything else. Sort so results don't change between runs of this
+ # program
+ foreach my $value (sort keys %abbreviations) {
$perl_charname->add_duplicate($abbreviations{$value}, $value,
Replace => $MULTIPLE_AFTER);
}
(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
for improved legibility.
-Also, white space, hyphens, and underscores are also normally ignored
+Also, white space, hyphens, and underscores are normally ignored
everywhere between the {braces}, and hence can be freely added or removed
even if the C</x> modifier hasn't been specified on the regular expression.
But $a_bold_stricter at the beginning of an entry in the table below
=over 4
-=item Z<>B<*> is a wild-card
+=item *
+
+B<*> is a wild-card
-=item B<(\\d+)> in the info column gives the number of code points matched by
+=item *
+
+B<(\\d+)> in the info column gives the number of code points matched by
this property.
-=item B<$DEPRECATED> means this is deprecated.
+=item *
+
+B<$DEPRECATED> means this is deprecated.
+
+=item *
+
+B<$OBSOLETE> means this is obsolete.
-=item B<$OBSOLETE> means this is obsolete.
+=item *
-=item B<$STABILIZED> means this is stabilized.
+B<$STABILIZED> means this is stabilized.
-=item B<$STRICTER> means tighter (stricter) name matching applies.
+=item *
-=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
+B<$STRICTER> means tighter (stricter) name matching applies.
+
+=item *
+
+B<$DISCOURAGED> means use of this form is discouraged, and may not be
stable.
=back
my \$run_on_code_point_re = qr/$run_on_code_point_re/;
my \$code_point_re = qr/$code_point_re/;
- # In the following hash, the keys are the bases of names which includes
- # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
+ # In the following hash, the keys are the bases of names which include
+ # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
# of each key is another hash which is used to get the low and high ends
# for each range of code points that apply to the name.
my %names_ending_in_code_point = (
}
}
}
+ @suppressed = sort @suppressed; # So doesn't change between runs of this
+ # program
# Convert the structure below (designed for Name.pm) to a form that UCD
# wants, so it doesn't have to modify it at all; i.e. so that it includes
# For each property ...
# (sort so that if there is an immutable file name, it has precedence, so
- # some other property can't come in and take over its file name. If b's
- # file name is defined, will return 1, meaning to take it first; don't
- # care if both defined, as they had better be different anyway. And the
- # property named 'Perl' needs to be first (it doesn't have any immutable
- # file name) because empty properties are defined in terms of it's table
- # named 'Any'.)
+ # some other property can't come in and take over its file name. (We
+ # don't care if both defined, as they had better be different anyway.)
+ # The property named 'Perl' needs to be first (it doesn't have any
+ # immutable file name) because empty properties are defined in terms of
+ # it's table named 'Any'.) We also sort by the property's name. This is
+ # just for repeatability of the outputs between runs of this program, but
+ # does not affect correctness.
PROPERTY:
- foreach my $property (sort { return -1 if $a == $perl;
- return 1 if $b == $perl;
- return defined $b->file
- } property_ref('*'))
+ foreach my $property ($perl,
+ sort { return -1 if defined $a->file;
+ return 1 if defined $b->file;
+ return $a->name cmp $b->name;
+ } grep { $_ != $perl } property_ref('*'))
{
my $type = $property->type;
# or multiple lines. main::write doesn't count the lines.
my @output;
- foreach my $property (property_ref('*')) {
- foreach my $table ($property->tables) {
+ # Sort these so get results in same order on different runs of this
+ # program
+ foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
+ foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
# Find code points that match, and don't match this table.
my $valid = $table->get_valid_code_point;
),
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
+ Has_Missings_Defaults => ($v_version lt v6.2.0)
+ ? $NO_DEFAULTS
+ # Is <none> which doesn't mean
+ # anything to us, we will use the
+ # null string
+ : $IGNORED,
+
),
Input_file->new("NormTest.txt", v3.0.0,
Handler => \&process_NormalizationsTest,
# Create the list of input files from the objects we have defined, plus
# version
-my @input_files = 'version';
+my @input_files = qw(version Makefile);
foreach my $object (@input_file_objects) {
my $file = $object->file;
next if ! defined $file; # Not all objects have files
}
}
+# We use 'Makefile' just to see if it has changed since the last time we
+# rebuilt. Now discard it.
+@input_files = grep { $_ ne 'Makefile' } @input_files;
+
my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
|| ! scalar @mktables_list_output_files # or if no outputs known
|| $old_start_time < $most_recent; # or out-of-date