sub NON_ASCII_PLATFORM { ord("A") != 65 }
+# When a new version of Unicode is published, unfortunately the algorithms for
+# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
+# manually. The changes may or may not be backward compatible with older
+# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the
+# changes, then come back here and set the variable below to what version the
+# code is expecting. If a newer version of Unicode is being compiled than
+# expected, a warning will be generated. If an older version is being
+# compiled, any bounds tests that fail in the generated test file (-maketest
+# option) will be marked as TODO.
+my $version_of_mk_invlist_bounds = v10.0.0;
+
##########################################################################
#
# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
# each one of the tens of thousands individually.
#
# In a match table, the value of a range is irrelevant (and hence the type as
-# well, which will always be 0), and arbitrarily set to the null string.
+# well, which will always be 0), and arbitrarily set to the empty string.
# Using the example above, there would be two match tables for those two
# entries, one named Upper would contain the 0x41..0x5A range, and the other
# named Lower would contain 0x61..0x7A.
#
# trace ... if main::DEBUG && $to_trace;
#
+# main::stack_trace() will display what its name implies
+#
# If there is just one or a few files that you're debugging, you can easily
# cause most everything else to be skipped. Change the line
#
#
# A NOTE ON UNIHAN
#
-# This program can generate tables from the Unihan database. But that db
+# This program can generate tables from the Unihan database. But that DB
# isn't normally available, so it is marked as optional. Prior to version
# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
# was split into 8 different files, all beginning with the letters 'Unihan'.
# handled by Unicode::Normalize, nor will it compile when presented a version
# that has them. However, you can trivially get it to compile by simply
# ignoring those decompositions, by changing the croak to a carp. At the time
-# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
-# cpan/Unicode-Normalize/mkheader) reads
+# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
+# dist/Unicode-Normalize/mkheader) reads
#
# croak("Weird Canonical Decomposition of U+$h");
#
}
}
+sub stack_trace() {
+ local $to_trace = 1 if main::DEBUG;
+ my $line = (caller(0))[2];
+ my $i = 1;
+
+ # Accumulate the stack trace
+ while (1) {
+ my ($pkg, $file, $caller_line, $caller) = caller $i++;
+
+ last unless defined $caller;
+
+ trace "called from $caller() at line $line";
+ $line = $caller_line;
+ }
+}
+
# This is for a rarely used development feature that allows you to compare two
# versions of the Unicode standard without having to deal with changes caused
# by the code points introduced in the later version. You probably also want
# to use the -annotate option when using this. Run this program on a unicore
# containing the starting release you want to compare. Save that output
-# structrue. Then, switching to a unicore with the ending release, change the
+# structure. Then, switching to a unicore with the ending release, change the
# 0 in the $string_compare_versions definition just below to a string
# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
# to the starting release. This program will then compile, but throw away all
# common to both releases, and you can see the changes caused just by the
# underlying release semantic changes. For versions earlier than 3.2, you
# must copy a version of DAge.txt into the directory.
-my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
+my $string_compare_versions = DEBUG && ""; # e.g., "2.1";
my $compare_versions = DEBUG
&& $string_compare_versions
&& pack "C*", split /\./, $string_compare_versions;
$verbosity = 0;
}
elsif ($arg eq '-w') {
- $write_unchanged_files = 1; # update the files even if havent changed
+ # update the files even if they haven't changed
+ $write_unchanged_files = 1;
}
elsif ($arg eq '-check') {
my $this = shift @ARGV;
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
-# We work with above-Unicode code points, up to UV_MAX. But when you get
-# that high, above IV_MAX, some operations don't work, and you can easily get
-# overflow. Therefore for internal use, we use a much smaller number,
-# translating it to UV_MAX only for output. The exact number is immaterial
-# (all Unicode code points are treated exactly the same), but the algorithm
-# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
+# We work with above-Unicode code points, up to IV_MAX, but we may want to use
+# sentinels above that number. Therefore for internal use, we use a much
+# smaller number, translating it to IV_MAX only for output. The exact number
+# is immaterial (all above-Unicode code points are treated exactly the same),
+# but the algorithm requires it to be at least
+# 2 * $MAX_UNICODE_CODEPOINTS + 1
my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
-my $MAX_PLATFORM_CODEPOINT = ~0;
+my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
# Matches legal code point. 4-6 hex numbers, If there are 6, the first
# two must be 10; if there are 5, the first must not be a 0. Written this way
qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
my $code_point_re = qr/\b$run_on_code_point_re/;
-# This matches the beginning of the line in the Unicode db files that give the
+# This matches the beginning of the line in the Unicode DB files that give the
# defaults for code points not listed (i.e., missing) in the file. The code
# depends on this ending with a semi-colon, so it can assume it is a valid
# field when the line is split() by semi-colons
my $needing_code_points_ending_in_code_point = 0;
my @backslash_X_tests; # List of tests read in for testing \X
+my @LB_tests; # List of tests read in for testing \b{lb}
my @SB_tests; # List of tests read in for testing \b{sb}
my @WB_tests; # List of tests read in for testing \b{wb}
my @unhandled_properties; # Will contain a list of properties found in
my $DI; # Default_Ignorable_Code_Point property
my $NChar; # Noncharacter_Code_Point property
my $script;
+my $scx; # Script_Extensions property
# Are there conflicting names because of beginning with 'In_', or 'Is_'
my $has_In_conflicts = 0;
$viacode[$i] = $perl_charname->value_of($i) || "";
$age[$i] = (defined $age)
- ? (($age->value_of($i) =~ / ^ \d \. \d $ /x)
+ ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
? $age->value_of($i)
: "")
: "";
# 1) call before the first line is read, for pre processing
# 2) call to adjust each line of the input before the main handler gets
# them. This can be automatically generated, if appropriately simple
-# enough, by specifiying a Properties parameter in the constructor.
+# enough, by specifying a Properties parameter in the constructor.
# 3) call upon EOF before the main handler exits its loop
# 4) call at the end, for post processing
#
# each_line_handler()s. So, if the format of the line is not in the desired
# format for the main handler, these are used to do that adjusting. They can
# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
-# so the $_ output of one is used as the input to the next. The eof handler
+# so the $_ output of one is used as the input to the next. The EOF handler
# is also stackable, but none of the others are, but could easily be changed
# to be so.
#
# not otherwise be processed, and to not raise a warning about not being
# handled. In the constructor call, any value that evaluates to a numeric
# 0 or undef means don't skip. Any other value is a string giving the
- # reason it is being skippped, and this will appear in generated pod.
+ # reason it is being skipped, and this will appear in generated pod.
# However, an empty string reason will suppress the pod entry.
# Internally, calls that evaluate to numeric 0 are changed into undef to
# distinguish them from an empty string call.
# 'handler'
main::set_access('each_line_handler', \%each_line_handler, 'c');
+ my %retain_trailing_comments;
+ # This is used to not discard the comments that end data lines. This
+ # would be used only for files with non-typical syntax, and most code here
+ # assumes that comments have been stripped, so special handlers would have
+ # to be written. It is assumed that the code will use these in
+ # single-quoted contexts, and so any "'" marks in the comment will be
+ # prefixed by a backslash.
+ main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
+
my %properties; # Optional ordered list of the properties that occur in each
# meaningful line of the input file. If present, an appropriate
# each_line_handler() is automatically generated and pushed onto the stack
# of such handlers. This is useful when a file contains multiple
- # proerties per line, but no other special considerations are necessary.
+ # properties per line, but no other special considerations are necessary.
# The special value "<ignored>" means to discard the corresponding input
# field.
# Any @missing lines in the file should also match this syntax; no such
# may not be the actual reality, but probably nobody cares anyway for
# these obsolete characters.)
#
+ # [3] if present is the default value for the property to assign for code
+ # points not given in the input. If not present, the default from the
+ # normal property is used
+ #
+ # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
+ # it means to not add the name in [1] as an alias to the property name
+ # used for these. Normally, when compiling Unicode versions that don't
+ # invoke the early handling, the name is added as a synonym.
+ #
# Not all files can be handled in the above way, and so the code ref
# alternative is available. It can do whatever it needs to. The other
# array elements are optional in this case, and the code is free to use or
# makes for easier testing later on.
main::set_access('early', \%early, 'c');
+ my %only_early;
+ main::set_access('only_early', \%only_early, 'c');
+
my %required_even_in_debug_skip;
# debug_skip is used to speed up compilation during debugging by skipping
# processing files that are not needed for the task at hand. However,
# Set defaults
$handler{$addr} = \&main::process_generic_property_file;
+ $retain_trailing_comments{$addr} = 0;
$non_skip{$addr} = 0;
$skip{$addr} = undef;
$has_missings_defaults{$addr} = $NO_DEFAULTS;
my $progress;
my $function_instead_of_file = 0;
+ if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
+ $only_early{$addr} = 1;
+ pop $early{$addr}->@*;
+ }
+
# If we are compiling a Unicode release earlier than the file became
# available, the constructor may have supplied a substitute
if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
unshift $early{$addr}->@*, 1;
# See the definition of %early for what the array elements mean.
+ # Note that we have just unshifted onto the array, so the numbers
+ # below are +1 of those in the %early description.
# If we have a property this defines, create a table and default
# map for it now (at essentially compile time), so that it will be
# available for the whole of run time. (We will want to add this
Perl_Extension => 1,
);
- # Use the default mapping for the regular property for this
- # substitute one.
- if ( defined $property{$addr}
- && defined $default_mapping{$property{$addr}})
+ # If not specified by the constructor, use the default mapping
+ # for the regular property for this substitute one.
+ if ($early{$addr}[4]) {
+ $prop_object->set_default_map($early{$addr}[4]);
+ }
+ elsif ( defined $property{$addr}
+ && defined $default_mapping{$property{$addr}})
{
$prop_object
->set_default_map($default_mapping{$property{$addr}});
# official property, we still have to allow the publicly
# inaccessible early name so that the core code which uses it
# will work regardless.
- if (! $early{$addr}[0] && $early{$addr}->@* > 2) {
+ if ( ! $only_early{$addr}
+ && ! $early{$addr}[0]
+ && $early{$addr}->@* > 2)
+ {
my $early_property_name = $early{$addr}[2];
if ($property{$addr} ne $early_property_name) {
main::property_ref($property{$addr})
next;
}
- # Remove comments and trailing space, and skip this line if the
- # result is empty
- s/#.*//;
+ # Unless to keep, remove comments. If to keep, ignore
+ # comment-only lines
+ if ($retain_trailing_comments{$addr}) {
+ next if / ^ \s* \# /x;
+
+ # But escape any single quotes (done in both the comment and
+ # non-comment portion; this could be a bug someday, but not
+ # likely)
+ s/'/\\'/g;
+ }
+ else {
+ s/#.*//;
+ }
+
+ # Remove trailing space, and skip this line if the result is empty
s/\s+$//;
next if /^$/;
# Not currently used, not fully tested.
# sub peek {
-# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
+# # Non-destructive lookahead one non-adjusted, non-comment, non-blank
# # record. Not callable from an each_line_handler(), nor does it call
# # an each_line_handler() on the line.
#
return $self->_add_delete('+', @_);
}
+ sub replace_map {
+ # Replace a range
+
+ my $self = shift;
+
+ return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
+ }
+
sub add_duplicate {
# Adds entry to a range list which can duplicate an existing entry
# used to override calculations.
main::set_access('format', \%format, 'r', 'p_s');
+ my %has_dependency;
+ # A boolean that gives whether some other table in this property is
+ # defined as the complement of this table. This is a crude, but currently
+ # sufficient, mechanism to make this table not get destroyed before what
+ # is dependent on it is. Other dependencies could be added, so the name
+ # was chosen to reflect a more general situation than actually is
+ # currently the case.
+ main::set_access('has_dependency', \%has_dependency, 'r', 's');
+
sub new {
# All arguments are key => value pairs, which you can see below, most
# of which match fields documented above. Otherwise: Re_Pod_Entry,
$note{$addr} = [ ];
$file_path{$addr} = [ ];
$locked{$addr} = "";
+ $has_dependency{$addr} = 0;
push @{$description{$addr}}, $description if $description;
push @{$note{$addr}}, $note if $note;
}
if ($write_as_invlist) {
+ if ( $previous_end > 0
+ && $output_range_counts{$addr})
+ {
+ my $complement_count = $start - $previous_end - 1;
+ if ($complement_count > 1) {
+ $OUT[-1] = merge_single_annotation_line(
+ $OUT[-1],
+ "#"
+ . (" " x 17)
+ . "["
+ . main::clarify_code_point_count(
+ $complement_count)
+ . "] in complement\n",
+ $comment_indent);
+ }
+ }
# Inversion list format has a single number per line,
# the starting code point of a range that matches the
sub set_default_map {
# Define what code points that are missing from the input files should
- # map to
+ # map to. The optional second parameter 'full_name' indicates to
+ # force using the full name of the map instead of its standard name.
my $self = shift;
my $map = shift;
+ my $use_full_name = shift // 0;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ if ($use_full_name && $use_full_name ne 'full_name') {
+ Carp::my_carp_bug("Second parameter to set_default_map() if"
+ . " present, must be 'full_name'");
+ }
+
my $addr = do { no overloading; pack 'J', $self; };
# Convert the input to the standard equivalent, if any (won't have any
# for $STRING properties)
- my $standard = $self->_find_table_from_alias->{$map};
- $map = $standard->name if defined $standard;
+ my $standard = $self->property->table($map);
+ if (defined $standard) {
+ $map = ($use_full_name)
+ ? $standard->full_name
+ : $standard->name;
+ }
# Warn if there already is a non-equivalent default map for this
# property. Note that a default map can be a ref, which means that
# Accessors for the underlying list that should fail if locked.
for my $sub (qw(
add_duplicate
+ replace_map
))
{
no strict "refs";
# version. But manual intervention to decide what the actual behavior
# should be may be required should this happen. The introductory comments
# have more to say about this.
+#
+# 4) Definition. This is a string for human consumption that specifies the
+# code points that this table matches. This is used only for the generated
+# pod file.
sub standardize { return main::standardize($_[0]); }
sub trace { return main::trace(@_); }
# none.
main::set_access('complement', \%complement, 'r');
+ my %definition;
+ # Human readable string of the code points matched by this table
+ main::set_access('definition', \%definition, 'r', 's');
+
sub new {
my $class = shift;
my $initialize = delete $args{'Initialize'};
my $matches_all = delete $args{'Matches_All'} || 0;
my $format = delete $args{'Format'};
+ my $definition = delete $args{'Definition'} // "";
# Rest of parameters passed on.
my $range_list = Range_List->new(Initialize => $initialize,
$leader{$addr} = $self;
$parent{$addr} = $self;
$complement{$addr} = 0;
+ $definition{$addr} = $definition;
if (defined $format && $format ne $EMPTY_FORMAT) {
Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
# complement's if it has one.
my $self = shift;
- my $complement;
- if (($complement = $self->complement) != 0) {
- return ~ $complement->_range_list;
- }
- else {
- return $self->SUPER::_range_list;
+ my $complement = $self->complement;
+
+ # In order to avoid re-complementing on each access, only do the
+ # complement the first time, and store the result in this table's
+ # range list to use henceforth. However, this wouldn't work if the
+ # controlling (complement) table changed after we do this, so lock it.
+ # Currently, the value of the complement isn't needed until after it
+ # is fully constructed, so this works. If this were to change, the
+ # each_range iteration functionality would no longer work on this
+ # complement.
+ if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
+ $self->_set_range_list($self->SUPER::_range_list
+ + ~ $complement->_range_list);
+ $complement->lock;
}
+
+ return $self->SUPER::_range_list;
}
sub add_alias {
# disambiguate with).
if (defined $conflicting_object) {
foreach my $alias ($self->aliases) {
- if ($alias->name eq $conflicting_name) {
+ if (standardize($alias->name) eq standardize($conflicting_name)) {
# Here, there is an exact match. This results in
# ambiguous comments, so disambiguate by changing the
# add_alias()
# instead for same
# property
- && ! $other->perl_extension)
+ && ! $other->perl_extension
+
+ # We allow the sc and scx properties to be marked as
+ # related. They are in fact related, and this allows
+ # the pod to show that better. This test isn't valid
+ # if this is an early Unicode release without the scx
+ # property (having that also implies the sc property
+ # exists, so don't have to test for no 'sc')
+ && ( ! defined $scx
+ && ! ( ( $self->property == $script
+ || $self->property == $scx)
+ && ( $self->property == $script
+ || $self->property == $scx))))
{
Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
$related = 0;
}
my $addr = do { no overloading; pack 'J', $self; };
$complement{$addr} = $other;
+
+ # Be sure the other property knows we are depending on them; or the
+ # other table if it is one in the current property.
+ if ($self->property != $other->property) {
+ $other->property->set_has_dependency(1);
+ }
+ else {
+ $other->set_has_dependency(1);
+ }
$self->lock;
return;
}
main::set_access('pre_declared_maps',
\%pre_declared_maps, 'r', 's');
+ my %has_dependency;
+ # A boolean that gives whether some table somewhere is defined as the
+ # complement of a table in this property. This is a crude, but currently
+ # sufficient, mechanism to make this property not get destroyed before
+ # what is dependent on it is. Other dependencies could be added, so the
+ # name was chosen to reflect a more general situation than actually is
+ # currently the case.
+ main::set_access('has_dependency', \%has_dependency, 'r', 's');
+
sub new {
# The only required parameter is the positionally first, name. All
# other parameters are key => value pairs. See the documentation just
$has_only_code_point_maps{$addr} = 1;
$table_ref{$addr} = { };
$unique_maps{$addr} = { };
+ $has_dependency{$addr} = 0;
$map{$addr} = Map_Table->new($name,
Full_Name => $full_name{$addr},
containing_range
count
default_map
+ definition
delete_range
description
each_range
range_count
ranges
range_size_1
+ replace_map
reset_each_range
set_comment
set_default_map
END
}
- if (-e 'LineBreak.txt') {
- push @return, split /\n/, <<'END';
+ if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
+ my @lb = split /\n/, <<'END';
lb ; AI ; Ambiguous
lb ; AL ; Alphabetic
lb ; B2 ; Break_Both
lb ; XX ; Unknown
lb ; ZW ; ZWSpace
END
+ # If this Unicode version predates the lb property, we use our
+ # substitute one
+ if (-e 'LBsubst.txt') {
+ $_ = s/^lb/_Perl_LB/r for @lb;
+ }
+ push @return, @lb;
}
if (-e 'DNormalizationProps.txt') {
. $CMD_DELIM
. $fields[$CHARNAME];
}
- elsif ($fields[$CHARNAME] =~ /^CJK/) {
+ elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter
+
+ # All the CJK ranges like this have the name given as a
+ # special case in the next code line. And for the others, we
+ # hope that Unicode continues to use the correct name in
+ # future releases, so we don't have to make further special
+ # cases.
+ my $name = ($fields[$CHARNAME] =~ /^CJK/)
+ ? 'CJK UNIFIED IDEOGRAPH'
+ : uc $fields[$CHARNAME];
# The name for these contains the code point itself, and all
# are defined to have the same base name, regardless of what
. '='
. $CP_IN_NAME
. $CMD_DELIM
- . 'CJK UNIFIED IDEOGRAPH';
+ . $name;
}
elsif ($fields[$CATEGORY] eq 'Co'
return;
}
+sub process_LB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @LB_tests, $_;
+ }
+
+ return;
+}
+
sub process_SB_test {
my $file = shift;
}
}
+sub filter_substitute_lb {
+ # Used on Unicodes that predate the LB property, where there is a
+ # substitute file. This just does the regular ea_lb handling for such
+ # files, and then substitutes the long property value name for the short
+ # one that comes with the file. (The other break files have the long
+ # names in them, so this is the odd one out.) The reason for doing this
+ # kludge is that regen/mk_invlists.pl is expecting the long name. This
+ # also fixes the typo 'Inseperable' that leads to problems.
+
+ filter_early_ea_lb;
+ return unless $_;
+
+ my @fields = split /\s*;\s*/;
+ $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
+ $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
+ $_ = join '; ', @fields;
+}
+
sub filter_old_style_arabic_shaping {
# Early versions used a different term for the later one.
# The Script_Extensions property starts out with a clone of the Script
# property.
- my $scx = property_ref("Script_Extensions");
+ $scx = property_ref("Script_Extensions");
$scx = Property->new("scx", Full_Name => "Script_Extensions")
if ! defined $scx;
$scx->_set_format($STRING_WHITE_SPACE_LIST);
END
));
+ # Make sure this assumption in perl core code is valid in this Unicode
+ # release, with known exceptions
+ foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
+ next if $range->end - $range->start == 9;
+ next if $range->start == 0x1D7CE; # This whole range was added in 3.1
+ next if $range->end == 0x19DA && $v_version eq v5.2.0;
+ next if $range->end - $range->start < 9 && $v_version le 4.0.0;
+ Carp::my_carp("Range $range unexpectedly doesn't contain 10"
+ . " decimal digits. Code in regcomp.c assumes it does,"
+ . " and will have to be fixed. Proceeding anyway.");
+ }
+
Property->new('Legacy_Case_Folding',
File => "Fold",
Directory => $map_directory,
# data is retained in the map table for reference, but the spurious match
# tables are deleted.
- my $scx = property_ref("Script_Extensions");
if (defined $scx) {
foreach my $table ($scx->tables) {
next unless $table->name =~ /\s/; # All the new and only the new
}
$scx->delete_match_table($table);
}
+
+ # Mark the scx table as the parent of the corresponding sc table for
+ # those which are identical. This causes the pod for the script table
+ # to refer to the corresponding scx one.
+ #
+ # This has to be in a separate loop from above, so as to wait until
+ # the tables are stabilized before checking for equivalency.
+ if (defined $pod_directory) {
+ my $sc = property_ref("Script");
+ foreach my $table ($scx->tables) {
+ my $plain_sc_equiv = $sc->table($table->name);
+ if ($table->matches_identically_to($plain_sc_equiv)) {
+ $plain_sc_equiv->set_equivalent_to($table, Related => 1);
+ }
+ }
+ }
}
return;
# since the first compare version.
my $delta = Range_List->new();
foreach my $table ($age->tables) {
+ use version;
next if $table == $age->table('Unassigned');
- next if $table->name le $string_compare_versions;
+ next if version->parse($table->name)
+ le version->parse($string_compare_versions);
$delta += $table;
}
if ($delta->is_empty) {
next if $this_block == $no_block
|| ! ($this_block & $Assigned)->is_empty;
$this_block->set_fate($SUPPRESSED, $after_first_version);
+ foreach my $range ($this_block->ranges) {
+ $block->replace_map($range->start, $range->end, 'No_Block')
+ }
$no_block += $this_block;
}
}
my $Any = $perl->add_match_table('Any',
- Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
- );
+ Description => "All Unicode code points",
+ Definition => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]");
$Any->add_range(0, $MAX_UNICODE_CODEPOINT);
$Any->add_alias('Unicode');
->set_equivalent_to(property_ref('ccc')->table('Above'),
Related => 1);
- my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
+ my $ASCII = $perl->add_match_table('ASCII');
if (defined $block) { # This is equivalent to the block if have it.
my $Unicode_ASCII = $block->table('Basic_Latin');
if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
$Lower += $temp & $Assigned;
}
my $Posix_Lower = $perl->add_match_table("PosixLower",
- Description => "[a-z]",
+ Definition => "[a-z]",
Initialize => $Lower & $ASCII,
);
$Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
}
my $Posix_Upper = $perl->add_match_table("PosixUpper",
- Description => "[A-Z]",
+ Definition => "[A-Z]",
Initialize => $Upper & $ASCII,
);
$Alpha->add_alias('Alphabetic');
}
my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
- Description => "[A-Za-z]",
+ Definition => "[A-Za-z]",
Initialize => $Alpha & $ASCII,
);
$Posix_Upper->set_caseless_equivalent($Posix_Alpha);
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
$perl->add_match_table("PosixAlnum",
- Description => "[A-Za-z0-9]",
+ Definition => "[A-Za-z0-9]",
Initialize => $Alnum & $ASCII,
);
my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
Description => '\w, including beyond ASCII;'
- . ' = \p{Alnum} + \pM + \p{Pc}',
+ . ' = \p{Alnum} + \pM + \p{Pc}'
+ . ' + \p{Join_Control}',
Initialize => $Alnum + $gc->table('Mark'),
);
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
# This is a Perl extension, so the name doesn't begin with Posix.
my $PerlWord = $perl->add_match_table('PosixWord',
- Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
+ Description => '\w, restricted to ASCII',
+ Definition => '[A-Za-z0-9_]',
Initialize => $Word & $ASCII,
);
$PerlWord->add_alias('PerlWord');
);
$Blank->add_alias('HorizSpace'); # Another name for it.
$perl->add_match_table("PosixBlank",
- Description => "\\t and ' '",
+ Definition => "\\t and ' '",
Initialize => $Blank & $ASCII,
);
$Space->add_alias('Space') if $v_version lt v4.1.0;
my $Posix_space = $perl->add_match_table("PosixSpace",
- Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
+ Definition => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
Initialize => $Space & $ASCII,
);
$Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
Description => 'Control characters');
$Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
$perl->add_match_table("PosixCntrl",
- Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
+ Description => "ASCII control characters",
+ Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2,"
+ . " DC3, DC4, DEL, DLE, ENQ, EOM,"
+ . " EOT, ESC, ETB, ETX, FF, FS, GS,"
+ . " HT, LF, NAK, NUL, RS, SI, SO,"
+ . " SOH, STX, SUB, SYN, US, VT",
Initialize => $Cntrl & $ASCII,
);
Initialize => ~ ($Space + $controls),
);
$perl->add_match_table("PosixGraph",
- Description =>
+ Definition =>
'[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
Initialize => $Graph & $ASCII,
);
Initialize => $Blank + $Graph - $gc->table('Control'),
);
$perl->add_match_table("PosixPrint",
- Description =>
+ Definition =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
Initialize => $print & $ASCII,
);
Perl_Extension => 1
);
$perl->add_match_table('PosixPunct', Perl_Extension => 1,
- Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
+ Definition => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
Initialize => $ASCII & $XPosixPunct,
);
Description => '[0-9] + all other decimal digits');
$Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
my $PosixDigit = $perl->add_match_table("PosixDigit",
- Description => '[0-9]',
+ Definition => '[0-9]',
Initialize => $Digit & $ASCII,
);
ord('A') .. ord('F'),
ord('a') .. ord('f'),
0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
- $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
+ $Xdigit->set_definition('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
}
# AHex was not present in early releases
$PosixXDigit->add_alias('AHex');
$PosixXDigit->add_alias('Ascii_Hex_Digit');
}
- $PosixXDigit->add_description('[0-9A-Fa-f]');
+ $PosixXDigit->set_definition('[0-9A-Fa-f]');
my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
Description => "Code points that particpate in some fold",
+ ord("(")
+ ord(")")
+ ord("-")
- + utf8::unicode_to_native(0xA0) # NBSP
);
my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
0x2060 .. 0x206F,
0xFE00 .. 0xFE0F,
0xFFF0 .. 0xFFFB,
- 0xE0000 .. 0xE0FFF,
]);
+ $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
$quotemeta += $temp;
}
calculate_DI();
}
}
+ # Perl tailors the WordBreak property so that \b{wb} doesn't split
+ # adjacent spaces into separate words. First create a copy of the regular
+ # WB property as '_Perl_WB'. (On Unicode releases earlier than when WB
+ # was defined for, this will already have been done by the substitute file
+ # portion for 'Input_file' code for WB.)
+ my $perl_wb = property_ref('_Perl_WB');
+ if (! defined $perl_wb) {
+ $perl_wb = Property->new('_Perl_WB',
+ Fate => $INTERNAL_ONLY,
+ Perl_Extension => 1,
+ Directory => $map_directory,
+ Type => $STRING);
+ my $wb = property_ref('Word_Break');
+ $perl_wb->initialize($wb);
+ $perl_wb->set_default_map($wb->default_map);
+ }
+
+ # And simply replace the mappings of horizontal space characters that
+ # otherwise would map to the default to instead map to our tailoring.
+ my $default = $perl_wb->default_map;
+ for my $range ($Blank->ranges) {
+ for my $i ($range->start .. $range->end) {
+ next unless $perl_wb->value_of($i) eq $default;
+ $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
+ Replace => $UNCONDITIONALLY);
+ }
+ }
+
+ # Create a version of the LineBreak property with the mappings that are
+ # omitted in the default algorithm remapped to what
+ # http://www.unicode.org/reports/tr14 says they should be.
+ #
+ # Original Resolved General_Category
+ # AI, SG, XX AL Any
+ # SA CM Only Mn or Mc
+ # SA AL Any except Mn and Mc
+ # CJ NS Any
+ #
+ # All property values are also written out in their long form, as
+ # regen/mk_invlist.pl expects that. This also fixes occurrences of the
+ # typo in early Unicode versions: 'inseperable'.
+ my $perl_lb = property_ref('_Perl_LB');
+ if (! defined $perl_lb) {
+ $perl_lb = Property->new('_Perl_LB',
+ Fate => $INTERNAL_ONLY,
+ Perl_Extension => 1,
+ Directory => $map_directory,
+ Type => $STRING);
+ my $lb = property_ref('Line_Break');
+
+ # Populate from $lb, but use full name and fix typo.
+ foreach my $range ($lb->ranges) {
+ my $full_name = $lb->table($range->value)->full_name;
+ $full_name = 'Inseparable'
+ if standardize($full_name) eq 'inseperable';
+ $perl_lb->add_map($range->start, $range->end, $full_name);
+ }
+ }
+
+ $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL
+
+ for my $range ($perl_lb->ranges) {
+ my $value = standardize($range->value);
+ if ( $value eq standardize('Unknown')
+ || $value eq standardize('Ambiguous')
+ || $value eq standardize('Surrogate'))
+ {
+ $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
+ Replace => $UNCONDITIONALLY);
+ }
+ elsif ($value eq standardize('Conditional_Japanese_Starter')) {
+ $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
+ Replace => $UNCONDITIONALLY);
+ }
+ elsif ($value eq standardize('Complex_Context')) {
+ for my $i ($range->start .. $range->end) {
+ my $gc_val = $gc->value_of($i);
+ if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
+ $perl_lb->add_map($i, $i, 'Combining_Mark',
+ Replace => $UNCONDITIONALLY);
+ }
+ else {
+ $perl_lb->add_map($i, $i, 'Alphabetic',
+ Replace => $UNCONDITIONALLY);
+ }
+ }
+ }
+ }
+
# Here done with all the basic stuff. Ready to populate the information
# about each character if annotating them.
if ($annotate) {
# the single-form, \p{name}. These are:
# All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
# \p{Is_Name} as synonyms
- # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
+ # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
# \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
# \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
# conflict, \p{Value} and \p{Is_Value} as well
property_ref('*');
push @tables, $gc->tables;
- # If the version of Unicode includes the Script property, add its tables
- push @tables, $script->tables if defined $script;
+ # If the version of Unicode includes the Script Extensions (preferably),
+ # or Script property, add its tables
+ if (defined $scx) {
+ push @tables, $scx->tables;
+ }
+ else {
+ push @tables, $script->tables if defined $script;
+ }
# The Block tables are kept separate because they are treated differently.
# And the earliest versions of Unicode didn't include them, so add only if
if (! defined $pre_existing) {
- # No name collision, so ok to add the perl synonym.
+ # No name collision, so OK to add the perl synonym.
my $make_re_pod_entry;
my $ok_as_filename;
next;
}
- # Here, there is a name collision, but it still could be ok if
+ # Here, there is a name collision, but it still could be OK if
# the tables match the identical set of code points, in which
# case, we can combine the names. Compare each table's code
# point list to see if they are identical.
if ($table_property != $perl && $table->perl_extension) {
push @info, '(Perl extension)';
}
- push @info, "($string_count)";
+ my $definition = $table->definition // "";
+ $definition = "" if $entry_for_first_alias;
+ $definition = ": $definition" if $definition;
+ push @info, "($string_count$definition)";
# Now, we have both the entry and info so add them to the
# list of all the properties.
: $table->parent->property;
my $perl_extension = $table->perl_extension;
+ my $is_perl_extension_match_table_but_not_dollar_perl
+ = $property != $perl
+ && $perl_extension
+ && $property != $table;
# Get the more official name for for perl extensions that aren't
# stand-alone properties
- if ($perl_extension && $property != $table) {
- if ($property == $perl ||$property->type == $BINARY) {
- $meaning = $table->complete_name;
+ if ($is_perl_extension_match_table_but_not_dollar_perl) {
+ if ($property->type == $BINARY) {
+ $meaning = $property->full_name;
}
else {
- $meaning = $property->full_name . "=$full_name";
+ $meaning = $table->parent->complete_name;
}
}
# There are three types of info column. One for the short name, one for
# the full name, and one for everything else. They mostly are the same,
# so initialize in the same loop.
+
foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
- if ($perl_extension && $property != $table) {
+ if ($info_ref != \$full_info) {
+
+ # The non-full name columns include the full name
+ $$info_ref .= $full_name;
+ }
+
+
+ if ($is_perl_extension_match_table_but_not_dollar_perl) {
# Add the synonymous name for the non-full name entries; and to
# the full-name entry if it adds extra information
- if ($info_ref == \$other_info
- || ($info_ref == \$short_info
- && $standard_short_name ne $standard_full_name)
- || standardize($meaning) ne $standard_full_name
- ) {
- $$info_ref .= "$meaning.";
+ if ( standardize($meaning) ne $standard_full_name
+ || $info_ref == \$other_info
+ || $info_ref == \$short_info)
+ {
+ my $parenthesized = $info_ref != \$full_info;
+ $$info_ref .= " " if $$info_ref && $parenthesized;
+ $$info_ref .= "(=" if $parenthesized;
+ $$info_ref .= "$meaning";
+ $$info_ref .= ")" if $parenthesized;
+ $$info_ref .= ".";
}
}
- elsif ($info_ref != \$full_info) {
-
- # Otherwise, the non-full name columns include the full name
- $$info_ref .= $full_name;
- }
# And the full-name entry includes the short name, if shorter
if ($info_ref == \$full_info
Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
}
- # We modifiy the info column of the one being output to
+ # We modify the info column of the one being output to
# indicate the ambiguity. Set $which to point to that one's
# info.
my $which;
return -1
}
- # Determine if the two operands are numeric property values or not.
- # A numeric property will look like xyz: 3. But the number
- # can begin with an optional minus sign, and may have a
- # fraction or rational component, like xyz: 3/2. If either
- # isn't numeric, use alphabetic sort.
- my ($a_initial, $a_number) =
- ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
- return $a cmp $b unless defined $a_number;
- my ($b_initial, $b_number) =
- ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
- return $a cmp $b unless defined $b_number;
-
- # Here they are both numeric, but use alphabetic sort if the
- # initial parts don't match
- return $a cmp $b if $a_initial ne $b_initial;
+ # Determine if the two operands are compound or not, and if so if are
+ # "numeric" property values or not, like \p{Age: 3.0}. But there are also
+ # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
+ # all of which this considers numeric, and for sorting, looks just at the
+ # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}.
+ my $split_re = qr/
+ ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
+ # property name
+ [:=] \s* # The syntax for the compound form
+ (?: # followed by ...
+ ( # $2 gets defined if what follows is a "numeric"
+ # expression, which is ...
+ ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational
+ # number, optionally signed
+ | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either
+ # of these go into $3
+ | ( V \d+ _ \d+ ) # or a Unicode's Age property version
+ # number, into $4
+ )
+ | .* $ # If not "numeric", accept anything so that $1 gets
+ # defined if it is any compound form
+ ) /ix;
+ my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
+ my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
+
+ # Sort alphabeticlly on the whole property name if either operand isn't
+ # compound, or they differ.
+ return $a cmp $b if ! defined $a_initial
+ || ! defined $b_initial
+ || $a_initial ne $b_initial;
+
+ if (! defined $a_numeric) {
+
+ # If neither is numeric, use alpha sort
+ return $a cmp $b if ! defined $b_numeric;
+ return 1; # Sort numeric ahead of alpha
+ }
+
+ # Here $a is numeric
+ return -1 if ! defined $b_numeric; # Numeric sorts before alpha
+
+ # Here they are both numeric in the same property.
+ # Convert version numbers into regular numbers
+ if (defined $a_version) {
+ ($a_number = $a_version) =~ s/^V//i;
+ $a_number =~ s/_/./;
+ }
+ else { # Otherwise get rid of the, e.g., CCC in CCC9 */
+ $a_number =~ s/ ^ [[:alpha:]]+ //x;
+ }
+ if (defined $b_version) {
+ ($b_number = $b_version) =~ s/^V//i;
+ $b_number =~ s/_/./;
+ }
+ else {
+ $b_number =~ s/ ^ [[:alpha:]]+ //x;
+ }
# Convert rationals to floating for the comparison.
$a_number = eval $a_number if $a_number =~ qr{/};
$b_number = eval $b_number if $b_number =~ qr{/};
- return $a_number <=> $b_number;
+ return $a_number <=> $b_number || $a cmp $b;
}
sub make_pod () {
push @bad_re_properties, "\n=back\n";
}
- # Similiarly, generate a list of files that we don't use, grouped by the
+ # Similarly, generate a list of files that we don't use, grouped by the
# reasons why (Don't output if the reason is empty). First, create a hash
# whose keys are the reasons, and whose values are anonymous arrays of all
# the files that share that reason.
B<Compound forms> consist of two components, separated by an equals sign or a
colon. The first component is the property name, and the second component is
the particular value of the property to match against, for example,
-C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
-whose Script property value is Greek.
+C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
+to match characters whose Script_Extensions property value is Greek.
+(C<Script_Extensions> is an improved version of the C<Script> property.)
B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
their equivalent compound forms. The table shows these equivalences. (In our
-example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
-There are also a few Perl-defined single forms that are not shortcuts for a
-compound form. One such is C<\\p{Word}>. These are also listed in the table.
+example, C<\\p{Greek}> is a just a shortcut for
+C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single
+forms that are not shortcuts for a compound form. One such is C<\\p{Word}>.
+These are also listed in the table.
In parsing these constructs, Perl always ignores Upper/lower case differences
everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
=over 4
-=item *
-
-B<*> is a wild-card
+=item Z<>B<*> is a wild-card
-=item *
-
-B<(\\d+)> in the info column gives the number of Unicode code points matched
+=item B<(\\d+)> in the info column gives the number of Unicode code points matched
by this property.
-=item *
-
-B<$DEPRECATED> means this is deprecated.
-
-=item *
+=item B<$DEPRECATED> means this is deprecated.
-B<$OBSOLETE> means this is obsolete.
+=item B<$OBSOLETE> means this is obsolete.
-=item *
+=item B<$STABILIZED> means this is stabilized.
-B<$STABILIZED> means this is stabilized.
+=item B<$STRICTER> means tighter (stricter) name matching applies.
-=item *
-
-B<$STRICTER> means tighter (stricter) name matching applies.
-
-=item *
-
-B<$DISCOURAGED> means use of this form is discouraged, and may not be
+=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
stable.
=back
foreach my $prop_alias ($property->aliases) {
my $prop_alias_name = standardize($prop_alias->name);
- # If no =value, there's just one combination possibe for this
+ # If no =value, there's just one combination possible for this
if (! $value_name) {
# The property may be suppressed, but there may be a proxy
# the children.
make_re_pod_entries($table) if defined $pod_directory;
- # See if the the table matches identical code points with
- # something that has already been output. In that case,
- # no need to have two files with the same code points in
- # them. We use the table's hash() method to store these
- # in buckets, so that it is quite likely that if two
- # tables are in the same bucket they will be identical, so
- # don't have to compare tables frequently. The tables
- # have to have the same status to share a file, so add
- # this to the bucket hash. (The reason for this latter is
- # that Heavy.pl associates a status with a file.)
- # We don't check tables that are inverses of others, as it
- # would lead to some coding complications, and checking
- # all the regular ones should find everything.
+ # See if the table matches identical code points with
+ # something that has already been processed and is ready
+ # for output. In that case, no need to have two files
+ # with the same code points in them. We use the table's
+ # hash() method to store these in buckets, so that it is
+ # quite likely that if two tables are in the same bucket
+ # they will be identical, so don't have to compare tables
+ # frequently. The tables have to have the same status to
+ # share a file, so add this to the bucket hash. (The
+ # reason for this latter is that Heavy.pl associates a
+ # status with a file.) We don't check tables that are
+ # inverses of others, as it would lead to some coding
+ # complications, and checking all the regular ones should
+ # find everything.
if ($table->complement == 0) {
my $hash = $table->hash . ';' . $table->status;
foreach my $comparison
(@{$match_tables_to_write{$hash}})
{
- if ($table->matches_identically_to($comparison)) {
+ # If the table doesn't point back to this one, we
+ # see if it matches identically
+ if ( $comparison->leader != $table
+ && $table->matches_identically_to($comparison))
+ {
$table->set_equivalent_to($comparison,
Related => 0);
next TABLE;
# the table. That is, all the property-values given
# by this table. By agreement with Unicode::UCD,
# if the name and full name are identical, and there
- # are no other names, drop the duplcate entry to save
+ # are no other names, drop the duplicate entry to save
# memory.
if (@values_list == 2
&& $values_list[0] eq $values_list[1])
sub generate_tests($$$$$) {
# This used only for making the test script. It generates test cases that
- # are expected to compile successfully in perl. Note that the lhs and
- # rhs are assumed to already be as randomized as the caller wants.
+ # are expected to compile successfully in perl. Note that the LHS and
+ # RHS are assumed to already be as randomized as the caller wants.
my $lhs = shift; # The property: what's to the left of the colon
# or equals separator
my $rhs = shift; # The property value; what's to the right
my $valid_code = shift; # A code point that's known to be in the
- # table given by lhs=rhs; undef if table is
+ # table given by LHS=RHS; undef if table is
# empty
my $invalid_code = shift; # A code point known to not be in the table;
# undef if the table is all code points
# colon or equals separator
my $rhs = shift; # The property value; what's to the right
my $already_in_error = shift; # Boolean; if true it's known that the
- # unmodified lhs and rhs will cause an error.
+ # unmodified LHS and RHS will cause an error.
# This routine should not force another one
# Get the colon or equal
my $separator = generate_separator($lhs);
# Since this is an error only, don't bother to randomly decide whether to
- # put the error on the left or right side; and assume that the rhs is
+ # put the error on the left or right side; and assume that the RHS is
# loosely matched, again for convenience rather than rigor.
$rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
# or multiple lines. main::write doesn't count the lines.
my @output;
+ push @output, <<'EOF_CODE';
+Error('\p{Script=InGreek}'); # Bug #69018
+Test_GCB("1100 $nobreak 1161"); # Bug #70940
+Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
+Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
+Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
+
+# Make sure this gets tested; it was not part of the official test suite at
+# the time this was added. Note that this is as it would appear in the
+# official suite, and gets modified to check for the perl tailoring by
+# Test_WB()
+Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
+Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
+EOF_CODE
+
# Sort these so get results in same order on different runs of this
# program
- foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
- foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
+ foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
+ or
+ lc $a->name cmp lc $b->name
+ } property_ref('*'))
+ {
+ foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
+ or
+ lc $a->name cmp lc $b->name
+ } $property->tables)
+ {
# Find code points that match, and don't match this table.
my $valid = $table->get_valid_code_point;
# conflating is possible. In our example, we
# don't want 2/3 matching 7/10, if there is
# a 7/10 code point.
+
+ # First, integers are not in the rationals
+ # table. Don't generate an error if this
+ # rounds to an integer using the given
+ # precision.
+ my $round = sprintf "%.0f", $table_name;
+ next PLACE if abs($table_name - $round)
+ < $MAX_FLOATING_SLOP;
+
+ # Here, isn't close enough to an integer to be
+ # confusable with one. Now, see it it's
+ # "close" to a known rational
for my $existing
(keys %nv_floating_to_rational)
{
}
}
}
- }
- }
+ $table->DESTROY();
+ }
+ $property->DESTROY();
+ }
+
+ # Make any test of the boundary (break) properties TODO if the code
+ # doesn't match the version being compiled
+ my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
+ ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
+ : "\nsub TODO_FAILING_BREAKS { 0 }\n";
+
+ @output= map {
+ map s/^/ /mgr,
+ map "$_;\n",
+ split /;\n/, $_
+ } @output;
+
+ # Cause there to be 'if' statements to only execute a portion of this
+ # long-running test each time, so that we can have a bunch of .t's running
+ # in parallel
+ my $chunks = 10 # Number of test files
+ - 1 # For GCB & SB
+ - 1 # For WB
+ - 4; # LB split into this many files
+ my @output_chunked;
+ my $chunk_count=0;
+ my $chunk_size= int(@output / $chunks) + 1;
+ while (@output) {
+ $chunk_count++;
+ my @chunk= splice @output, 0, $chunk_size;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ @chunk,
+ "}\n";
+ }
+
+ $chunk_count++;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ (map {" Test_GCB('$_');\n"} @backslash_X_tests),
+ (map {" Test_SB('$_');\n"} @SB_tests),
+ "}\n";
+
+
+ $chunk_size= int(@LB_tests / 4) + 1;
+ @LB_tests = map {" Test_LB('$_');\n"} @LB_tests;
+ while (@LB_tests) {
+ $chunk_count++;
+ my @chunk= splice @LB_tests, 0, $chunk_size;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ @chunk,
+ "}\n";
+ }
+
+ $chunk_count++;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ (map {" Test_WB('$_');\n"} @WB_tests),
+ "}\n";
&write($t_path,
0, # Not utf8;
[$HEADER,
+ $TODO_FAILING_BREAKS,
<DATA>,
- @output,
- (map {"Test_GCB('$_');\n"} @backslash_X_tests),
- (map {"Test_SB('$_');\n"} @SB_tests),
- (map {"Test_WB('$_');\n"} @WB_tests),
- "Finished();\n"
+ @output_chunked,
+ "Finished();\n",
]);
return;
Property => 'Joining_Type',
Has_Missings_Defaults => $NOT_IGNORED,
),
+ Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
+ Skip => 'This file adds no new information not already'
+ . ' present in other files',
+ # And it's unnecessary programmer work to handle this new
+ # format. Previous Derived files actually had bug fixes
+ # in them that were useful, but that should not be the
+ # case here.
+ ),
Input_file->new('Jamo.txt', v2.0.0,
Property => 'Jamo_Short_Name',
Each_Line_Handler => \&filter_jamo_line,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Line_Break',
# Early versions had problematic syntax
- Each_Line_Handler => (($v_version lt v3.1.0)
- ? \&filter_early_ea_lb
- : undef),
+ Each_Line_Handler => ($v_version ge v3.1.0)
+ ? undef
+ : ($v_version lt v3.0.0)
+ ? \&filter_substitute_lb
+ : \&filter_early_ea_lb,
+ # Must use long names for property values see comments at
+ # sub filter_substitute_lb
+ Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
+ 'Alphabetic', # default to this because XX ->
+ # AL
+
+ # Don't use _Perl_LB as a synonym for
+ # Line_Break in later perls, as it is tailored
+ # and isn't the same as Line_Break
+ 'ONLY_EARLY' ],
),
Input_file->new('EastAsianWidth.txt', v3.0.0,
Property => 'East_Asian_Width',
# for the release it is. To get it to actually mean
# something useful, someone would have to be using an
# earlier Unicode release, and copy it into the directory
- # for that release and recomplile. So far there has been
+ # for that release and recompile. So far there has been
# no demand to do that, so this hasn't been implemented.
Skip => 'Documentation of corrections already '
. 'incorporated into the Unicode data base',
),
Input_file->new('StandardizedVariants.html', v3.2.0,
- Skip => 'Provides a visual display of the standard '
+ Skip => 'Obsoleted as of Unicode 9.0, but previously '
+ . 'provided a visual display of the standard '
. 'variant sequences derived from '
. 'F<StandardizedVariants.txt>.',
# I don't know why the html came earlier than the
Skip => $Documentation,
),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
- Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
+ Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
+
+ # Don't use _Perl_WB as a synonym for
+ # Word_Break in later perls, as it is tailored
+ # and isn't the same as Word_Break
+ 'ONLY_EARLY' ],
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
),
Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
Handler => \&process_GCB_test,
+ retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
Handler => \&process_SB_test,
+ retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
Handler => \&process_WB_test,
+ retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
. 'F<NamedSequences.txt> and recompile perl',
),
Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
- Skip => $Validation,
+ Handler => \&process_LB_test,
+ retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
Skip => $Validation_Documentation,
Property => 'Indic_Positional_Category',
Has_Missings_Defaults => $NOT_IGNORED,
),
+ Input_file->new('TangutSources.txt', v9.0.0,
+ Skip => 'Specifies source mappings for Tangut ideographs'
+ . ' and components. This data file also includes'
+ . ' informative radical-stroke values that are used'
+ . ' internally by Unicode',
+ ),
+ Input_file->new('VerticalOrientation.txt', v10.0.0,
+ Property => 'Vertical_Orientation',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
+ Input_file->new('NushuSources.txt', v10.0.0,
+ Skip => 'Specifies source material for Nushu characters',
+ ),
);
# End of all the preliminaries.
}
}
if (! $rebuild) {
- print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
+ print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
exit(0);
}
-print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
+print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
# Ready to do the major processing. First create the perl pseudo-property.
$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
}
print "\nAll done\n" if $verbosity >= $VERBOSE;
}
+
+if ($version_of_mk_invlist_bounds lt $v_version) {
+ Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
+ . " to be checked and possibly updated to Unicode"
+ . " $string_version");
+}
+
exit(0);
# TRAILING CODE IS USED BY make_property_test_script()
$Tests++;
# A string eval is needed because of the 'no warnings'.
- # Assumes no parens in the regular expression
+ # Assumes no parentheses in the regular expression
my $result = eval "$no_warnings
my \$RegObj = qr($regex);
$string =~ \$RegObj ? 1 : 0";
my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
utf8::upgrade($nobreak_utf8);
-use Config;
+my $are_ctype_locales_available;
my $utf8_locale;
chdir 't' if -d 't';
eval { require "./loc_tools.pl" };
-$utf8_locale = &find_utf8_ctype_locale if defined &find_utf8_ctype_locale;
+if (defined &locales_enabled) {
+ $are_ctype_locales_available = locales_enabled('LC_CTYPE');
+ if ($are_ctype_locales_available) {
+ $utf8_locale = &find_utf8_ctype_locale;
+ }
+}
+
+# Eval'd so can run on versions earlier than the property is available in
+my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
+if (! defined $WB_Extend_or_Format_re) {
+ $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
+}
sub _test_break($$) {
# Test various break property matches. The 2nd parameter gives the
# property name. The input is a line from auxiliary/*Test.txt for the
- # given property. Each such line is a sequence of code points given by
- # their hex numbers, separated by the two characters defined just before
- # this subroutine that indicate that either there can or cannot be a break
- # between the adjacent code points. All these are tested.
+ # given property. Each such line is a sequence of Unicode (not native)
+ # code points given by their hex numbers, separated by the two characters
+ # defined just before this subroutine that indicate that either there can
+ # or cannot be a break between the adjacent code points. All these are
+ # tested.
#
# For the gcb property extra tests are made. if there isn't a break, that
# means the sequence forms an extended grapheme cluster, which means that
my $break_type = shift;
my $line = (caller 1)[2]; # Line number
+ my $comment = "";
+
+ if ($template =~ / ( .*? ) \s* \# (.*) /x) {
+ $template = $1;
+ $comment = $2;
+
+ # Replace leading spaces with a single one.
+ $comment =~ s/ ^ \s* / # /x;
+ }
# The line contains characters above the ASCII range, but in Latin1. It
# may or may not be in utf8, and if it is, it may or may not know it. So,
$template =~ s/$breakable_utf8/$breakable/g;
}
+ # Perl customizes wb. So change the official tests accordingly
+ if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
+
+ # Split into elements that alternate between code point and
+ # break/no-break
+ my @line = split / +/, $template;
+
+ # Look at each code point and its following one
+ for (my $i = 1; $i < @line - 1 - 1; $i+=2) {
+
+ # The customization only involves changing some breaks to
+ # non-breaks.
+ next if $line[$i+1] =~ /$nobreak/;
+
+ my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
+ my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
+
+ # And it only affects adjacent space characters.
+ next if $lhs !~ /\s/u;
+
+ # But, we want to make sure to test spaces followed by a Extend
+ # or Format.
+ next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
+
+ # To test the customization, add some white-space before this to
+ # create a span. The $lhs white space may or may not be bound to
+ # that span, and also with the $rhs. If the $rhs is a binding
+ # character, the $lhs is bound to it and not to the span, unless
+ # $lhs is vertical space. In all other cases, the $lhs is bound
+ # to the span. If the $rhs is white space, it is bound to the
+ # $lhs
+ my $bound;
+ my $span;
+ if ($rhs =~ /$WB_Extend_or_Format_re/) {
+ if ($lhs =~ /\v/) {
+ $bound = $breakable;
+ $span = $nobreak;
+ }
+ else {
+ $bound = $nobreak;
+ $span = $breakable;
+ }
+ }
+ else {
+ $span = $nobreak;
+ $bound = $nobreak;
+ }
+
+ splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
+ $i += 4;
+ $line[$i+1] = $bound;
+ }
+ $template = join " ", @line;
+ }
+
# The input is just the break/no-break symbols and sequences of Unicode
# code points as hex digits separated by spaces for legibility. e.g.:
# ÷ 0020 × 0308 ÷ 0020 ÷
$display_upgrade = " (utf8-upgraded)";
}
- # The /l modifier has C after it to indicate the locale to try
- my @modifiers = qw(a aa d lC u i);
- push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+ my @modifiers = qw(a aa d u i);
+ if ($are_ctype_locales_available) {
+ push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+ # The /l modifier has C after it to indicate the locale to try
+ push @modifiers, "lC";
+ }
# Test for each of the regex modifiers.
for my $modifier (@modifiers) {
if ($modifier =~ / ^ l (.*) /x) {
my $locale = $1;
$display_locale = "(locale = $locale)";
- use Config;
- if (defined $Config{d_setlocale}) {
- eval { require POSIX; import POSIX 'locale_h'; };
- if (defined &POSIX::LC_CTYPE) {
- POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
- }
- }
+ POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
$modifier = 'l';
}
my $pattern = "(?$modifier:$break_pattern)";
# Actually do the test
+ my $matched_text;
my $matched = $string =~ qr/$pattern/;
- print "not " unless $matched;
+ if ($matched) {
+ $matched_text = "matched";
+ }
+ else {
+ $matched_text = "failed to match";
+ print "not ";
+
+ if (TODO_FAILING_BREAKS) {
+ $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
+ $comment =~ s/#/# TODO/;
+ }
+ }
+ print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
- # Fancy display of test results
- $matched = ($matched) ? "matched" : "failed to match";
- print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+ # Only print the comment on the first use of this line
+ $comment = "";
# Repeat with the first \B{} in the pattern. This makes sure the
# code in regexec.c:find_byclass() for \B gets executed
my $B_pattern = "$1$2";
$matched = $string =~ qr/$B_pattern/;
print "not " unless $matched;
- $matched = ($matched) ? "matched" : "failed to match";
- print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+ $matched_text = ($matched) ? "matched" : "failed to match";
+ print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
+ print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
+ print "\n";
}
}
}
print " correctly matched $should_display[$i]; line $line\n";
} else {
- $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
+ $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
split "", $matches[$i]);
- print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
+ print "not ok $Tests -";
+ print " # TODO" if TODO_FAILING_BREAKS;
+ print " In \"$display_string\" =~ /(\\X)/g, \\X #",
$i + 1,
" should have matched $should_display[$i]",
" but instead matched $matches[$i]",
if (@matches == @should_match) {
print "ok $Tests - Nothing was left over; line $line\n";
} else {
- print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
+ print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
+ print " # TODO" if TODO_FAILING_BREAKS;
+ print "\n";
}
}
_test_break(shift, 'gcb');
}
+sub Test_LB($) {
+ _test_break(shift, 'lb');
+}
+
sub Test_SB($) {
_test_break(shift, 'sb');
}
exit($Fails ? -1 : 0);
}
-Error('\p{Script=InGreek}'); # Bug #69018
-Test_GCB("1100 $nobreak 1161"); # Bug #70940
-Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
-Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
-Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726