# compatible, but that has now been abandoned, and newer constructs are used
# as convenient.
+# NOTE: this script can run quite slowly in older/slower systems.
+# It can also consume a lot of memory (128 MB or more), you may need
+# to raise your process resource limits (e.g. in bash, "ulimit -a"
+# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
+
my $start_time;
BEGIN { # Get the time the script started running; do it at compilation to
# get it as close as possible
sub NON_ASCII_PLATFORM { ord("A") != 65 }
+# When a new version of Unicode is published, unfortunately the algorithms for
+# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
+# manually. The changes may or may not be backward compatible with older
+# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the
+# changes, then come back here and set the variable below to what version the
+# code is expecting. If a newer version of Unicode is being compiled than
+# expected, a warning will be generated. If an older version is being
+# compiled, any bounds tests that fail in the generated test file (-maketest
+# option) will be marked as TODO.
+my $version_of_mk_invlist_bounds = v10.0.0;
+
##########################################################################
#
# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
#
# trace ... if main::DEBUG && $to_trace;
#
+# main::stack_trace() will display what its name implies
+#
# If there is just one or a few files that you're debugging, you can easily
# cause most everything else to be skipped. Change the line
#
#
# A NOTE ON UNIHAN
#
-# This program can generate tables from the Unihan database. But that db
+# This program can generate tables from the Unihan database. But that DB
# isn't normally available, so it is marked as optional. Prior to version
# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
# was split into 8 different files, all beginning with the letters 'Unihan'.
# handled by Unicode::Normalize, nor will it compile when presented a version
# that has them. However, you can trivially get it to compile by simply
# ignoring those decompositions, by changing the croak to a carp. At the time
-# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
-# cpan/Unicode-Normalize/mkheader) reads
+# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
+# dist/Unicode-Normalize/mkheader) reads
#
# croak("Weird Canonical Decomposition of U+$h");
#
}
}
+sub stack_trace() {
+ local $to_trace = 1 if main::DEBUG;
+ my $line = (caller(0))[2];
+ my $i = 1;
+
+ # Accumulate the stack trace
+ while (1) {
+ my ($pkg, $file, $caller_line, $caller) = caller $i++;
+
+ last unless defined $caller;
+
+ trace "called from $caller() at line $line";
+ $line = $caller_line;
+ }
+}
+
# This is for a rarely used development feature that allows you to compare two
# versions of the Unicode standard without having to deal with changes caused
# by the code points introduced in the later version. You probably also want
# to use the -annotate option when using this. Run this program on a unicore
# containing the starting release you want to compare. Save that output
-# structrue. Then, switching to a unicore with the ending release, change the
+# structure. Then, switching to a unicore with the ending release, change the
# 0 in the $string_compare_versions definition just below to a string
# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
# to the starting release. This program will then compile, but throw away all
# common to both releases, and you can see the changes caused just by the
# underlying release semantic changes. For versions earlier than 3.2, you
# must copy a version of DAge.txt into the directory.
-my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
+my $string_compare_versions = DEBUG && ""; # e.g., "2.1";
my $compare_versions = DEBUG
&& $string_compare_versions
&& pack "C*", split /\./, $string_compare_versions;
$verbosity = 0;
}
elsif ($arg eq '-w') {
- $write_unchanged_files = 1; # update the files even if havent changed
+ # update the files even if they haven't changed
+ $write_unchanged_files = 1;
}
elsif ($arg eq '-check') {
my $this = shift @ARGV;
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
-# We work with above-Unicode code points, up to UV_MAX. But when you get
-# that high, above IV_MAX, some operations don't work, and you can easily get
-# overflow. Therefore for internal use, we use a much smaller number,
-# translating it to UV_MAX only for output. The exact number is immaterial
-# (all Unicode code points are treated exactly the same), but the algorithm
-# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
+# We work with above-Unicode code points, up to IV_MAX, but we may want to use
+# sentinels above that number. Therefore for internal use, we use a much
+# smaller number, translating it to IV_MAX only for output. The exact number
+# is immaterial (all Unicode code points are treated exactly the same), but
+# the algorithm requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
-my $MAX_PLATFORM_CODEPOINT = ~0;
+my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
# Matches legal code point. 4-6 hex numbers, If there are 6, the first
# two must be 10; if there are 5, the first must not be a 0. Written this way
qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
my $code_point_re = qr/\b$run_on_code_point_re/;
-# This matches the beginning of the line in the Unicode db files that give the
+# This matches the beginning of the line in the Unicode DB files that give the
# defaults for code points not listed (i.e., missing) in the file. The code
# depends on this ending with a semi-colon, so it can assume it is a valid
# field when the line is split() by semi-colons
my $needing_code_points_ending_in_code_point = 0;
my @backslash_X_tests; # List of tests read in for testing \X
+my @LB_tests; # List of tests read in for testing \b{lb}
my @SB_tests; # List of tests read in for testing \b{sb}
my @WB_tests; # List of tests read in for testing \b{wb}
my @unhandled_properties; # Will contain a list of properties found in
$viacode[$i] = $perl_charname->value_of($i) || "";
$age[$i] = (defined $age)
- ? (($age->value_of($i) =~ / ^ \d \. \d $ /x)
+ ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
? $age->value_of($i)
: "")
: "";
# 1) call before the first line is read, for pre processing
# 2) call to adjust each line of the input before the main handler gets
# them. This can be automatically generated, if appropriately simple
-# enough, by specifiying a Properties parameter in the constructor.
+# enough, by specifying a Properties parameter in the constructor.
# 3) call upon EOF before the main handler exits its loop
# 4) call at the end, for post processing
#
# each_line_handler()s. So, if the format of the line is not in the desired
# format for the main handler, these are used to do that adjusting. They can
# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
-# so the $_ output of one is used as the input to the next. The eof handler
+# so the $_ output of one is used as the input to the next. The EOF handler
# is also stackable, but none of the others are, but could easily be changed
# to be so.
#
# not otherwise be processed, and to not raise a warning about not being
# handled. In the constructor call, any value that evaluates to a numeric
# 0 or undef means don't skip. Any other value is a string giving the
- # reason it is being skippped, and this will appear in generated pod.
+ # reason it is being skipped, and this will appear in generated pod.
# However, an empty string reason will suppress the pod entry.
# Internally, calls that evaluate to numeric 0 are changed into undef to
# distinguish them from an empty string call.
# 'handler'
main::set_access('each_line_handler', \%each_line_handler, 'c');
+ my %retain_trailing_comments;
+ # This is used to not discard the comments that end data lines. This
+ # would be used only for files with non-typical syntax, and most code here
+ # assumes that comments have been stripped, so special handlers would have
+ # to be written. It is assumed that the code will use these in
+ # single-quoted contexts, and so any "'" marks in the comment will be
+ # prefixed by a backslash.
+ main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
+
my %properties; # Optional ordered list of the properties that occur in each
# meaningful line of the input file. If present, an appropriate
# each_line_handler() is automatically generated and pushed onto the stack
# of such handlers. This is useful when a file contains multiple
- # proerties per line, but no other special considerations are necessary.
+ # properties per line, but no other special considerations are necessary.
# The special value "<ignored>" means to discard the corresponding input
# field.
# Any @missing lines in the file should also match this syntax; no such
# may not be the actual reality, but probably nobody cares anyway for
# these obsolete characters.)
#
+ # [3] if present is the default value for the property to assign for code
+ # points not given in the input. If not present, the default from the
+ # normal property is used
+ #
+ # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
+ # it means to not add the name in [1] as an alias to the property name
+ # used for these. Normally, when compiling Unicode versions that don't
+ # invoke the early handling, the name is added as a synonym.
+ #
# Not all files can be handled in the above way, and so the code ref
# alternative is available. It can do whatever it needs to. The other
# array elements are optional in this case, and the code is free to use or
# makes for easier testing later on.
main::set_access('early', \%early, 'c');
+ my %only_early;
+ main::set_access('only_early', \%only_early, 'c');
+
my %required_even_in_debug_skip;
# debug_skip is used to speed up compilation during debugging by skipping
# processing files that are not needed for the task at hand. However,
# Set defaults
$handler{$addr} = \&main::process_generic_property_file;
+ $retain_trailing_comments{$addr} = 0;
$non_skip{$addr} = 0;
$skip{$addr} = undef;
$has_missings_defaults{$addr} = $NO_DEFAULTS;
my $progress;
my $function_instead_of_file = 0;
+ if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
+ $only_early{$addr} = 1;
+ pop $early{$addr}->@*;
+ }
+
# If we are compiling a Unicode release earlier than the file became
# available, the constructor may have supplied a substitute
if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
unshift $early{$addr}->@*, 1;
# See the definition of %early for what the array elements mean.
+ # Note that we have just unshifted onto the array, so the numbers
+ # below are +1 of those in the %early description.
# If we have a property this defines, create a table and default
# map for it now (at essentially compile time), so that it will be
# available for the whole of run time. (We will want to add this
Perl_Extension => 1,
);
- # Use the default mapping for the regular property for this
- # substitute one.
- if ( defined $property{$addr}
- && defined $default_mapping{$property{$addr}})
+ # If not specified by the constructor, use the default mapping
+ # for the regular property for this substitute one.
+ if ($early{$addr}[4]) {
+ $prop_object->set_default_map($early{$addr}[4]);
+ }
+ elsif ( defined $property{$addr}
+ && defined $default_mapping{$property{$addr}})
{
$prop_object
->set_default_map($default_mapping{$property{$addr}});
# once per file, as it destroy's the EOF handlers
# flag to make sure extracted files are processed early
- state $seen_non_extracted_non_age = 0;
+ state $seen_non_extracted = 0;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
$handle{$addr} = 'pretend_is_open';
}
else {
- if ($seen_non_extracted_non_age) {
+ if ($seen_non_extracted) {
if ($file =~ /$EXTRACTED/i) # Some platforms may change the
# case of the file's name
{
# We only do this check for generic property files
&& $handler{$addr} == \&main::process_generic_property_file
- && $file !~ /$EXTRACTED/i
- && lc($file) ne 'dage.txt')
+ && $file !~ /$EXTRACTED/i)
{
# We don't set this (by the 'if' above) if we have no
# extracted directory, so if running on an early version,
# this test won't work. Not worth worrying about.
- $seen_non_extracted_non_age = 1;
+ $seen_non_extracted = 1;
}
# Mark the file as having being processed, and warn if it
# official property, we still have to allow the publicly
# inaccessible early name so that the core code which uses it
# will work regardless.
- if (! $early{$addr}[0] && $early{$addr}->@* > 2) {
+ if ( ! $only_early{$addr}
+ && ! $early{$addr}[0]
+ && $early{$addr}->@* > 2)
+ {
my $early_property_name = $early{$addr}[2];
if ($property{$addr} ne $early_property_name) {
main::property_ref($property{$addr})
next;
}
- # Remove comments and trailing space, and skip this line if the
- # result is empty
- s/#.*//;
+ # Unless to keep, remove comments. If to keep, ignore
+ # comment-only lines
+ if ($retain_trailing_comments{$addr}) {
+ next if / ^ \s* \# /x;
+
+ # But escape any single quotes (done in both the comment and
+ # non-comment portion; this could be a bug someday, but not
+ # likely)
+ s/'/\\'/g;
+ }
+ else {
+ s/#.*//;
+ }
+
+ # Remove trailing space, and skip this line if the result is empty
s/\s+$//;
next if /^$/;
# Not currently used, not fully tested.
# sub peek {
-# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
+# # Non-destructive lookahead one non-adjusted, non-comment, non-blank
# # record. Not callable from an each_line_handler(), nor does it call
# # an each_line_handler() on the line.
#
return $self->_add_delete('+', @_);
}
+ sub replace_map {
+ # Replace a range
+
+ my $self = shift;
+
+ return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
+ }
+
sub add_duplicate {
# Adds entry to a range list which can duplicate an existing entry
# used to override calculations.
main::set_access('format', \%format, 'r', 'p_s');
+ my %has_dependency;
+ # A boolean that gives whether some other table in this property is
+ # defined as the complement of this table. This is a crude, but currently
+ # sufficient, mechanism to make this table not get destroyed before what
+ # is dependent on it is. Other dependencies could be added, so the name
+ # was chosen to reflect a more general situation than actually is
+ # currently the case.
+ main::set_access('has_dependency', \%has_dependency, 'r', 's');
+
sub new {
# All arguments are key => value pairs, which you can see below, most
# of which match fields documented above. Otherwise: Re_Pod_Entry,
$note{$addr} = [ ];
$file_path{$addr} = [ ];
$locked{$addr} = "";
+ $has_dependency{$addr} = 0;
push @{$description{$addr}}, $description if $description;
push @{$note{$addr}}, $note if $note;
}
# Look at each alias
+ my $is_last_resort = 0;
+ my $deprecated_or_discouraged
+ = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
foreach my $alias ($self->aliases()) {
# Don't use an alias that isn't ok to use for an external name.
my $name = main::Standardize($alias->name);
trace $self, $name if main::DEBUG && $to_trace;
- # Take the first one, or a shorter one that isn't numeric. This
+ # Take the first one, or any non-deprecated non-discouraged one
+ # over one that is, or a shorter one that isn't numeric. This
# relies on numeric aliases always being last in the array
# returned by aliases(). Any alpha one will have precedence.
- if (! defined $short_name{$addr}
+ if ( ! defined $short_name{$addr}
+ || ( $is_last_resort
+ && $alias->status !~ $deprecated_or_discouraged)
|| ($name =~ /\D/
&& length($name) < length($short_name{$addr})))
{
($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
$nominal_short_name_length{$addr} = length $name;
+ $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
}
}
# If the short name isn't a nice one, perhaps an equivalent table has
# a better one.
- if (! defined $short_name{$addr}
- || $short_name{$addr} eq ""
- || $short_name{$addr} eq "_")
+ if ( $self->can('children')
+ && ( ! defined $short_name{$addr}
+ || $short_name{$addr} eq ""
+ || $short_name{$addr} eq "_"))
{
my $return;
foreach my $follower ($self->children) { # All equivalents
}
if ($write_as_invlist) {
+ if ( $previous_end > 0
+ && $output_range_counts{$addr})
+ {
+ my $complement_count = $start - $previous_end - 1;
+ if ($complement_count > 1) {
+ $OUT[-1] = merge_single_annotation_line(
+ $OUT[-1],
+ "#"
+ . (" " x 17)
+ . "["
+ . main::clarify_code_point_count(
+ $complement_count)
+ . "] in complement\n",
+ $comment_indent);
+ }
+ }
# Inversion list format has a single number per line,
# the starting code point of a range that matches the
$range_name = "Hangul Syllable";
}
- if ($i != $start || $range_end < $end) {
+ # If the annotation would just repeat what's
+ # already being output as the range, skip it.
+ # (When an inversion list is being written, it
+ # isn't a repeat, as that always is in
+ # decimal)
+ if ( $write_as_invlist
+ || $i != $start
+ || $range_end < $end)
+ {
if ($range_end < $MAX_WORKING_CODEPOINT)
{
$annotation = sprintf "%04X..%04X",
else { # Indent if not displaying code points
$annotation = " " x 4;
}
+
if ($range_name) {
$annotation .= " $age[$i]" if $age[$i];
$annotation .= " $range_name";
sub set_default_map {
# Define what code points that are missing from the input files should
- # map to
+ # map to. The optional second parameter 'full_name' indicates to
+ # force using the full name of the map instead of its standard name.
my $self = shift;
my $map = shift;
+ my $use_full_name = shift // 0;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ if ($use_full_name && $use_full_name ne 'full_name') {
+ Carp::my_carp_bug("Second parameter to set_default_map() if"
+ . " present, must be 'full_name'");
+ }
+
my $addr = do { no overloading; pack 'J', $self; };
# Convert the input to the standard equivalent, if any (won't have any
# for $STRING properties)
- my $standard = $self->_find_table_from_alias->{$map};
- $map = $standard->name if defined $standard;
+ my $standard = $self->property->table($map);
+ if (defined $standard) {
+ $map = ($use_full_name)
+ ? $standard->full_name
+ : $standard->name;
+ }
# Warn if there already is a non-equivalent default map for this
# property. Note that a default map can be a ref, which means that
# Accessors for the underlying list that should fail if locked.
for my $sub (qw(
add_duplicate
+ replace_map
))
{
no strict "refs";
# complement's if it has one.
my $self = shift;
- my $complement;
- if (($complement = $self->complement) != 0) {
- return ~ $complement->_range_list;
- }
- else {
- return $self->SUPER::_range_list;
+ my $complement = $self->complement;
+
+ # In order to avoid re-complementing on each access, only do the
+ # complement the first time, and store the result in this table's
+ # range list to use henceforth. However, this wouldn't work if the
+ # controlling (complement) table changed after we do this, so lock it.
+ # Currently, the value of the complement isn't needed until after it
+ # is fully constructed, so this works. If this were to change, the
+ # each_range iteration functionality would no longer work on this
+ # complement.
+ if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
+ $self->_set_range_list($self->SUPER::_range_list
+ + ~ $complement->_range_list);
+ $complement->lock;
}
+
+ return $self->SUPER::_range_list;
}
sub add_alias {
}
my $addr = do { no overloading; pack 'J', $self; };
$complement{$addr} = $other;
+
+ # Be sure the other property knows we are depending on them; or the
+ # other table if it is one in the current property.
+ if ($self->property != $other->property) {
+ $other->property->set_has_dependency(1);
+ }
+ else {
+ $other->set_has_dependency(1);
+ }
$self->lock;
return;
}
main::set_access('pre_declared_maps',
\%pre_declared_maps, 'r', 's');
+ my %has_dependency;
+ # A boolean that gives whether some table somewhere is defined as the
+ # complement of a table in this property. This is a crude, but currently
+ # sufficient, mechanism to make this property not get destroyed before
+ # what is dependent on it is. Other dependencies could be added, so the
+ # name was chosen to reflect a more general situation than actually is
+ # currently the case.
+ main::set_access('has_dependency', \%has_dependency, 'r', 's');
+
sub new {
# The only required parameter is the positionally first, name. All
# other parameters are key => value pairs. See the documentation just
$has_only_code_point_maps{$addr} = 1;
$table_ref{$addr} = { };
$unique_maps{$addr} = { };
+ $has_dependency{$addr} = 0;
$map{$addr} = Map_Table->new($name,
Full_Name => $full_name{$addr},
range_count
ranges
range_size_1
+ replace_map
reset_each_range
set_comment
set_default_map
END
}
- if (-e 'LineBreak.txt') {
- push @return, split /\n/, <<'END';
+ if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
+ my @lb = split /\n/, <<'END';
lb ; AI ; Ambiguous
lb ; AL ; Alphabetic
lb ; B2 ; Break_Both
lb ; XX ; Unknown
lb ; ZW ; ZWSpace
END
+ # If this Unicode version predates the lb property, we use our
+ # substitute one
+ if (-e 'LBsubst.txt') {
+ $_ = s/^lb/_Perl_LB/r for @lb;
+ }
+ push @return, @lb;
}
if (-e 'DNormalizationProps.txt') {
. $CMD_DELIM
. $fields[$CHARNAME];
}
- elsif ($fields[$CHARNAME] =~ /^CJK/) {
+ elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter
+
+ # All the CJK ranges like this have the name given as a
+ # special case in the next code line. And for the others, we
+ # hope that Unicode continues to use the correct name in
+ # future releases, so we don't have to make further special
+ # cases.
+ my $name = ($fields[$CHARNAME] =~ /^CJK/)
+ ? 'CJK UNIFIED IDEOGRAPH'
+ : uc $fields[$CHARNAME];
# The name for these contains the code point itself, and all
# are defined to have the same base name, regardless of what
. '='
. $CP_IN_NAME
. $CMD_DELIM
- . 'CJK UNIFIED IDEOGRAPH';
+ . $name;
}
elsif ($fields[$CATEGORY] eq 'Co'
return;
}
+sub process_LB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @LB_tests, $_;
+ }
+
+ return;
+}
+
sub process_SB_test {
my $file = shift;
}
}
+sub filter_substitute_lb {
+ # Used on Unicodes that predate the LB property, where there is a
+ # substitute file. This just does the regular ea_lb handling for such
+ # files, and then substitutes the long property value name for the short
+ # one that comes with the file. (The other break files have the long
+ # names in them, so this is the odd one out.) The reason for doing this
+ # kludge is that regen/mk_invlists.pl is expecting the long name. This
+ # also fixes the typo 'Inseperable' that leads to problems.
+
+ filter_early_ea_lb;
+ return unless $_;
+
+ my @fields = split /\s*;\s*/;
+ $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
+ $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
+ $_ = join '; ', @fields;
+}
+
sub filter_old_style_arabic_shaping {
# Early versions used a different term for the later one.
# For each property, fill in any missing mappings, and calculate the re
# match tables. If a property has more than one missing mapping, the
- # default is a reference to a data structure, and requires data from other
- # properties to resolve. The sort is used to cause these to be processed
- # last, after all the other properties have been calculated.
+ # default is a reference to a data structure, and may require data from
+ # other properties to resolve. The sort is used to cause these to be
+ # processed last, after all the other properties have been calculated.
# (Fortunately, the missing properties so far don't depend on each other.)
foreach my $property
(sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
END
));
+ # Make sure this assumption in perl core code is valid in this Unicode
+ # release, with known exceptions
+ foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
+ next if $range->end - $range->start == 9;
+ next if $range->start == 0x1D7CE; # This whole range was added in 3.1
+ next if $range->end == 0x19DA && $v_version eq v5.2.0;
+ next if $range->end - $range->start < 9 && $v_version le 4.0.0;
+ Carp::my_carp("Range $range unexpectedly doesn't contain 10"
+ . " decimal digits. Code in regcomp.c assumes it does,"
+ . " and will have to be fixed. Proceeding anyway.");
+ }
+
Property->new('Legacy_Case_Folding',
File => "Fold",
Directory => $map_directory,
return $Nl;
}
-sub calculate_Assigned() { # Calculate the gc != Cn code points; may be
+sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be
# called before the Cn's are completely filled.
# Works on Unicodes earlier than ones that
# explicitly specify Cn.
# since the first compare version.
my $delta = Range_List->new();
foreach my $table ($age->tables) {
+ use version;
next if $table == $age->table('Unassigned');
- next if $table->name le $string_compare_versions;
+ next if version->parse($table->name)
+ le version->parse($string_compare_versions);
$delta += $table;
}
if ($delta->is_empty) {
next if $this_block == $no_block
|| ! ($this_block & $Assigned)->is_empty;
$this_block->set_fate($SUPPRESSED, $after_first_version);
+ foreach my $range ($this_block->ranges) {
+ $block->replace_map($range->start, $range->end, 'No_Block')
+ }
$no_block += $this_block;
}
my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
Description => '\w, including beyond ASCII;'
- . ' = \p{Alnum} + \pM + \p{Pc}',
+ . ' = \p{Alnum} + \pM + \p{Pc}'
+ . ' + \p{Join_Control}',
Initialize => $Alnum + $gc->table('Mark'),
);
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
+ ord("(")
+ ord(")")
+ ord("-")
- + utf8::unicode_to_native(0xA0) # NBSP
);
my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
0x2060 .. 0x206F,
0xFE00 .. 0xFE0F,
0xFFF0 .. 0xFFFB,
- 0xE0000 .. 0xE0FFF,
]);
+ $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
$quotemeta += $temp;
}
calculate_DI();
}
}
+ # Perl tailors the WordBreak property so that \b{wb} doesn't split
+ # adjacent spaces into separate words. First create a copy of the regular
+ # WB property as '_Perl_WB'. (On Unicode releases earlier than when WB
+ # was defined for, this will already have been done by the substitute file
+ # portion for 'Input_file' code for WB.)
+ my $perl_wb = property_ref('_Perl_WB');
+ if (! defined $perl_wb) {
+ $perl_wb = Property->new('_Perl_WB',
+ Fate => $INTERNAL_ONLY,
+ Perl_Extension => 1,
+ Directory => $map_directory,
+ Type => $STRING);
+ my $wb = property_ref('Word_Break');
+ $perl_wb->initialize($wb);
+ $perl_wb->set_default_map($wb->default_map);
+ }
+
+ # And simply replace the mappings of horizontal space characters that
+ # otherwise would map to the default to instead map to our tailoring.
+ my $default = $perl_wb->default_map;
+ for my $range ($Blank->ranges) {
+ for my $i ($range->start .. $range->end) {
+ next unless $perl_wb->value_of($i) eq $default;
+ $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
+ Replace => $UNCONDITIONALLY);
+ }
+ }
+
+ # Create a version of the LineBreak property with the mappings that are
+ # omitted in the default algorithm remapped to what
+ # http://www.unicode.org/reports/tr14 says they should be.
+ #
+ # Original Resolved General_Category
+ # AI, SG, XX AL Any
+ # SA CM Only Mn or Mc
+ # SA AL Any except Mn and Mc
+ # CJ NS Any
+ #
+ # All property values are also written out in their long form, as
+ # regen/mk_invlist.pl expects that. This also fixes occurrences of the
+ # typo in early Unicode versions: 'inseperable'.
+ my $perl_lb = property_ref('_Perl_LB');
+ if (! defined $perl_lb) {
+ $perl_lb = Property->new('_Perl_LB',
+ Fate => $INTERNAL_ONLY,
+ Perl_Extension => 1,
+ Directory => $map_directory,
+ Type => $STRING);
+ my $lb = property_ref('Line_Break');
+
+ # Populate from $lb, but use full name and fix typo.
+ foreach my $range ($lb->ranges) {
+ my $full_name = $lb->table($range->value)->full_name;
+ $full_name = 'Inseparable'
+ if standardize($full_name) eq 'inseperable';
+ $perl_lb->add_map($range->start, $range->end, $full_name);
+ }
+ }
+
+ $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL
+
+ for my $range ($perl_lb->ranges) {
+ my $value = standardize($range->value);
+ if ( $value eq standardize('Unknown')
+ || $value eq standardize('Ambiguous')
+ || $value eq standardize('Surrogate'))
+ {
+ $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
+ Replace => $UNCONDITIONALLY);
+ }
+ elsif ($value eq standardize('Conditional_Japanese_Starter')) {
+ $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
+ Replace => $UNCONDITIONALLY);
+ }
+ elsif ($value eq standardize('Complex_Context')) {
+ for my $i ($range->start .. $range->end) {
+ my $gc_val = $gc->value_of($i);
+ if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
+ $perl_lb->add_map($i, $i, 'Combining_Mark',
+ Replace => $UNCONDITIONALLY);
+ }
+ else {
+ $perl_lb->add_map($i, $i, 'Alphabetic',
+ Replace => $UNCONDITIONALLY);
+ }
+ }
+ }
+ }
+
# Here done with all the basic stuff. Ready to populate the information
# about each character if annotating them.
if ($annotate) {
# the single-form, \p{name}. These are:
# All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
# \p{Is_Name} as synonyms
- # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
+ # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
# \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
# \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
# conflict, \p{Value} and \p{Is_Value} as well
property_ref('*');
push @tables, $gc->tables;
- # If the version of Unicode includes the Script property, add its tables
- push @tables, $script->tables if defined $script;
+ # If the version of Unicode includes the Script Extensions (preferably),
+ # or Script property, add its tables
+ my $scx = property_ref("Script_Extensions");
+ if (defined $scx) {
+ push @tables, $scx->tables;
+ }
+ else {
+ push @tables, $script->tables if defined $script;
+ }
# The Block tables are kept separate because they are treated differently.
# And the earliest versions of Unicode didn't include them, so add only if
if (! defined $pre_existing) {
- # No name collision, so ok to add the perl synonym.
+ # No name collision, so OK to add the perl synonym.
my $make_re_pod_entry;
my $ok_as_filename;
my $status = $alias->status;
if ($nominal_property == $block) {
- # For block properties, the 'In' form is preferred for
- # external use; the pod file contains wild cards for
- # this and the 'Is' form so no entries for those; and
- # we don't want people using the name without the
- # 'In', so discourage that.
+ # For block properties, only the compound form is
+ # preferred for external use; the others are
+ # discouraged. The pod file contains wild cards for
+ # the 'In' and 'Is' forms so no entries for those; and
+ # we don't want people using the name without any
+ # prefix, so discourage that.
if ($prefix eq "") {
$make_re_pod_entry = 1;
$status = $status || $DISCOURAGED;
}
elsif ($prefix eq 'In_') {
$make_re_pod_entry = 0;
- $status = $status || $NORMAL;
+ $status = $status || $DISCOURAGED;
$ok_as_filename = 1;
}
else {
next;
}
- # Here, there is a name collision, but it still could be ok if
+ # Here, there is a name collision, but it still could be OK if
# the tables match the identical set of code points, in which
# case, we can combine the names. Compare each table's code
# point list to see if they are identical.
# And if this is a compound form name, see if there is a
# single form equivalent
my $single_form;
- if ($table_property != $perl) {
+ if ($table_property != $perl && $table_property != $block) {
# Special case the binary N tables, so that will print
# \P{single}, but use the Y table values to populate
Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
}
- # We modifiy the info column of the one being output to
+ # We modify the info column of the one being output to
# indicate the ambiguity. Set $which to point to that one's
# info.
my $which;
return -1
}
- # Determine if the two operands are numeric property values or not.
- # A numeric property will look like xyz: 3. But the number
- # can begin with an optional minus sign, and may have a
- # fraction or rational component, like xyz: 3/2. If either
- # isn't numeric, use alphabetic sort.
- my ($a_initial, $a_number) =
- ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
- return $a cmp $b unless defined $a_number;
- my ($b_initial, $b_number) =
- ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
- return $a cmp $b unless defined $b_number;
-
- # Here they are both numeric, but use alphabetic sort if the
- # initial parts don't match
- return $a cmp $b if $a_initial ne $b_initial;
+ # Determine if the two operands are compound or not, and if so if are
+ # "numeric" property values or not, like \p{Age: 3.0}. But there are also
+ # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
+ # all of which this considers numeric, and for sorting, looks just at the
+ # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}.
+ my $split_re = qr/
+ ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
+ # property name
+ [:=] \s* # The syntax for the compound form
+ (?: # followed by ...
+ ( # $2 gets defined if what follows is a "numeric"
+ # expression, which is ...
+ ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational
+ # number, optionally signed
+ | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either
+ # of these go into $3
+ | ( V \d+ _ \d+ ) # or a Unicode's Age property version
+ # number, into $4
+ )
+ | .* $ # If not "numeric", accept anything so that $1 gets
+ # defined if it is any compound form
+ ) /ix;
+ my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
+ my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
+
+ # Sort alphabeticlly on the whole property name if either operand isn't
+ # compound, or they differ.
+ return $a cmp $b if ! defined $a_initial
+ || ! defined $b_initial
+ || $a_initial ne $b_initial;
+
+ if (! defined $a_numeric) {
+
+ # If neither is numeric, use alpha sort
+ return $a cmp $b if ! defined $b_numeric;
+ return 1; # Sort numeric ahead of alpha
+ }
+
+ # Here $a is numeric
+ return -1 if ! defined $b_numeric; # Numeric sorts before alpha
+
+ # Here they are both numeric in the same property.
+ # Convert version numbers into regular numbers
+ if (defined $a_version) {
+ ($a_number = $a_version) =~ s/^V//i;
+ $a_number =~ s/_/./;
+ }
+ else { # Otherwise get rid of the, e.g., CCC in CCC9 */
+ $a_number =~ s/ ^ [[:alpha:]]+ //x;
+ }
+ if (defined $b_version) {
+ ($b_number = $b_version) =~ s/^V//i;
+ $b_number =~ s/_/./;
+ }
+ else {
+ $b_number =~ s/ ^ [[:alpha:]]+ //x;
+ }
# Convert rationals to floating for the comparison.
$a_number = eval $a_number if $a_number =~ qr{/};
$b_number = eval $b_number if $b_number =~ qr{/};
- return $a_number <=> $b_number;
+ return $a_number <=> $b_number || $a cmp $b;
}
sub make_pod () {
'\p{Block: *}'
. (($has_In_conflicts)
? " $exception_message"
- : ""));
+ : ""),
+ $DISCOURAGED);
@block_warning = << "END";
-Matches in the Block property have shortcuts that begin with "In_". For
-example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
-backward compatibility, if there is no conflict with another shortcut, these
-may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
-are numerous such conflicting shortcuts. Use of these forms for Block is
-discouraged, and are flagged as such, not only because of the potential
-confusion as to what is meant, but also because a later release of Unicode may
-preempt the shortcut, and your program would no longer be correct. Use the
-"In_" form instead to avoid this, or even more clearly, use the compound form,
-e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
-about this.
+In particular, matches in the Block property have single forms
+defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
+all, Like all B<DISCOURAGED> forms, these are not stable. For example,
+C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
+C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may
+come along that would force Perl to change the meaning of one or more of
+these, and your program would no longer be correct. Currently there are no
+such conflicts with the form that begins C<"In_">, but there are many with the
+other two shortcuts, and Unicode continues to define new properties that begin
+with C<"In">, so it's quite possible that a conflict will occur in the future.
+The compound form is guaranteed to not become obsolete, and its meaning is
+clearer anyway. See L<perlunicode/"Blocks"> for more information about this.
END
}
my $text = $Is_flags_text;
push @bad_re_properties, "\n=back\n";
}
- # Similiarly, generate a list of files that we don't use, grouped by the
+ # Similarly, generate a list of files that we don't use, grouped by the
# reasons why (Don't output if the reason is empty). First, create a hash
# whose keys are the reasons, and whose values are anonymous arrays of all
# the files that share that reason.
obsolete. Generally this designation is given to properties that Unicode once
used for internal purposes (but not any longer).
-=back
+=item Discouraged
+
+This is not actually a Unicode-specified obsolescence, but applies to certain
+Perl extensions that are present for backwards compatibility, but are
+discouraged from being used. These are not obsolete, but their meanings are
+not stable. Future Unicode versions could force any of these extensions to be
+removed without warning, replaced by another property with the same name that
+means something different. $A_bold_discouraged flags each such entry in the
+table. Use the equivalent shown instead.
-Some Perl extensions are present for backwards compatibility and are
-discouraged from being used, but are not obsolete. $A_bold_discouraged
-flags each such entry in the table. Future Unicode versions may force
-some of these extensions to be removed without warning, replaced by another
-property with the same name that means something different. Use the
-equivalent shown instead.
+@block_warning
=back
-@block_warning
+=back
The table below has two columns. The left column contains the C<\\p{}>
constructs to look up, possibly preceded by the flags mentioned above; and
=over 4
-=item *
+=item Z<>B<*> is a wild-card
-B<*> is a wild-card
-
-=item *
-
-B<(\\d+)> in the info column gives the number of Unicode code points matched
+=item B<(\\d+)> in the info column gives the number of Unicode code points matched
by this property.
-=item *
-
-B<$DEPRECATED> means this is deprecated.
-
-=item *
-
-B<$OBSOLETE> means this is obsolete.
-
-=item *
+=item B<$DEPRECATED> means this is deprecated.
-B<$STABILIZED> means this is stabilized.
+=item B<$OBSOLETE> means this is obsolete.
-=item *
+=item B<$STABILIZED> means this is stabilized.
-B<$STRICTER> means tighter (stricter) name matching applies.
+=item B<$STRICTER> means tighter (stricter) name matching applies.
-=item *
-
-B<$DISCOURAGED> means use of this form is discouraged, and may not be
+=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
stable.
=back
foreach my $prop_alias ($property->aliases) {
my $prop_alias_name = standardize($prop_alias->name);
- # If no =value, there's just one combination possibe for this
+ # If no =value, there's just one combination possible for this
if (! $value_name) {
# The property may be suppressed, but there may be a proxy
# the children.
make_re_pod_entries($table) if defined $pod_directory;
- # See if the the table matches identical code points with
+ # See if the table matches identical code points with
# something that has already been output. In that case,
# no need to have two files with the same code points in
# them. We use the table's hash() method to store these
# the table. That is, all the property-values given
# by this table. By agreement with Unicode::UCD,
# if the name and full name are identical, and there
- # are no other names, drop the duplcate entry to save
+ # are no other names, drop the duplicate entry to save
# memory.
if (@values_list == 2
&& $values_list[0] eq $values_list[1])
sub generate_tests($$$$$) {
# This used only for making the test script. It generates test cases that
- # are expected to compile successfully in perl. Note that the lhs and
- # rhs are assumed to already be as randomized as the caller wants.
+ # are expected to compile successfully in perl. Note that the LHS and
+ # RHS are assumed to already be as randomized as the caller wants.
my $lhs = shift; # The property: what's to the left of the colon
# or equals separator
my $rhs = shift; # The property value; what's to the right
my $valid_code = shift; # A code point that's known to be in the
- # table given by lhs=rhs; undef if table is
+ # table given by LHS=RHS; undef if table is
# empty
my $invalid_code = shift; # A code point known to not be in the table;
# undef if the table is all code points
# colon or equals separator
my $rhs = shift; # The property value; what's to the right
my $already_in_error = shift; # Boolean; if true it's known that the
- # unmodified lhs and rhs will cause an error.
+ # unmodified LHS and RHS will cause an error.
# This routine should not force another one
# Get the colon or equal
my $separator = generate_separator($lhs);
# Since this is an error only, don't bother to randomly decide whether to
- # put the error on the left or right side; and assume that the rhs is
+ # put the error on the left or right side; and assume that the RHS is
# loosely matched, again for convenience rather than rigor.
$rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
# or multiple lines. main::write doesn't count the lines.
my @output;
+ push @output, <<'EOF_CODE';
+Error('\p{Script=InGreek}'); # Bug #69018
+Test_GCB("1100 $nobreak 1161"); # Bug #70940
+Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
+Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
+Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
+
+# Make sure this gets tested; it was not part of the official test suite at
+# the time this was added. Note that this is as it would appear in the
+# official suite, and gets modified to check for the perl tailoring by
+# Test_WB()
+Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
+Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
+EOF_CODE
+
# Sort these so get results in same order on different runs of this
# program
- foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
- foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
+ foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
+ or
+ lc $a->name cmp lc $b->name
+ } property_ref('*'))
+ {
+ foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
+ or
+ lc $a->name cmp lc $b->name
+ } $property->tables)
+ {
# Find code points that match, and don't match this table.
my $valid = $table->get_valid_code_point;
# conflating is possible. In our example, we
# don't want 2/3 matching 7/10, if there is
# a 7/10 code point.
+
+ # First, integers are not in the rationals
+ # table. Don't generate an error if this
+ # rounds to an integer using the given
+ # precision.
+ my $round = sprintf "%.0f", $table_name;
+ next PLACE if abs($table_name - $round)
+ < $MAX_FLOATING_SLOP;
+
+ # Here, isn't close enough to an integer to be
+ # confusable with one. Now, see it it's
+ # "close" to a known rational
for my $existing
(keys %nv_floating_to_rational)
{
}
}
}
- }
- }
+ $table->DESTROY();
+ }
+ $property->DESTROY();
+ }
+
+ # Make any test of the boundary (break) properties TODO if the code
+ # doesn't match the version being compiled
+ my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
+ ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
+ : "\nsub TODO_FAILING_BREAKS { 0 }\n";
+
+ @output= map {
+ map s/^/ /mgr,
+ map "$_;\n",
+ split /;\n/, $_
+ } @output;
+
+ # Cause there to be 'if' statements to only execute a portion of this
+ # long-running test each time, so that we can have a bunch of .t's running
+ # in parallel
+ my $chunks = 10 # Number of test files
+ - 1 # For GCB & SB
+ - 1 # For WB
+ - 4; # LB split into this many files
+ my @output_chunked;
+ my $chunk_count=0;
+ my $chunk_size= int(@output / $chunks) + 1;
+ while (@output) {
+ $chunk_count++;
+ my @chunk= splice @output, 0, $chunk_size;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ @chunk,
+ "}\n";
+ }
+
+ $chunk_count++;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ (map {" Test_GCB('$_');\n"} @backslash_X_tests),
+ (map {" Test_SB('$_');\n"} @SB_tests),
+ "}\n";
+
+
+ $chunk_size= int(@LB_tests / 4) + 1;
+ @LB_tests = map {" Test_LB('$_');\n"} @LB_tests;
+ while (@LB_tests) {
+ $chunk_count++;
+ my @chunk= splice @LB_tests, 0, $chunk_size;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ @chunk,
+ "}\n";
+ }
+
+ $chunk_count++;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ (map {" Test_WB('$_');\n"} @WB_tests),
+ "}\n";
&write($t_path,
0, # Not utf8;
[$HEADER,
+ $TODO_FAILING_BREAKS,
<DATA>,
- @output,
- (map {"Test_GCB('$_');\n"} @backslash_X_tests),
- (map {"Test_SB('$_');\n"} @SB_tests),
- (map {"Test_WB('$_');\n"} @WB_tests),
- "Finished();\n"
+ @output_chunked,
+ "Finished();\n",
]);
return;
# This is a list of the input files and how to handle them. The files are
# processed in their order in this list. Some reordering is possible if
# desired, but the PropertyAliases and PropValueAliases files should be first,
-# and the extracted before the others except DAge.txt (as data in an extracted
-# file can be over-ridden by the non-extracted. Some other files depend on
-# data derived from an earlier file, like UnicodeData requires data from Jamo,
-# and the case changing and folding requires data from Unicode. Mostly, it is
-# safest to order by first version releases in (except the Jamo). DAge.txt is
-# read before the extracted ones because of the rarely used feature
-# $compare_versions. In the unlikely event that there were ever an extracted
-# file that contained the Age property information, it would have to go in
-# front of DAge.
+# and the extracted before the others (as data in an extracted file can be
+# over-ridden by the non-extracted. Some other files depend on data derived
+# from an earlier file, like UnicodeData requires data from Jamo, and the case
+# changing and folding requires data from Unicode. Mostly, it is safest to
+# order by first version releases in (except the Jamo).
#
# The version strings allow the program to know whether to expect a file or
# not, but if a file exists in the directory, it will be processed, even if it
Has_Missings_Defaults => $NOT_IGNORED,
Required_Even_in_Debug_Skip => 1,
),
- Input_file->new('DAge.txt', v3.2.0,
- Has_Missings_Defaults => $NOT_IGNORED,
- Property => 'Age'
- ),
Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
Property => 'General_Category',
),
Property => 'Joining_Type',
Has_Missings_Defaults => $NOT_IGNORED,
),
+ Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
+ Skip => 'This file adds no new information not already'
+ . ' present in other files',
+ # And it's unnecessary programmer work to handle this new
+ # format. Previous Derived files actually had bug fixes
+ # in them that were useful, but that should not be the
+ # case here.
+ ),
Input_file->new('Jamo.txt', v2.0.0,
Property => 'Jamo_Short_Name',
Each_Line_Handler => \&filter_jamo_line,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Line_Break',
# Early versions had problematic syntax
- Each_Line_Handler => (($v_version lt v3.1.0)
- ? \&filter_early_ea_lb
- : undef),
+ Each_Line_Handler => ($v_version ge v3.1.0)
+ ? undef
+ : ($v_version lt v3.0.0)
+ ? \&filter_substitute_lb
+ : \&filter_early_ea_lb,
+ # Must use long names for property values see comments at
+ # sub filter_substitute_lb
+ Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
+ 'Alphabetic', # default to this because XX ->
+ # AL
+
+ # Don't use _Perl_LB as a synonym for
+ # Line_Break in later perls, as it is tailored
+ # and isn't the same as Line_Break
+ 'ONLY_EARLY' ],
),
Input_file->new('EastAsianWidth.txt', v3.0.0,
Property => 'East_Asian_Width',
Withdrawn => v5.1,
Skip => $Documentation,
),
+ Input_file->new('DAge.txt', v3.2.0,
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Property => 'Age'
+ ),
Input_file->new('HangulSyllableType.txt', v4.0,
Has_Missings_Defaults => $NOT_IGNORED,
Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
# for the release it is. To get it to actually mean
# something useful, someone would have to be using an
# earlier Unicode release, and copy it into the directory
- # for that release and recomplile. So far there has been
+ # for that release and recompile. So far there has been
# no demand to do that, so this hasn't been implemented.
Skip => 'Documentation of corrections already '
. 'incorporated into the Unicode data base',
),
Input_file->new('StandardizedVariants.html', v3.2.0,
- Skip => 'Provides a visual display of the standard '
+ Skip => 'Obsoleted as of Unicode 9.0, but previously '
+ . 'provided a visual display of the standard '
. 'variant sequences derived from '
. 'F<StandardizedVariants.txt>.',
# I don't know why the html came earlier than the
Skip => $Documentation,
),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
- Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
+ Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
+
+ # Don't use _Perl_WB as a synonym for
+ # Word_Break in later perls, as it is tailored
+ # and isn't the same as Word_Break
+ 'ONLY_EARLY' ],
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
),
Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
Handler => \&process_GCB_test,
+ retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
Handler => \&process_SB_test,
+ retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
Handler => \&process_WB_test,
+ retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
. 'F<NamedSequences.txt> and recompile perl',
),
Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
- Skip => $Validation,
+ Handler => \&process_LB_test,
+ retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
Skip => $Validation_Documentation,
Property => 'Indic_Positional_Category',
Has_Missings_Defaults => $NOT_IGNORED,
),
+ Input_file->new('TangutSources.txt', v9.0.0,
+ Skip => 'Specifies source mappings for Tangut ideographs'
+ . ' and components. This data file also includes'
+ . ' informative radical-stroke values that are used'
+ . ' internally by Unicode',
+ ),
+ Input_file->new('VerticalOrientation.txt', v10.0.0,
+ Property => 'Vertical_Orientation',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
+ Input_file->new('NushuSources.txt', v10.0.0,
+ Skip => 'Specifies source material for Nushu characters',
+ ),
);
# End of all the preliminaries.
}
}
if (! $rebuild) {
- print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
+ print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
exit(0);
}
-print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
+print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
# Ready to do the major processing. First create the perl pseudo-property.
$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
}
print "\nAll done\n" if $verbosity >= $VERBOSE;
}
+
+if ($version_of_mk_invlist_bounds lt $v_version) {
+ Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
+ . " to be checked and possibly updated to Unicode"
+ . " $string_version");
+}
+
exit(0);
# TRAILING CODE IS USED BY make_property_test_script()
$Tests++;
# A string eval is needed because of the 'no warnings'.
- # Assumes no parens in the regular expression
+ # Assumes no parentheses in the regular expression
my $result = eval "$no_warnings
my \$RegObj = qr($regex);
$string =~ \$RegObj ? 1 : 0";
my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
utf8::upgrade($nobreak_utf8);
-use Config;
+my $are_ctype_locales_available;
my $utf8_locale;
chdir 't' if -d 't';
eval { require "./loc_tools.pl" };
-$utf8_locale = &find_utf8_ctype_locale if defined &find_utf8_ctype_locale;
+if (defined &locales_enabled) {
+ $are_ctype_locales_available = locales_enabled('LC_CTYPE');
+ if ($are_ctype_locales_available) {
+ $utf8_locale = &find_utf8_ctype_locale;
+ }
+}
+
+# Eval'd so can run on versions earlier than the property is available in
+my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
+if (! defined $WB_Extend_or_Format_re) {
+ $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
+}
sub _test_break($$) {
- # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
- # Each such line is a sequence of code points given by their hex numbers,
- # separated by the two characters defined just before this subroutine that
- # indicate that either there can or cannot be a break between the adjacent
- # code points. If there isn't a break, that means the sequence forms an
- # extended grapheme cluster, which means that \X should match the whole
- # thing. If there is a break, \X should stop there. This is all
- # converted by this routine into a match:
- # $string =~ /(\X)/,
- # Each \X should match the next cluster; and that is what is checked.
+ # Test various break property matches. The 2nd parameter gives the
+ # property name. The input is a line from auxiliary/*Test.txt for the
+ # given property. Each such line is a sequence of Unicode (not native)
+ # code points given by their hex numbers, separated by the two characters
+ # defined just before this subroutine that indicate that either there can
+ # or cannot be a break between the adjacent code points. All these are
+ # tested.
+ #
+ # For the gcb property extra tests are made. if there isn't a break, that
+ # means the sequence forms an extended grapheme cluster, which means that
+ # \X should match the whole thing. If there is a break, \X should stop
+ # there. This is all converted by this routine into a match: $string =~
+ # /(\X)/, Each \X should match the next cluster; and that is what is
+ # checked.
my $template = shift;
my $break_type = shift;
my $line = (caller 1)[2]; # Line number
+ my $comment = "";
+
+ if ($template =~ / ( .*? ) \s* \# (.*) /x) {
+ $template = $1;
+ $comment = $2;
+
+ # Replace leading spaces with a single one.
+ $comment =~ s/ ^ \s* / # /x;
+ }
# The line contains characters above the ASCII range, but in Latin1. It
# may or may not be in utf8, and if it is, it may or may not know it. So,
$template =~ s/$breakable_utf8/$breakable/g;
}
+ # Perl customizes wb. So change the official tests accordingly
+ if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
+
+ # Split into elements that alternate between code point and
+ # break/no-break
+ my @line = split / +/, $template;
+
+ # Look at each code point and its following one
+ for (my $i = 1; $i < @line - 1 - 1; $i+=2) {
+
+ # The customization only involves changing some breaks to
+ # non-breaks.
+ next if $line[$i+1] =~ /$nobreak/;
+
+ my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
+ my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
+
+ # And it only affects adjacent space characters.
+ next if $lhs !~ /\s/u;
+
+ # But, we want to make sure to test spaces followed by a Extend
+ # or Format.
+ next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
+
+ # To test the customization, add some white-space before this to
+ # create a span. The $lhs white space may or may not be bound to
+ # that span, and also with the $rhs. If the $rhs is a binding
+ # character, the $lhs is bound to it and not to the span, unless
+ # $lhs is vertical space. In all other cases, the $lhs is bound
+ # to the span. If the $rhs is white space, it is bound to the
+ # $lhs
+ my $bound;
+ my $span;
+ if ($rhs =~ /$WB_Extend_or_Format_re/) {
+ if ($lhs =~ /\v/) {
+ $bound = $breakable;
+ $span = $nobreak;
+ }
+ else {
+ $bound = $nobreak;
+ $span = $breakable;
+ }
+ }
+ else {
+ $span = $nobreak;
+ $bound = $nobreak;
+ }
+
+ splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
+ $i += 4;
+ $line[$i+1] = $bound;
+ }
+ $template = join " ", @line;
+ }
+
# The input is just the break/no-break symbols and sequences of Unicode
# code points as hex digits separated by spaces for legibility. e.g.:
# ÷ 0020 × 0308 ÷ 0020 ÷
$display_upgrade = " (utf8-upgraded)";
}
- # The /l modifier has C after it to indicate the locale to try
- my @modifiers = qw(a aa d lC u i);
- push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+ my @modifiers = qw(a aa d u i);
+ if ($are_ctype_locales_available) {
+ push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+ # The /l modifier has C after it to indicate the locale to try
+ push @modifiers, "lC";
+ }
# Test for each of the regex modifiers.
for my $modifier (@modifiers) {
if ($modifier =~ / ^ l (.*) /x) {
my $locale = $1;
$display_locale = "(locale = $locale)";
- use Config;
- if (defined $Config{d_setlocale}) {
- eval { require POSIX; import POSIX 'locale_h'; };
- if (defined &POSIX::LC_CTYPE) {
- POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
- }
- }
+ POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
$modifier = 'l';
}
my $pattern = "(?$modifier:$break_pattern)";
# Actually do the test
+ my $matched_text;
my $matched = $string =~ qr/$pattern/;
- print "not " unless $matched;
+ if ($matched) {
+ $matched_text = "matched";
+ }
+ else {
+ $matched_text = "failed to match";
+ print "not ";
+
+ if (TODO_FAILING_BREAKS) {
+ $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
+ $comment =~ s/#/# TODO/;
+ }
+ }
+ print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
- # Fancy display of test results
- $matched = ($matched) ? "matched" : "failed to match";
- print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+ # Only print the comment on the first use of this line
+ $comment = "";
# Repeat with the first \B{} in the pattern. This makes sure the
# code in regexec.c:find_byclass() for \B gets executed
my $B_pattern = "$1$2";
$matched = $string =~ qr/$B_pattern/;
print "not " unless $matched;
- print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+ $matched_text = ($matched) ? "matched" : "failed to match";
+ print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
+ print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
+ print "\n";
}
}
}
print " correctly matched $should_display[$i]; line $line\n";
} else {
- $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
+ $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
split "", $matches[$i]);
- print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
+ print "not ok $Tests -";
+ print " # TODO" if TODO_FAILING_BREAKS;
+ print " In \"$display_string\" =~ /(\\X)/g, \\X #",
$i + 1,
" should have matched $should_display[$i]",
" but instead matched $matches[$i]",
if (@matches == @should_match) {
print "ok $Tests - Nothing was left over; line $line\n";
} else {
- print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
+ print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
+ print " # TODO" if TODO_FAILING_BREAKS;
+ print "\n";
}
}
_test_break(shift, 'gcb');
}
+sub Test_LB($) {
+ _test_break(shift, 'lb');
+}
+
sub Test_SB($) {
_test_break(shift, 'sb');
}
exit($Fails ? -1 : 0);
}
-Error('\p{Script=InGreek}'); # Bug #69018
-Test_GCB("1100 $nobreak 1161"); # Bug #70940
-Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
-Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
-Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726