# expected, a warning will be generated. If an older version is being
# compiled, any bounds tests that fail in the generated test file (-maketest
# option) will be marked as TODO.
-my $version_of_mk_invlist_bounds = v9.0.0;
+my $version_of_mk_invlist_bounds = v10.0.0;
##########################################################################
#
#
# A NOTE ON UNIHAN
#
-# This program can generate tables from the Unihan database. But that db
+# This program can generate tables from the Unihan database. But that DB
# isn't normally available, so it is marked as optional. Prior to version
# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
# was split into 8 different files, all beginning with the letters 'Unihan'.
# handled by Unicode::Normalize, nor will it compile when presented a version
# that has them. However, you can trivially get it to compile by simply
# ignoring those decompositions, by changing the croak to a carp. At the time
-# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
-# cpan/Unicode-Normalize/mkheader) reads
+# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
+# dist/Unicode-Normalize/mkheader) reads
#
# croak("Weird Canonical Decomposition of U+$h");
#
# by the code points introduced in the later version. You probably also want
# to use the -annotate option when using this. Run this program on a unicore
# containing the starting release you want to compare. Save that output
-# structrue. Then, switching to a unicore with the ending release, change the
+# structure. Then, switching to a unicore with the ending release, change the
# 0 in the $string_compare_versions definition just below to a string
# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
# to the starting release. This program will then compile, but throw away all
# common to both releases, and you can see the changes caused just by the
# underlying release semantic changes. For versions earlier than 3.2, you
# must copy a version of DAge.txt into the directory.
-my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
+my $string_compare_versions = DEBUG && ""; # e.g., "2.1";
my $compare_versions = DEBUG
&& $string_compare_versions
&& pack "C*", split /\./, $string_compare_versions;
$verbosity = 0;
}
elsif ($arg eq '-w') {
- $write_unchanged_files = 1; # update the files even if havent changed
+ # update the files even if they haven't changed
+ $write_unchanged_files = 1;
}
elsif ($arg eq '-check') {
my $this = shift @ARGV;
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
-# We work with above-Unicode code points, up to UV_MAX. But when you get
-# that high, above IV_MAX, some operations don't work, and you can easily get
-# overflow. Therefore for internal use, we use a much smaller number,
-# translating it to UV_MAX only for output. The exact number is immaterial
-# (all Unicode code points are treated exactly the same), but the algorithm
-# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
+# We work with above-Unicode code points, up to IV_MAX, but we may want to use
+# sentinels above that number. Therefore for internal use, we use a much
+# smaller number, translating it to IV_MAX only for output. The exact number
+# is immaterial (all Unicode code points are treated exactly the same), but
+# the algorithm requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
-my $MAX_PLATFORM_CODEPOINT = ~0;
+my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
# Matches legal code point. 4-6 hex numbers, If there are 6, the first
# two must be 10; if there are 5, the first must not be a 0. Written this way
qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
my $code_point_re = qr/\b$run_on_code_point_re/;
-# This matches the beginning of the line in the Unicode db files that give the
+# This matches the beginning of the line in the Unicode DB files that give the
# defaults for code points not listed (i.e., missing) in the file. The code
# depends on this ending with a semi-colon, so it can assume it is a valid
# field when the line is split() by semi-colons
$viacode[$i] = $perl_charname->value_of($i) || "";
$age[$i] = (defined $age)
- ? (($age->value_of($i) =~ / ^ \d \. \d $ /x)
+ ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
? $age->value_of($i)
: "")
: "";
# 1) call before the first line is read, for pre processing
# 2) call to adjust each line of the input before the main handler gets
# them. This can be automatically generated, if appropriately simple
-# enough, by specifiying a Properties parameter in the constructor.
+# enough, by specifying a Properties parameter in the constructor.
# 3) call upon EOF before the main handler exits its loop
# 4) call at the end, for post processing
#
# each_line_handler()s. So, if the format of the line is not in the desired
# format for the main handler, these are used to do that adjusting. They can
# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
-# so the $_ output of one is used as the input to the next. The eof handler
+# so the $_ output of one is used as the input to the next. The EOF handler
# is also stackable, but none of the others are, but could easily be changed
# to be so.
#
# not otherwise be processed, and to not raise a warning about not being
# handled. In the constructor call, any value that evaluates to a numeric
# 0 or undef means don't skip. Any other value is a string giving the
- # reason it is being skippped, and this will appear in generated pod.
+ # reason it is being skipped, and this will appear in generated pod.
# However, an empty string reason will suppress the pod entry.
# Internally, calls that evaluate to numeric 0 are changed into undef to
# distinguish them from an empty string call.
# meaningful line of the input file. If present, an appropriate
# each_line_handler() is automatically generated and pushed onto the stack
# of such handlers. This is useful when a file contains multiple
- # proerties per line, but no other special considerations are necessary.
+ # properties per line, but no other special considerations are necessary.
# The special value "<ignored>" means to discard the corresponding input
# field.
# Any @missing lines in the file should also match this syntax; no such
return $self->_add_delete('+', @_);
}
+ sub replace_map {
+ # Replace a range
+
+ my $self = shift;
+
+ return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
+ }
+
sub add_duplicate {
# Adds entry to a range list which can duplicate an existing entry
# Accessors for the underlying list that should fail if locked.
for my $sub (qw(
add_duplicate
+ replace_map
))
{
no strict "refs";
range_count
ranges
range_size_1
+ replace_map
reset_each_range
set_comment
set_default_map
END
));
+ # Make sure this assumption in perl core code is valid in this Unicode
+ # release, with known exceptions
+ foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
+ next if $range->end - $range->start == 9;
+ next if $range->start == 0x1D7CE; # This whole range was added in 3.1
+ next if $range->end == 0x19DA && $v_version eq v5.2.0;
+ next if $range->end - $range->start < 9 && $v_version le 4.0.0;
+ Carp::my_carp("Range $range unexpectedly doesn't contain 10"
+ . " decimal digits. Code in regcomp.c assumes it does,"
+ . " and will have to be fixed. Proceeding anyway.");
+ }
+
Property->new('Legacy_Case_Folding',
File => "Fold",
Directory => $map_directory,
# since the first compare version.
my $delta = Range_List->new();
foreach my $table ($age->tables) {
+ use version;
next if $table == $age->table('Unassigned');
- next if $table->name le $string_compare_versions;
+ next if version->parse($table->name)
+ le version->parse($string_compare_versions);
$delta += $table;
}
if ($delta->is_empty) {
next if $this_block == $no_block
|| ! ($this_block & $Assigned)->is_empty;
$this_block->set_fate($SUPPRESSED, $after_first_version);
+ foreach my $range ($this_block->ranges) {
+ $block->replace_map($range->start, $range->end, 'No_Block')
+ }
$no_block += $this_block;
}
# the single-form, \p{name}. These are:
# All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
# \p{Is_Name} as synonyms
- # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
+ # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
# \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
# \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
# conflict, \p{Value} and \p{Is_Value} as well
property_ref('*');
push @tables, $gc->tables;
- # If the version of Unicode includes the Script property, add its tables
- push @tables, $script->tables if defined $script;
+ # If the version of Unicode includes the Script Extensions (preferably),
+ # or Script property, add its tables
+ my $scx = property_ref("Script_Extensions");
+ if (defined $scx) {
+ push @tables, $scx->tables;
+ }
+ else {
+ push @tables, $script->tables if defined $script;
+ }
# The Block tables are kept separate because they are treated differently.
# And the earliest versions of Unicode didn't include them, so add only if
if (! defined $pre_existing) {
- # No name collision, so ok to add the perl synonym.
+ # No name collision, so OK to add the perl synonym.
my $make_re_pod_entry;
my $ok_as_filename;
next;
}
- # Here, there is a name collision, but it still could be ok if
+ # Here, there is a name collision, but it still could be OK if
# the tables match the identical set of code points, in which
# case, we can combine the names. Compare each table's code
# point list to see if they are identical.
Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
}
- # We modifiy the info column of the one being output to
+ # We modify the info column of the one being output to
# indicate the ambiguity. Set $which to point to that one's
# info.
my $which;
push @bad_re_properties, "\n=back\n";
}
- # Similiarly, generate a list of files that we don't use, grouped by the
+ # Similarly, generate a list of files that we don't use, grouped by the
# reasons why (Don't output if the reason is empty). First, create a hash
# whose keys are the reasons, and whose values are anonymous arrays of all
# the files that share that reason.
=over 4
-=item *
-
-B<*> is a wild-card
-
-=item *
+=item Z<>B<*> is a wild-card
-B<(\\d+)> in the info column gives the number of Unicode code points matched
+=item B<(\\d+)> in the info column gives the number of Unicode code points matched
by this property.
-=item *
-
-B<$DEPRECATED> means this is deprecated.
-
-=item *
-
-B<$OBSOLETE> means this is obsolete.
+=item B<$DEPRECATED> means this is deprecated.
-=item *
+=item B<$OBSOLETE> means this is obsolete.
-B<$STABILIZED> means this is stabilized.
+=item B<$STABILIZED> means this is stabilized.
-=item *
+=item B<$STRICTER> means tighter (stricter) name matching applies.
-B<$STRICTER> means tighter (stricter) name matching applies.
-
-=item *
-
-B<$DISCOURAGED> means use of this form is discouraged, and may not be
+=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
stable.
=back
foreach my $prop_alias ($property->aliases) {
my $prop_alias_name = standardize($prop_alias->name);
- # If no =value, there's just one combination possibe for this
+ # If no =value, there's just one combination possible for this
if (! $value_name) {
# The property may be suppressed, but there may be a proxy
# the children.
make_re_pod_entries($table) if defined $pod_directory;
- # See if the the table matches identical code points with
+ # See if the table matches identical code points with
# something that has already been output. In that case,
# no need to have two files with the same code points in
# them. We use the table's hash() method to store these
# the table. That is, all the property-values given
# by this table. By agreement with Unicode::UCD,
# if the name and full name are identical, and there
- # are no other names, drop the duplcate entry to save
+ # are no other names, drop the duplicate entry to save
# memory.
if (@values_list == 2
&& $values_list[0] eq $values_list[1])
sub generate_tests($$$$$) {
# This used only for making the test script. It generates test cases that
- # are expected to compile successfully in perl. Note that the lhs and
- # rhs are assumed to already be as randomized as the caller wants.
+ # are expected to compile successfully in perl. Note that the LHS and
+ # RHS are assumed to already be as randomized as the caller wants.
my $lhs = shift; # The property: what's to the left of the colon
# or equals separator
my $rhs = shift; # The property value; what's to the right
my $valid_code = shift; # A code point that's known to be in the
- # table given by lhs=rhs; undef if table is
+ # table given by LHS=RHS; undef if table is
# empty
my $invalid_code = shift; # A code point known to not be in the table;
# undef if the table is all code points
# colon or equals separator
my $rhs = shift; # The property value; what's to the right
my $already_in_error = shift; # Boolean; if true it's known that the
- # unmodified lhs and rhs will cause an error.
+ # unmodified LHS and RHS will cause an error.
# This routine should not force another one
# Get the colon or equal
my $separator = generate_separator($lhs);
# Since this is an error only, don't bother to randomly decide whether to
- # put the error on the left or right side; and assume that the rhs is
+ # put the error on the left or right side; and assume that the RHS is
# loosely matched, again for convenience rather than rigor.
$rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
# or multiple lines. main::write doesn't count the lines.
my @output;
+ push @output, <<'EOF_CODE';
+Error('\p{Script=InGreek}'); # Bug #69018
+Test_GCB("1100 $nobreak 1161"); # Bug #70940
+Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
+Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
+Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
+
+# Make sure this gets tested; it was not part of the official test suite at
+# the time this was added. Note that this is as it would appear in the
+# official suite, and gets modified to check for the perl tailoring by
+# Test_WB()
+Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
+Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
+EOF_CODE
+
# Sort these so get results in same order on different runs of this
# program
foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
? "\nsub TODO_FAILING_BREAKS { 1 }\n"
: "\nsub TODO_FAILING_BREAKS { 0 }\n";
+ @output= map {
+ map s/^/ /mgr,
+ map "$_;\n",
+ split /;\n/, $_
+ } @output;
+
+ # Cause there to be 'if' statements to only execute a portion of this
+ # long-running test each time, so that we can have a bunch of .t's running
+ # in parallel
+ my $chunks = 10 # Number of test files
+ - 1 # For GCB & SB
+ - 1 # For WB
+ - 4; # LB split into this many files
+ my @output_chunked;
+ my $chunk_count=0;
+ my $chunk_size= int(@output / $chunks) + 1;
+ while (@output) {
+ $chunk_count++;
+ my @chunk= splice @output, 0, $chunk_size;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ @chunk,
+ "}\n";
+ }
+
+ $chunk_count++;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ (map {" Test_GCB('$_');\n"} @backslash_X_tests),
+ (map {" Test_SB('$_');\n"} @SB_tests),
+ "}\n";
+
+
+ $chunk_size= int(@LB_tests / 4) + 1;
+ @LB_tests = map {" Test_LB('$_');\n"} @LB_tests;
+ while (@LB_tests) {
+ $chunk_count++;
+ my @chunk= splice @LB_tests, 0, $chunk_size;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ @chunk,
+ "}\n";
+ }
+
+ $chunk_count++;
+ push @output_chunked,
+ "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+ (map {" Test_WB('$_');\n"} @WB_tests),
+ "}\n";
+
&write($t_path,
0, # Not utf8;
[$HEADER,
$TODO_FAILING_BREAKS,
<DATA>,
- @output,
- (map {"Test_GCB('$_');\n"} @backslash_X_tests),
- (map {"Test_LB('$_');\n"} @LB_tests),
- (map {"Test_SB('$_');\n"} @SB_tests),
- (map {"Test_WB('$_');\n"} @WB_tests),
- "Finished();\n"
+ @output_chunked,
+ "Finished();\n",
]);
return;
Property => 'Joining_Type',
Has_Missings_Defaults => $NOT_IGNORED,
),
+ Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
+ Skip => 'This file adds no new information not already'
+ . ' present in other files',
+ # And it's unnecessary programmer work to handle this new
+ # format. Previous Derived files actually had bug fixes
+ # in them that were useful, but that should not be the
+ # case here.
+ ),
Input_file->new('Jamo.txt', v2.0.0,
Property => 'Jamo_Short_Name',
Each_Line_Handler => \&filter_jamo_line,
# for the release it is. To get it to actually mean
# something useful, someone would have to be using an
# earlier Unicode release, and copy it into the directory
- # for that release and recomplile. So far there has been
+ # for that release and recompile. So far there has been
# no demand to do that, so this hasn't been implemented.
Skip => 'Documentation of corrections already '
. 'incorporated into the Unicode data base',
. ' informative radical-stroke values that are used'
. ' internally by Unicode',
),
+ Input_file->new('VerticalOrientation.txt', v10.0.0,
+ Property => 'Vertical_Orientation',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
+ Input_file->new('NushuSources.txt', v10.0.0,
+ Skip => 'Specifies source material for Nushu characters',
+ ),
);
# End of all the preliminaries.
$Tests++;
# A string eval is needed because of the 'no warnings'.
- # Assumes no parens in the regular expression
+ # Assumes no parentheses in the regular expression
my $result = eval "$no_warnings
my \$RegObj = qr($regex);
$string =~ \$RegObj ? 1 : 0";
exit($Fails ? -1 : 0);
}
-Error('\p{Script=InGreek}'); # Bug #69018
-Test_GCB("1100 $nobreak 1161"); # Bug #70940
-Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
-Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
-Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
-
-# Make sure this gets tested; it was not part of the official test suite at
-# the time this was addded. Note that this is as it would appear in the
-# official suite, and gets modified to check for the perl tailoring by
-# Test_WB()
-Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
-Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");