# expected, a warning will be generated. If an older version is being
# compiled, any bounds tests that fail in the generated test file (-maketest
# option) will be marked as TODO.
-my $version_of_mk_invlist_bounds = v10.0.0;
+my $version_of_mk_invlist_bounds = v12.1.0;
##########################################################################
#
# to use the -annotate option when using this. Run this program on a unicore
# containing the starting release you want to compare. Save that output
# structure. Then, switching to a unicore with the ending release, change the
-# 0 in the $string_compare_versions definition just below to a string
+# "" in the $string_compare_versions definition just below to a string
# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
# to the starting release. This program will then compile, but throw away all
# code points introduced after the starting release. Finally use a diff tool
# common to both releases, and you can see the changes caused just by the
# underlying release semantic changes. For versions earlier than 3.2, you
# must copy a version of DAge.txt into the directory.
-my $string_compare_versions = DEBUG && ""; # e.g., "2.1";
+my $string_compare_versions = DEBUG && "";
my $compare_versions = DEBUG
&& $string_compare_versions
&& pack "C*", split /\./, $string_compare_versions;
'Canonical_Combining_Class=Attached_Below_Left'
}
+# These Grapheme_Cluster_Break and Word_Break values are obsolete as of Unicode 11.0
+if ($v_version ge v11.0.0) {
+ push @tables_that_may_be_empty, qw(
+ Grapheme_Cluster_Break=E_Base
+ Grapheme_Cluster_Break=E_Base_GAZ
+ Grapheme_Cluster_Break=E_Modifier
+ Grapheme_Cluster_Break=Glue_After_Zwj
+ Word_Break=E_Base
+ Word_Break=E_Base_GAZ
+ Word_Break=E_Modifier
+ Word_Break=Glue_After_Zwj);
+}
+
# Enum values for to_output_map() method in the Map_Table package. (0 is don't
# output)
my $EXTERNAL_MAP = 1;
);
}
-
# Add any explicit cjk values
$file->insert_lines(@cjk_property_values);
next;
}
+ # Code points below 0x0100 need to be converted to native
+ $sequence =~ s/\b 00 ( [0-9A-F]{2} ) \b/
+ sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe
+ if NON_ASCII_PLATFORM;
+
# Note single \t in keeping with special output format of
# Perl_charnames. But it turns out that the code points don't have to
# be 5 digits long, like the rest, based on the internal workings of
my $rational = shift;
- my $float = eval $rational;
- $float = sprintf "%.*e", $E_FLOAT_PRECISION, $float;
- $nv_floating_to_rational{$float} = $rational;
+ my $floating = eval $rational;
+
+ my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating;
+
+ # See if the denominator is a power of 2.
+ $rational =~ m!.*/(.*)!;
+ my $denominator = $1;
+ if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) {
+
+ # Here the denominator is a power of 2. This means it has an exact
+ # representation in binary, so rounding could go either way. It turns
+ # out that Windows doesn't necessarily round towards even, so output
+ # an extra entry. This happens when the final digit we output is even
+ # and the next digits would be 50* to the precision of the machine.
+ my $extra_digit_float = sprintf "%e", $floating;
+ my $q = $E_FLOAT_PRECISION - 1;
+ if ($extra_digit_float =~ / ( .* \. \d{$q} )
+ ( [02468] ) 5 0* ( e .*)
+ /ix)
+ {
+ push @floats, $1 . ($2 + 1) . $3;
+ }
+ }
+
+ foreach my $float (@floats) {
+ # Strip off any leading zeros beyond 2 digits to make it C99
+ # compliant. (Windows has 3 digit exponents, contrary to C99)
+ $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
+
+ if ( defined $nv_floating_to_rational{$float}
+ && $nv_floating_to_rational{$float} ne $rational)
+ {
+ die Carp::my_carp_bug("Both '$rational' and"
+ . " '$nv_floating_to_rational{$float}' evaluate to"
+ . " the same floating point number."
+ . " \$E_FLOAT_PRECISION must be increased");
+ }
+ $nv_floating_to_rational{$float} = $rational;
+ }
return;
}
return;
}
+sub setup_emojidata {
+    # Creates the 'XPG' property (full name 'Extended_Pictographic') and
+    # gives it the $PLACEHOLDER fate, recording the reason alongside.
+
+    my $extended_pictographic
+        = Property->new('XPG', Full_Name => 'Extended_Pictographic');
+
+    $extended_pictographic->set_fate(
+        $PLACEHOLDER, "Not part of the Unicode Character Database");
+}
+
+sub filter_emojidata_line {
+    # Operates on $_.  Only the Extended_Pictographic property is wanted
+    # from this non-UCD data file (it gets turned into a Perl property, so
+    # that it isn't accessible to users), so anything that doesn't mention
+    # that property as a whole word is replaced by the empty string.
+
+    if ($_ !~ /\bExtended_Pictographic\b/) {
+        $_ = "";
+    }
+
+    return;
+}
+
sub generate_hst {
# Populates the Hangul Syllable Type property from first principles
# Every character 0-255 is problematic because what each folds to depends
# on the current locale
$loc_problem_folds->add_range(0, 255);
+ $loc_problem_folds->add_range(0x130, 0x131); # These are problematic in
+ # Turkic locales
$loc_problem_folds_start += $loc_problem_folds;
# Also problematic are anything these fold to outside the range. Likely
Description =>
"Code points whose fold is a string of more than one character",
);
+ my $in_multi_fold = $perl->add_match_table(
+ "_Perl_Is_In_Multi_Char_Fold",
+ Description =>
+ "Code points that are in some multiple character fold",
+ );
+ my $non_final_fold = $perl->add_match_table(
+ "_Perl_Non_Final_Folds",
+ Description => "Code points that are in some multiple character fold, but not in the final position",
+ );
if ($v_version lt v3.0.1) {
- push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
+ push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char',
+ '_Perl_Is_In_Multi_Char_Fold',
+ '_Perl_Non_Final_Folds';
}
# Look through all the known folds to populate these tables.
$loc_problem_folds->add_range($start, $end);
$found_locale_problematic = 1;
}
+
+ if (@hex_folds > 1) {
+     $in_multi_fold->add_range($cp, $cp);
+     # Only positions before the last one in the fold are non-final
+     $non_final_fold->add_range($cp, $cp) if $i < @hex_folds - 1;
+ }
}
# If this is a problematic fold, add to the start chars the
}
# Perl tailors the WordBreak property so that \b{wb} doesn't split
- # adjacent spaces into separate words. First create a copy of the regular
- # WB property as '_Perl_WB'. (On Unicode releases earlier than when WB
- # was defined for, this will already have been done by the substitute file
- # portion for 'Input_file' code for WB.)
+ # adjacent spaces into separate words. Unicode 11.0 moved in that
+ # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO
+ # BREAK SPACE as breaking, so we retained the original Perl customization.
+ # To do this, in the Perl copy of WB, simply replace the mappings of
+ # horizontal space characters that otherwise would map to the default or
+ # the 11.0 'WSegSpace' to instead map to our tailoring.
my $perl_wb = property_ref('_Perl_WB');
- if (! defined $perl_wb) {
- $perl_wb = Property->new('_Perl_WB',
- Fate => $INTERNAL_ONLY,
- Perl_Extension => 1,
- Directory => $map_directory,
- Type => $STRING);
- my $wb = property_ref('Word_Break');
- $perl_wb->initialize($wb);
- $perl_wb->set_default_map($wb->default_map);
- }
-
- # And simply replace the mappings of horizontal space characters that
- # otherwise would map to the default to instead map to our tailoring.
my $default = $perl_wb->default_map;
for my $range ($Blank->ranges) {
for my $i ($range->start .. $range->end) {
- next unless $perl_wb->value_of($i) eq $default;
+ my $value = $perl_wb->value_of($i);
+
+ next unless $value eq $default || $value eq 'WSegSpace';
$perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
Replace => $UNCONDITIONALLY);
}
}
+ # Also starting in Unicode 11.0, rules for some of the boundary types are
+ # based on a non-UCD property (which we have read in if it exists).
+ # Recall that these boundary properties partition the code points into
+ # equivalence classes (represented as enums).
+ #
+ # The loop below goes through each code point that matches the non-UCD
+ # property, and for each current equivalence class containing such a code
+ # point, splits it so that those that are in both are now in a newly
+ # created equivalence class whose name is a combination of the property
+ # and the old class name, leaving unchanged everything that doesn't match
+ # the non-UCD property.
+ my $pictographic_emoji = property_ref('XPG');
+ if (defined $pictographic_emoji) {
+ foreach my $base_property (property_ref('GCB'),
+ property_ref('WB'))
+ {
+ my $property = property_ref('_Perl_' . $base_property->name);
+ foreach my $range ($pictographic_emoji->table('Y')->ranges) {
+ foreach my $i ($range->start .. $range->end) {
+ my $current = $property->value_of($i);
+ $current = $property->table($current)->short_name;
+ $property->add_map($i, $i, 'XPG_' . $current,
+ Replace => $UNCONDITIONALLY);
+ }
+ }
+ }
+ }
+
# Create a version of the LineBreak property with the mappings that are
# omitted in the default algorithm remapped to what
# http://www.unicode.org/reports/tr14 says they should be.
my $standard_short = standardize($proposed_short);
# If the short name is shorter than the standard one, or
- # even it it's not, but the combination of it and its
+ # even if it's not, but the combination of it and its
# short property name (as in \p{prop=short} ($perl doesn't
# have this form)) saves at least two characters, then,
# cause it to be listed as a shorter synonym.
though not all are enabled by default. The omitted ones are the Unihan
properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
deprecated or Unicode-internal properties. (An installation may choose to
-recompile Perl's tables to change this. See L<Unicode character
+recompile Perl's tables to change this. See L</Unicode character
properties that are NOT accepted by Perl>.)
For most purposes, access to Unicode properties from the Perl core is through
next unless $alias->ucd;
next unless $alias->ok_as_filename;
push @{$perlprop_to_aliases{standardize($alias->name)}},
- @aliases_list;
+ uniques @aliases_list;
}
}
return @output;
}
+sub generate_wildcard_tests($$$$$) {
+    # This is used only for making the test script.  It generates wildcard
+    # matching test cases that are expected to compile successfully in perl.
+    #
+    # Arguments, in order:
+    #   $lhs            The property: what's to the left of the separator
+    #   $rhs            The property value; what's to the right
+    #   $valid_code     A code point that's known to be in the table given
+    #                   by LHS=RHS; undef if the table is empty
+    #   $invalid_code   A code point known to not be in the table; undef if
+    #                   the table is all code points
+    #   $warning        Text placed verbatim as the final Expect() argument
+    #
+    # Returns the list of generated 'Expect(...)' lines.
+
+    my ($lhs, $rhs, $valid_code, $invalid_code, $warning) = @_;
+
+    # Nothing is generated for an empty property name, nor for the 'Is_'
+    # forms, which are not currently supported
+    return if $lhs eq "" || $lhs =~ / ^ Is_ /x;
+
+    # Generate a standardized pattern, with colon being the delimiter
+    my $wildcard = "$lhs=:\\A$rhs\\z:";
+
+    # Pair each expected outcome with its code point, skipping any code
+    # point that is undefined
+    my @output;
+    foreach my $case ([ 1, $valid_code ], [ 0, $invalid_code ]) {
+        my ($expected, $code_point) = @$case;
+        next unless defined $code_point;
+        push @output,
+             "Expect($expected, $code_point, '\\p{$wildcard}', $warning);";
+    }
+    return @output;
+}
+
sub generate_error($$$) {
# This used only for making the test script. It generates test cases that
# are expected to not only not match, but to be syntax or similar errors
Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
+Error('\p{InKana}'); # 'Kana' is not a block so InKana shouldn't compile
# Make sure this gets tested; it was not part of the official test suite at
# the time this was added. Note that this is as it would appear in the
} property_ref('*'))
{
# Non-binary properties should not match \p{}; Test all for that.
- if ($property->type != $BINARY) {
+ if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
$property->aliases;
foreach my $property_alias ($property->aliases) {
# already guaranteed to be in error
my $already_error = ! $table->file_path;
+ # A table that begins with these could actually be a
+ # user-defined property, so won't be compile time errors, as
+ # the definitions of those can be deferred until runtime
+ next if $already_error && $table_name =~ / ^ I[ns] /x;
+
# Generate error cases for this alias.
push @output, generate_error($property_name,
$table_name,
# quit now without generating success cases.
next if $already_error;
- # Now for the success cases.
+ # Now for the success cases. First, wildcard matching, as it
+ # shouldn't have any randomization.
+ if ($table_alias->status eq $NORMAL) {
+ push @output, generate_wildcard_tests($property_name,
+ $table_name,
+ $valid,
+ $invalid,
+ $warning,
+ );
+ }
my $random;
if ($loose_match) {
$invalid,
$warning,
);
+ if ($table_alias->status eq $NORMAL) {
+ push @output, generate_wildcard_tests(
+ $property_name,
+ $standard,
+ $valid,
+ $invalid,
+ $warning,
+ );
+ }
}
$random = randomize_loose_name($table_name)
}
$warning,
);
- # If the name is a rational number, add tests for a
- # non-reduced form, and for a floating point equivalent.
- if ($table_name =~ qr{/}) {
+ if ($property->name eq 'nv') {
+ if ($table_name !~ qr{/}) {
+ push @output, generate_tests($property_name,
+ sprintf("%.15e", $table_name),
+ $valid,
+ $invalid,
+ $warning,
+ );
+ }
+ else {
+ # If the name is a rational number, add tests for a
+ # non-reduced form, and for a floating point equivalent.
# 60 is a number divisible by a bunch of things
my ($numerator, $denominator) = $table_name
}
# Make tests for each possible precision from 1 to
- # just past the worst case.
+ # just past the worst case.
my $upper_limit = ($min_e_precision > $min_f_precision)
? $min_e_precision
: $min_f_precision;
# case, the representation at this
# precision could actually be a
# valid one for some other rational
- || ! grep { $_ eq $this_table }
+ || ! grep { $this_table
+ =~ / ^ $_ 0* $ /x }
@valid_base_floats)
{
push @output,
}
}
}
+ }
}
}
$table->DESTROY();
Skip => $Documentation,
),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
- Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
-
- # Don't use _Perl_WB as a synonym for
- # Word_Break in later perls, as it is tailored
- # and isn't the same as Word_Break
- 'ONLY_EARLY' ],
+ Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
Skip => 'Maps certain Unicode code points to their '
. 'legacy Japanese cell-phone values',
),
+ # This file is actually not usable as-is until 6.1.0, because the property
+ # is provisional, so its name is missing from PropertyAliases.txt until
+ # that release, so that further work would have to be done to get it to
+ # work properly
Input_file->new('ScriptExtensions.txt', v6.0.0,
Property => 'Script_Extensions',
Early => [ sub {} ], # Doesn't do anything but ensures
: $IGNORED),
),
# These two Indic files are actually not usable as-is until 6.1.0,
- # because their property values are missing from PropValueAliases.txt
- # until that release, so that further work would have to be done to get
- # them to work properly, which isn't worth it because of them being
- # provisional.
+ # because they are provisional, so their property values are missing from
+ # PropValueAliases.txt until that release, so that further work would have
+ # to be done to get them to work properly.
Input_file->new('IndicMatraCategory.txt', v6.0.0,
Withdrawn => v8.0.0,
Property => 'Indic_Matra_Category',
Input_file->new('NushuSources.txt', v10.0.0,
Skip => 'Specifies source material for Nushu characters',
),
+ Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
+ Property => 'Equivalent_Unified_Ideograph',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
+ Input_file->new('EmojiData.txt', v11.0.0,
+ # Is in UAX #51 and not the UCD, so must be updated
+ # separately, and the first line edited to indicate the
+ # UCD release we're pretending it to be in. The UTC says
+ # this is a transitional state.
+ Pre_Handler => \&setup_emojidata,
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Each_Line_Handler => \&filter_emojidata_line,
+ ),
);
# End of all the preliminaries.
if ($version_of_mk_invlist_bounds lt $v_version) {
Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
. " to be checked and possibly updated to Unicode"
- . " $string_version");
+ . " $string_version. Failing tests will be marked TODO");
}
exit(0);
use strict;
use warnings;
+no warnings 'experimental::uniprop_wildcards';
# Test qr/\X/ and the \p{} regular expression constructs. This file is
# constructed by mktables from the tables it generates, so if mktables is