# expected, a warning will be generated. If an older version is being
# compiled, any bounds tests that fail in the generated test file (-maketest
# option) will be marked as TODO.
-my $version_of_mk_invlist_bounds = v10.0.0;
+my $version_of_mk_invlist_bounds = v11.0.0;
##########################################################################
#
# to use the -annotate option when using this. Run this program on a unicore
# containing the starting release you want to compare. Save that output
# structure. Then, switching to a unicore with the ending release, change the
-# 0 in the $string_compare_versions definition just below to a string
+# "" in the $string_compare_versions definition just below to a string
# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
# to the starting release. This program will then compile, but throw away all
# code points introduced after the starting release. Finally use a diff tool
'Canonical_Combining_Class=Attached_Below_Left'
}
+# These property values were obsoleted as of Unicode 11.0
+if ($v_version ge v11.0.0) {
+ push @tables_that_may_be_empty, qw(
+ Grapheme_Cluster_Break=E_Base
+ Grapheme_Cluster_Break=E_Base_GAZ
+ Grapheme_Cluster_Break=E_Modifier
+ Grapheme_Cluster_Break=Glue_After_Zwj
+ Word_Break=E_Base
+ Word_Break=E_Base_GAZ
+ Word_Break=E_Modifier
+ Word_Break=Glue_After_Zwj);
+}
+
# Enum values for to_output_map() method in the Map_Table package. (0 is don't
# output)
my $EXTERNAL_MAP = 1;
);
}
-
# Add any explicit cjk values
$file->insert_lines(@cjk_property_values);
my $float = eval $rational;
$float = sprintf "%.*e", $E_FLOAT_PRECISION, $float;
+
+ # Strip off any leading zeros beyond 2 digits to make it C99 compliant.
+ # (Windows has 3 digit exponents, contrary to C99)
+ $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
+
+ if ( defined $nv_floating_to_rational{$float}
+ && $nv_floating_to_rational{$float} ne $rational)
+ {
+ die Carp::my_carp_bug("Both '$rational' and"
+ . " '$nv_floating_to_rational{$float}' evaluate to"
+ . " the same floating point number."
+ . " \$E_FLOAT_PRECISION must be increased");
+ }
$nv_floating_to_rational{$float} = $rational;
return;
}
return;
}
+sub setup_emojidata {    # Registered as the Pre_Handler for EmojiData.txt
+ my $prop_ref = Property->new('XPG',    # Name later used by property_ref()
+ Full_Name => 'Extended_Pictographic',
+ );
+ $prop_ref->set_fate($PLACEHOLDER,    # Fate, then (presumably) the reason
+ "Not part of the Unicode Character Database");
+}
+
+sub filter_emojidata_line {    # Each_Line_Handler for EmojiData.txt
+ # We are interested in only one property, Extended_Pictographic, from
+ # this non-UCD data file; it is turned into a Perl property so that it
+ # isn't accessible to the users.  The current line is in $_.
+
+ $_ = "" unless /\bExtended_Pictographic\b/;    # Empty all other lines
+
+ return;
+}
+
sub generate_hst {
# Populates the Hangul Syllable Type property from first principles
}
# Perl tailors the WordBreak property so that \b{wb} doesn't split
- # adjacent spaces into separate words. First create a copy of the regular
- # WB property as '_Perl_WB'. (On Unicode releases earlier than when WB
- # was defined for, this will already have been done by the substitute file
- # portion for 'Input_file' code for WB.)
+ # adjacent spaces into separate words. Unicode 11.0 moved in that
+ # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically)
+ # NO-BREAK SPACE (U+00A0) as breaking, so we retain the original Perl
+ # customization. To do this, in the Perl copy of WB, simply replace the
+ # mappings of horizontal space characters that otherwise would map to the
+ # default or to the 11.0 'WSegSpace' so that they map to our tailoring.
my $perl_wb = property_ref('_Perl_WB');
- if (! defined $perl_wb) {
- $perl_wb = Property->new('_Perl_WB',
- Fate => $INTERNAL_ONLY,
- Perl_Extension => 1,
- Directory => $map_directory,
- Type => $STRING);
- my $wb = property_ref('Word_Break');
- $perl_wb->initialize($wb);
- $perl_wb->set_default_map($wb->default_map);
- }
-
- # And simply replace the mappings of horizontal space characters that
- # otherwise would map to the default to instead map to our tailoring.
my $default = $perl_wb->default_map;
for my $range ($Blank->ranges) {
for my $i ($range->start .. $range->end) {
- next unless $perl_wb->value_of($i) eq $default;
+ my $value = $perl_wb->value_of($i);
+
+ next unless $value eq $default || $value eq 'WSegSpace';
$perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
Replace => $UNCONDITIONALLY);
}
}
+ # Also starting in Unicode 11.0, rules for some of the boundary types are
+ # based on a non-UCD property (which we have read in if it exists).
+ # Recall that these boundary properties partition the code points into
+ # equivalence classes (represented as enums).
+ #
+ # The loop below goes through each code point that matches the non-UCD
+ # property, and for each current equivalence class containing such a code
+ # point, splits it so that those that are in both are now in a newly
+ # created equivalence class whose name is a combination of the property
+ # and the old class name, leaving unchanged everything that doesn't match
+ # the non-UCD property.
+ my $pictographic_emoji = property_ref('XPG');
+ if (defined $pictographic_emoji) {
+ foreach my $base_property (property_ref('GCB'),
+ property_ref('WB'))
+ {
+ my $property = property_ref('_Perl_' . $base_property->name);
+ foreach my $range ($pictographic_emoji->table('Y')->ranges) {
+ foreach my $i ($range->start .. $range->end) {
+ my $current = $property->value_of($i);
+ $current = $property->table($current)->short_name;
+ $property->add_map($i, $i, 'XPG_' . $current,
+ Replace => $UNCONDITIONALLY);
+ }
+ }
+ }
+ }
+
# Create a version of the LineBreak property with the mappings that are
# omitted in the default algorithm remapped to what
# http://www.unicode.org/reports/tr14 says they should be.
though not all are enabled by default. The omitted ones are the Unihan
properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
deprecated or Unicode-internal properties. (An installation may choose to
-recompile Perl's tables to change this. See L<Unicode character
+recompile Perl's tables to change this. See L</Unicode character
properties that are NOT accepted by Perl>.)
For most purposes, access to Unicode properties from the Perl core is through
} property_ref('*'))
{
# Non-binary properties should not match \p{}; Test all for that.
- if ($property->type != $BINARY) {
+ if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
$property->aliases;
foreach my $property_alias ($property->aliases) {
# already guaranteed to be in error
my $already_error = ! $table->file_path;
+ # A table whose name begins with 'In' or 'Is' could actually be a
+ # user-defined property, so it won't be a compile-time error, as
+ # the definitions of those can be deferred until runtime
+ next if $already_error && $table_name =~ / ^ I[ns] /x;
+
# Generate error cases for this alias.
push @output, generate_error($property_name,
$table_name,
}
# Make tests for each possible precision from 1 to
- # just past the worst case.
+ # just past the worst case.
my $upper_limit = ($min_e_precision > $min_f_precision)
? $min_e_precision
: $min_f_precision;
Skip => $Documentation,
),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
- Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
-
- # Don't use _Perl_WB as a synonym for
- # Word_Break in later perls, as it is tailored
- # and isn't the same as Word_Break
- 'ONLY_EARLY' ],
+ Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
Skip => 'Maps certain Unicode code points to their '
. 'legacy Japanese cell-phone values',
),
+ # This file is actually not usable as-is until 6.1.0: because the property
+ # is provisional, its name is missing from PropertyAliases.txt until that
+ # release, so further work would have to be done to get it to work
+ # properly.
Input_file->new('ScriptExtensions.txt', v6.0.0,
Property => 'Script_Extensions',
Early => [ sub {} ], # Doesn't do anything but ensures
: $IGNORED),
),
# These two Indic files are actually not usable as-is until 6.1.0,
- # because their property values are missing from PropValueAliases.txt
- # until that release, so that further work would have to be done to get
- # them to work properly, which isn't worth it because of them being
- # provisional.
+ # because they are provisional and so their property values are missing
+ # from PropValueAliases.txt until that release; further work would have
+ # to be done to get them to work properly.
Input_file->new('IndicMatraCategory.txt', v6.0.0,
Withdrawn => v8.0.0,
Property => 'Indic_Matra_Category',
Input_file->new('NushuSources.txt', v10.0.0,
Skip => 'Specifies source material for Nushu characters',
),
+ Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
+ Property => 'Equivalent_Unified_Ideograph',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
+ Input_file->new('EmojiData.txt', v11.0.0,
+ # Is in UTS #51 and not the UCD, so must be updated
+ # separately, and the first line edited to indicate the
+ # UCD release we're pretending it to be in. The UTC says
+ # this is a transitional state.
+ Pre_Handler => \&setup_emojidata,
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Each_Line_Handler => \&filter_emojidata_line,
+ ),
);
# End of all the preliminaries.
if ($version_of_mk_invlist_bounds lt $v_version) {
Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
. " to be checked and possibly updated to Unicode"
- . " $string_version");
+ . " $string_version. Failing tests will be marked TODO");
}
exit(0);