prop_invlist
prop_invmap search_invlist
charprop
+ num
);
require './regen/regen_lib.pl';
require './regen/charset_translations.pl';
require './lib/unicore/Heavy.pl';
+use re "/aa";
# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# out-of-sync, or the wrong data structure being passed. Currently that
# random number is:
-# charclass_invlists.h now also has a partial implementation of inversion
-# maps; enough to generate tables for the line break properties, such as GCB
+# charclass_invlists.h now also contains inversion maps and enum definitions
+# for those maps that have a finite number of possible values.
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
# integer or float
-my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
+# The fractional part is an optional non-capturing group '(?: ... )?'.
+# Spelling it '(:?' would instead make a capturing group that tolerates a
+# stray ':' before the decimal point (so "1:.5" would count as numeric).
+my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /x;
my %keywords;
my $table_name_prefix = "PL_";
my %keep_together = (
assigned => 1,
ascii => 1,
+ upper => 1,
+ lower => 1,
+ title => 1,
cased => 1,
+ uppercaseletter => 1,
+ lowercaseletter => 1,
+ titlecaseletter => 1,
+ casedletter => 1,
vertspace => 1,
xposixalnum => 1,
xposixalpha => 1,
# that.
for (my $i = 0; $i < @decimals_invlist; $i += 2) {
my $code_point = $decimals_invlist[$i];
- next if chr($code_point) !~ /\p{Nv=0}/;
+ next if num(chr($code_point)) ne '0';
# Turn the scripts this zero is in into a list.
my @scripts = split ",",
# other. This situation happens in Unicode 3.0.1, but probably no
# other version.
foreach my $fold (keys %new) {
- my $folds_to_string = $fold =~ /\D/a;
+ my $folds_to_string = $fold =~ /\D/;
# If the bucket contains only one element, convert from an array to a
# scalar
for my $i (0 .. $size - 1) {
no warnings 'numeric';
- $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
+ $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /x;
$spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
}
my @deprecated_messages = ""; # Element [0] is a placeholder
my %deprecated_tags;
+# Recognizes a number written in "%e" style: optional minus sign, a single
+# digit, a fractional part, and an explicitly signed exponent.
+my $float_e_format = qr/ ^ -? \d \. \d+ e [-+] \d+ $ /x;
+
+# %utf8::stricter_to_file_of is keyed by (among much else) the rational
+# spelling of each numeric value, for example
+#
+#       'nv=1/2' => 'Nv/1_2',
+#
+# Build a companion hash that reaches the same file through the floating
+# point spelling of that value.  Its keys are the bare x.yye+-zz strings
+# taken from %utf8::nv_floating_to_rational (no property name in front), so
+# the entry corresponding to the one above would be
+#
+#       '5.00e-01' => 'Nv/1_2',
+#
+# Going through %utf8::nv_floating_to_rational rather than scanning
+# %utf8::stricter_to_file_of avoids wading through all of the latter's
+# non-numeric entries; the former is expected to hold one entry per nv.
+my %floating_to_file_of = map {
+    my $rational = $utf8::nv_floating_to_rational{$_};
+    ( $_ => $utf8::stricter_to_file_of{"nv=$rational"} )
+} keys %utf8::nv_floating_to_rational;
+
# Collect all the binary properties from data in lib/unicore
# Sort so that complements come after the main table, and the shortest
# names first, finally alphabetically. Also, sort together the tables we want
{ exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
or $b =~ /posix/i <=> $a =~ /posix/i
or $b =~ /perl/i <=> $a =~ /perl/i
+ or $a =~ $float_e_format <=> $b =~ $float_e_format
or $a =~ /!/ <=> $b =~ /!/
or length $a <=> length $b
or $a cmp $b
} keys %utf8::loose_to_file_of,
- keys %utf8::stricter_to_file_of
+ keys %utf8::stricter_to_file_of,
+ keys %floating_to_file_of
) {
# These two hashes map properties to values that can be considered to
# identical entries. Otherwise they differ in some way.
my $tag = $utf8::loose_to_file_of{$property};
$tag = $utf8::stricter_to_file_of{$property} unless defined $tag;
+ $tag = $floating_to_file_of{$property} unless defined $tag;
# The tag may contain an '!' meaning it is identical to the one formed
# by removing the !, except that it is inverted.
my $inverted = $tag =~ s/!//;
+ # This hash is lacking the property name
+ $property = "nv=$property" if $property =~ $float_e_format;
+
# The list of 'prop=value' entries that this single entry expands to
my @this_entries;
# 255 because a re-ordering could cause 256 to need to be in the same
# range as 255.)
if ( (@invmap && $maps_to_code_point)
- || ( ($invlist[0] < 256
+ || ( @invlist
+ && $invlist[0] < 256
&& ( $invlist[0] != 0
- || (scalar @invlist != 1 && $invlist[1] < 256)))))
+ || (scalar @invlist != 1 && $invlist[1] < 256))))
{
$same_in_all_code_pages = 0;
if (! @invmap) { # Straight inversion list
unshift @invlist, @new_invlist;
}
}
+ elsif (@invmap) { # inversion maps can't cope with this variable
+ # being true, even if it could be true
+ $same_in_all_code_pages = 0;
+ }
else {
$same_in_all_code_pages = 1;
}
# Build the identifier used for a table from the name passed as the sole
# argument: the name is run through sanitize_name() (not shown in this
# excerpt), upper-cased, and prefixed with $table_name_prefix.
#
# A warning is issued if the sanitized name still contains a non-word
# character; the identifier is returned regardless.
sub token_name
{
my $name = sanitize_name(shift);
- warn "$name contains non-word" if $name =~ /\W/a;
+ warn "$name contains non-word" if $name =~ /\W/;
# \U upper-cases everything that follows it in the string, i.e. $name only
return "$table_name_prefix\U$name"
}
{style => '*', by => 'regen/mk_invlists.pl',
from => "mph.pl"});
+# Write a C macro to $keywords_fh giving the precision to use with "%.*e";
+# its value is interpolated from $utf8::e_precision.
+# NOTE(review): 'once' warnings are presumably disabled because
+# $utf8::e_precision is mentioned only here -- confirm.
+no warnings 'once';
+print $keywords_fh <<"EOF";
+/* The precision to use in "%.*e" formats */
+#define PL_E_FORMAT_PRECISION $utf8::e_precision
+
+EOF
+
my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows) = MinimalPerfectHash::make_mph_from_hash(\%keywords);
print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1, $length_all_keys, $smart_blob, $rows, undef, undef, undef, 'match_uniprop' );