prop_invlist
prop_invmap search_invlist
charprop
+ num
);
require './regen/regen_lib.pl';
require './regen/charset_translations.pl';
require './lib/unicore/Heavy.pl';
+use re "/aa";
# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# out-of-sync, or the wrong data structure being passed. Currently that
# random number is:
-# charclass_invlists.h now also has a partial implementation of inversion
-# maps; enough to generate tables for the line break properties, such as GCB
+# charclass_invlists.h now also contains inversion maps and enum definitions
+# for those maps that have a finite number of possible values
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
# integer or float
-my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
+my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /x;
+my %keywords;
my $table_name_prefix = "PL_";
# Matches valid C language enum names: begins with ASCII alphabetic, then any
# enums that should be made public
my %public_enums = (
- #_Perl_SCX => 1
+ _Perl_SCX => 1
);
# The symbols generated by this program are all currently defined only in a
my @a2n;
+my %prop_name_aliases;
+# Invert this hash so that for each canonical name, we get a list of things
+# that map to it (excluding itself)
+foreach my $name (sort keys %utf8::loose_property_name_of) {
+ my $canonical = $utf8::loose_property_name_of{$name};
+ push @{$prop_name_aliases{$canonical}}, $name if $canonical ne $name;
+}
+
# Output these tables in the same vicinity as each other, so that will get
-# paged in at about the same time
+# paged in at about the same time. These are also assumed to be the exact
+# same list as those properties used internally by perl.
my %keep_together = (
assigned => 1,
ascii => 1,
+ upper => 1,
+ lower => 1,
+ title => 1,
cased => 1,
+ uppercaseletter => 1,
+ lowercaseletter => 1,
+ titlecaseletter => 1,
+ casedletter => 1,
vertspace => 1,
xposixalnum => 1,
xposixalpha => 1,
_perl_problematic_locale_folds => 1,
_perl_quotemeta => 1,
);
+my %perl_tags; # So can find synonyms of the above properties
sub uniques {
# Returns non-duplicated input values. From "Perl Best Practices:
print $out_fh "\n" . get_conditional_compile_line_end();
}
-sub switch_pound_if ($$) {
+sub switch_pound_if ($$;$) {
my $name = shift;
my $new_pound_if = shift;
+ my $charset = shift;
+
my @new_pound_if = ref ($new_pound_if)
? sort @$new_pound_if
: $new_pound_if;
# Switch to new #if given by the 2nd argument. If there is an override
# for this, it instead switches to that. The 1st argument is the
# static's name, used only to check if there is an override for this
+ #
+ # The 'charset' parmameter, if present, is used to first end the charset
+ # #if if we actually do a switch, and then restart it afterwards. This
+ # code, then assumes that the charset #if's are enclosed in the file ones.
if (exists $exceptions_to_where_to_define{$name}) {
@new_pound_if = $exceptions_to_where_to_define{$name};
# Change to the new one if different from old
if ($in_file_pound_if ne $new_pound_if) {
+ end_charset_pound_if() if defined $charset;
+
# Exit any current #if
if ($in_file_pound_if) {
end_file_pound_if;
$in_file_pound_if = $new_pound_if;
print $out_fh "\n#if $in_file_pound_if\n";
+
+ start_charset_pound_if ($charset, 1) if defined $charset;
}
}
? 'PERL_IN_UTF8_C'
: 'PERL_IN_REGEXEC_C';
- end_charset_pound_if;
- end_file_pound_if;
- switch_pound_if($name, $where) unless exists $public_enums{$name};
- start_charset_pound_if($charset, 1);
+ if (! exists $public_enums{$name}) {
+ switch_pound_if($name, $where, $charset);
+ }
+ else {
+ end_charset_pound_if;
+ end_file_pound_if;
+ start_charset_pound_if($charset, 1);
+ }
# If the enum only contains one element, that is a dummy, default one
if (scalar @enum_definition > 1) {
print $out_fh "} $enum_declaration_type;\n";
}
- end_charset_pound_if;
- switch_pound_if($name, $where);
- start_charset_pound_if($charset, 1);
+ switch_pound_if($name, $where, $charset);
$invmap_declaration_type = ($input_format =~ /s/)
? $enum_declaration_type
# that.
for (my $i = 0; $i < @decimals_invlist; $i += 2) {
my $code_point = $decimals_invlist[$i];
- next if chr($code_point) !~ /\p{Nv=0}/;
+ next if num(chr($code_point)) ne '0';
# Turn the scripts this zero is in into a list.
my @scripts = split ",",
# Add to the non-finals list each code point that is in a non-final
# position
for my $j (0 .. @{$folds_ref->[$i]} - 2) {
- push @is_non_final_fold, $folds_ref->[$i][$j]
- unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
+ push @is_non_final_fold, $folds_ref->[$i][$j];
}
+ @is_non_final_fold = uniques @is_non_final_fold;
}
sub _Perl_Non_Final_Folds {
# other. This situation happens in Unicode 3.0.1, but probably no
# other version.
foreach my $fold (keys %new) {
- my $folds_to_string = $fold =~ /\D/a;
+ my $folds_to_string = $fold =~ /\D/;
# If the bucket contains only one element, convert from an array to a
# scalar
for my $i (0 .. $size - 1) {
no warnings 'numeric';
- $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
+ $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /x;
$spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
}
# are needed by perl, but aren't in all Unicode releases.
my @bin_props;
-my @bin_prop_defines;
+my @perl_prop_synonyms;
my %enums;
my @deprecated_messages = ""; # Element [0] is a placeholder
my %deprecated_tags;
+my $float_e_format = qr/ ^ -? \d \. \d+ e [-+] \d+ $ /x;
+
+# Create another hash that maps floating point x.yyEzz representation to what
+# %stricter_to_file_of does for the equivalent rational. A typical entry in
+# the latter hash is
+#
+# 'nv=1/2' => 'Nv/1_2',
+#
+# From that, this loop creates an entry
+#
+# 'nv=5.00e-01' => 'Nv/1_2',
+#
+# %stricter_to_file_of contains far more than just the rationals. Instead we
+# use %utf8::nv_floating_to_rational which should have an entry for each
+# nv in the former hash.
+my %floating_to_file_of;
+foreach my $key (keys %utf8::nv_floating_to_rational) {
+ my $value = $utf8::nv_floating_to_rational{$key};
+ $floating_to_file_of{$key} = $utf8::stricter_to_file_of{"nv=$value"};
+}
+
# Collect all the binary properties from data in lib/unicore
# Sort so that complements come after the main table, and the shortest
-# names first, finally alphabetically.
+# names first, finally alphabetically. Also, sort together the tables we want
+# to be kept together, and prefer those with 'posix' in their names, which is
+# what the C code is expecting their names to be.
foreach my $property (sort
{ exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
+ or $b =~ /posix/i <=> $a =~ /posix/i
+ or $b =~ /perl/i <=> $a =~ /perl/i
+ or $a =~ $float_e_format <=> $b =~ $float_e_format
or $a =~ /!/ <=> $b =~ /!/
or length $a <=> length $b
or $a cmp $b
} keys %utf8::loose_to_file_of,
- keys %utf8::stricter_to_file_of
+ keys %utf8::stricter_to_file_of,
+ keys %floating_to_file_of
) {
# These two hashes map properties to values that can be considered to
# identical entries. Otherwise they differ in some way.
my $tag = $utf8::loose_to_file_of{$property};
$tag = $utf8::stricter_to_file_of{$property} unless defined $tag;
+ $tag = $floating_to_file_of{$property} unless defined $tag;
# The tag may contain an '!' meaning it is identical to the one formed
- # by removing the !, except that it is inverted, so we don't need a
- # table for it
- next if $tag =~ s/!//;
+ # by removing the !, except that it is inverted.
+ my $inverted = $tag =~ s/!//;
+
+ # This hash is lacking the property name
+ $property = "nv=$property" if $property =~ $float_e_format;
# The list of 'prop=value' entries that this single entry expands to
my @this_entries;
# thing if there is no '='
my ($lhs, $rhs) = $property =~ / ( [^=]* ) ( =? .*) /x;
- if (! exists $enums{$tag}) {
+ # $lhs then becomes the property name. See if there are any synonyms
+ # for this property.
+ if (exists $prop_name_aliases{$lhs}) {
+
+ # If so, do the combinatorics so that a new entry is added for
+ # each legal property combined with the property value (which is
+ # $rhs)
+ foreach my $alias (@{$prop_name_aliases{$lhs}}) {
+
+ # But, there are some ambiguities, like 'script' is a synonym
+ # for 'sc', and 'sc' can stand alone, meaning something
+ # entirely different than 'script'. 'script' cannot stand
+ # alone. Don't add if the potential new lhs is in the hash of
+ # stand-alone properties.
+ no warnings 'once';
+ next if $rhs eq "" && grep { $alias eq $_ }
+ keys %utf8::loose_property_to_file_of;
+
+ my $new_entry = $alias . $rhs;
+ push @this_entries, $new_entry;
+ }
+ }
+
+ # Above, we added the synonyms for the base entry we're now
+ # processing. But we haven't dealt with it yet. If we already have a
+ # property with the identical characteristics, this becomes just a
+ # synonym for it.
+ if (exists $enums{$tag}) {
+ push @this_entries, $property;
+ }
+ else { # Otherwise, create a new entry.
+
# Add to the list of properties to generate inversion lists for.
push @bin_props, uc $property;
+ # Create a rule for the parser
+ if (! exists $keywords{$property}) {
+ $keywords{$property} = token_name($property);
+ }
+
# And create an enum for it.
$enums{$tag} = $table_name_prefix . uc sanitize_name($property);
+
+ $perl_tags{$tag} = 1 if exists $keep_together{lc $property};
+
+ # Some properties are deprecated. This hash tells us so, and the
+ # warning message to raise if they are used.
+ if (exists $utf8::why_deprecated{$tag}) {
+ $deprecated_tags{$enums{$tag}} = scalar @deprecated_messages;
+ push @deprecated_messages, $utf8::why_deprecated{$tag};
+ }
+
+ # Our sort above should have made sure that we see the
+ # non-inverted version first, but this makes sure.
+ warn "$property is inverted!!!" if $inverted;
+ }
+
+ # Everything else is #defined to be the base enum, inversion is
+ # indicated by negating the value.
+ my $defined_to = "";
+ $defined_to .= "-" if $inverted;
+ $defined_to .= $enums{$tag};
+
+ # Go through the entries that evaluate to this.
+ @this_entries = uniques @this_entries;
+ foreach my $define (@this_entries) {
+
+ # There is a rule for the parser for each.
+ $keywords{$define} = $defined_to;
+
+ # And a #define for all simple names equivalent to a perl property,
+ # except those that begin with 'is' or 'in';
+ if (exists $perl_tags{$tag} && $property !~ / ^ i[ns] | = /x) {
+ push @perl_prop_synonyms, "#define "
+ . $table_name_prefix
+ . uc(sanitize_name($define))
+ . " $defined_to";
+ }
}
}
@bin_props = sort { exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
or $a cmp $b
} @bin_props;
+@perl_prop_synonyms = sort(uniques(@perl_prop_synonyms));
push @props, @bin_props;
foreach my $prop (@props) {
# 255 because a re-ordering could cause 256 to need to be in the same
# range as 255.)
if ( (@invmap && $maps_to_code_point)
- || ( ($invlist[0] < 256
+ || ( @invlist
+ && $invlist[0] < 256
&& ( $invlist[0] != 0
- || (scalar @invlist != 1 && $invlist[1] < 256)))))
+ || (scalar @invlist != 1 && $invlist[1] < 256))))
{
$same_in_all_code_pages = 0;
if (! @invmap) { # Straight inversion list
unshift @invlist, @new_invlist;
}
}
+ elsif (@invmap) { # inversion maps can't cope with this variable
+ # being true, even if it could be true
+ $same_in_all_code_pages = 0;
+ }
else {
$same_in_all_code_pages = 1;
}
$found_nonl1 = 1;
last;
}
- die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
+ if (! $found_nonl1) {
+ warn "No non-Latin1 code points in $prop_name";
+ output_invlist($prop_name, []);
+ last;
+ }
}
switch_pound_if ($prop_name, 'PERL_IN_UTF8_C');
}
}
+switch_pound_if ('binary_property_tables', 'PERL_IN_UTF8_C');
+
+print $out_fh "\nconst char * deprecated_property_msgs[] = {\n\t";
+print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages;
+print $out_fh "\n};\n";
+
+my @enums = sort values %enums;
+
+# Save a copy of these before modification
+my @invlist_names = map { "${_}_invlist" } @enums;
+
+# Post-process the enums for deprecated properties.
+if (scalar keys %deprecated_tags) {
+ my $seen_deprecated = 0;
+ foreach my $enum (@enums) {
+ if (grep { $_ eq $enum } keys %deprecated_tags) {
+
+ # Change the enum name for this deprecated property to a
+ # munged one to act as a placeholder in the typedef. Then
+ # make the real name be a #define whose value is such that
+ # its modulus with the number of enums yields the index into
+ # the table occupied by the placeholder. And so that dividing
+ # the #define value by the table length gives an index into
+ # the table of deprecation messages for the corresponding
+ # warning.
+ my $revised_enum = "${enum}_perl_aux";
+ if (! $seen_deprecated) {
+ $seen_deprecated = 1;
+ print $out_fh "\n";
+ }
+ print $out_fh "#define $enum ($revised_enum + (MAX_UNI_KEYWORD_INDEX * $deprecated_tags{$enum}))\n";
+ $enum = $revised_enum;
+ }
+ }
+}
+
+print $out_fh "\ntypedef enum {\n\tPERL_BIN_PLACEHOLDER = 0,\n\t";
+print $out_fh join ",\n\t", @enums;
+print $out_fh "\n";
+print $out_fh "} binary_invlist_enum;\n";
+print $out_fh "\n#define MAX_UNI_KEYWORD_INDEX $enums[-1]\n";
+
+print $out_fh "\n/* Synonyms for perl properties */\n";
+print $out_fh join "\n", @perl_prop_synonyms, "\n";
+
+print $out_fh "\nstatic const UV * const PL_uni_prop_ptrs\[] = {\n";
+print $out_fh "\tNULL,\t/* Placeholder */\n\t";
+print $out_fh join ",\n\t", @invlist_names;
+print $out_fh "\n";
+print $out_fh "};\n";
+
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
output_GCB_table();
lib/unicore/mktables
lib/Unicode/UCD.pm
regen/charset_translations.pl
+ regen/mk_PL_charclass.pl
);
{
# Depend on mktables’ own sources. It’s a shorter list of files than
}
read_only_bottom_close_and_rename($out_fh, \@sources);
+
+require './regen/mph.pl';
+
+sub token_name
+{
+ my $name = sanitize_name(shift);
+ warn "$name contains non-word" if $name =~ /\W/;
+
+ return "$table_name_prefix\U$name"
+}
+
+my $keywords_fh = open_new('uni_keywords.h', '>',
+ {style => '*', by => 'regen/mk_invlists.pl',
+ from => "mph.pl"});
+
+no warnings 'once';
+print $keywords_fh <<"EOF";
+/* The precisionn to use in "%.*e" formats */
+#define PL_E_FORMAT_PRECISION $utf8::e_precision
+
+EOF
+
+my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows) = MinimalPerfectHash::make_mph_from_hash(\%keywords);
+print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1, $length_all_keys, $smart_blob, $rows, undef, undef, undef, 'match_uniprop' );
+
+push @sources, 'regen/mph.pl';
+read_only_bottom_close_and_rename($keywords_fh, \@sources);