prop_invlist
prop_invmap search_invlist
charprop
+ num
+ charblock
);
require './regen/regen_lib.pl';
require './regen/charset_translations.pl';
+require './lib/unicore/UCD.pl';
+use re "/aa";
# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# out-of-sync, or the wrong data structure being passed. Currently that
# random number is:
-# charclass_invlists.h now also has a partial implementation of inversion
-# maps; enough to generate tables for the line break properties, such as GCB
-
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
-# integer or float
-my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
+# charclass_invlists.h now also contains inversion maps and enum definitions
+# for those maps that have a finite number of possible values
+
+# integer or float (no exponent)
+my $integer_or_float_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /x;
+
+# Also includes rationals
+my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x;
+
+# More than one code point may have the same code point as their fold. This
+# gives the maximum number in the current Unicode release. (The folded-to
+# code point is not included in this count.) Most folds are pairs of code
+# points, like 'B' and 'b', so this number is at least one.
+my $max_fold_froms = 1;
+
+my %keywords;
+my $table_name_prefix = "UNI_";
# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
# enums that should be made public
my %public_enums = (
- #_Perl_SCX => 1
+ _Perl_SCX => 1
);
# The symbols generated by this program are all currently defined only in a
(
#_Perl_IVCF => 'PERL_IN_REGCOMP_C',
);
+
my %where_to_define_enums = ();
+my $applies_to_all_charsets_text = "all charsets";
+
my %gcb_enums;
my @gcb_short_enums;
my %gcb_abbreviations;
my @a2n;
+my %prop_name_aliases;
+# Invert this hash so that for each canonical name, we get a list of things
+# that map to it (excluding itself)
+foreach my $name (sort keys %Unicode::UCD::loose_property_name_of) {
+ my $canonical = $Unicode::UCD::loose_property_name_of{$name};
+ push @{$prop_name_aliases{$canonical}}, $name if $canonical ne $name;
+}
+
+# Output these tables in the same vicinity as each other, so that will get
+# paged in at about the same time. These are also assumed to be the exact
+# same list as those properties used internally by perl.
+my %keep_together = (
+ assigned => 1,
+ ascii => 1,
+ upper => 1,
+ lower => 1,
+ title => 1,
+ cased => 1,
+ uppercaseletter => 1,
+ lowercaseletter => 1,
+ titlecaseletter => 1,
+ casedletter => 1,
+ vertspace => 1,
+ xposixalnum => 1,
+ xposixalpha => 1,
+ xposixblank => 1,
+ xposixcntrl => 1,
+ xposixdigit => 1,
+ xposixgraph => 1,
+ xposixlower => 1,
+ xposixprint => 1,
+ xposixpunct => 1,
+ xposixspace => 1,
+ xposixupper => 1,
+ xposixword => 1,
+ xposixxdigit => 1,
+ posixalnum => 1,
+ posixalpha => 1,
+ posixblank => 1,
+ posixcntrl => 1,
+ posixdigit => 1,
+ posixgraph => 1,
+ posixlower => 1,
+ posixprint => 1,
+ posixpunct => 1,
+ posixspace => 1,
+ posixupper => 1,
+ posixword => 1,
+ posixxdigit => 1,
+ _perl_any_folds => 1,
+ _perl_folds_to_multi_char => 1,
+ _perl_is_in_multi_char_fold => 1,
+ _perl_non_final_folds => 1,
+ _perl_idstart => 1,
+ _perl_idcont => 1,
+ _perl_charname_begin => 1,
+ _perl_charname_continue => 1,
+ _perl_problematic_locale_foldeds_start => 1,
+ _perl_problematic_locale_folds => 1,
+ _perl_quotemeta => 1,
+ );
+my %perl_tags; # So can find synonyms of the above properties
+
+my $unused_table_hdr = 'u'; # Heading for row or column for unused values
+
sub uniques {
# Returns non-duplicated input values. From "Perl Best Practices:
# Encapsulated Cleverness". p. 455 in first edition.
# Returns the input Unicode code point translated to native.
- return $cp if $cp !~ $numeric_re || $cp > 255;
+ return $cp if $cp !~ $integer_or_float_re || $cp > 255;
return $a2n[$cp];
}
}
}
-sub switch_pound_if ($$) {
+sub end_charset_pound_if {
+ print $out_fh "\n" . get_conditional_compile_line_end();
+}
+
+sub switch_pound_if ($$;$) {
my $name = shift;
my $new_pound_if = shift;
+ my $charset = shift;
+
my @new_pound_if = ref ($new_pound_if)
? sort @$new_pound_if
: $new_pound_if;
# Switch to new #if given by the 2nd argument. If there is an override
# for this, it instead switches to that. The 1st argument is the
# static's name, used only to check if there is an override for this
+ #
+ # The 'charset' parmameter, if present, is used to first end the charset
+ # #if if we actually do a switch, and then restart it afterwards. This
+ # code, then assumes that the charset #if's are enclosed in the file ones.
if (exists $exceptions_to_where_to_define{$name}) {
@new_pound_if = $exceptions_to_where_to_define{$name};
}
foreach my $element (@new_pound_if) {
+
+ # regcomp.c is arranged so that the tables are not compiled in
+ # re_comp.c */
+ my $no_xsub = 1 if $element =~ / PERL_IN_ (?: REGCOMP ) _C /x;
$element = "defined($element)";
+ $element = "($element && ! defined(PERL_IN_XSUB_RE))" if $no_xsub;
}
$new_pound_if = join " || ", @new_pound_if;
# Change to the new one if different from old
if ($in_file_pound_if ne $new_pound_if) {
+ end_charset_pound_if() if defined $charset;
+
# Exit any current #if
if ($in_file_pound_if) {
end_file_pound_if;
$in_file_pound_if = $new_pound_if;
print $out_fh "\n#if $in_file_pound_if\n";
+
+ start_charset_pound_if ($charset, 1) if defined $charset;
}
}
+sub start_charset_pound_if ($;$) {
+ print $out_fh "\n" . get_conditional_compile_line_start(shift, shift);
+}
+
+{ # Closure
+ my $fh;
+ my $in_doinit = 0;
+
+ sub output_table_header($$$;$@) {
+
+ # Output to $fh the heading for a table given by the other inputs
+
+ $fh = shift;
+ my ($type, # typedef of table, like UV, UV*
+ $name, # name of table
+ $comment, # Optional comment to put on header line
+ @sizes # Optional sizes of each array index. If omitted,
+ # there is a single index whose size is computed by
+ # the C compiler.
+ ) = @_;
+
+ $type =~ s/ \s+ $ //x;
+
+ # If a the typedef is a ptr, add in an extra const
+ $type .= " const" if $type =~ / \* $ /x;
+
+ $comment = "" unless defined $comment;
+ $comment = " /* $comment */" if $comment;
+
+ my $array_declaration;
+ if (@sizes) {
+ $array_declaration = "";
+ $array_declaration .= "[$_]" for @sizes;
+ }
+ else {
+ $array_declaration = '[]';
+ }
+
+ my $declaration = "$type ${name}$array_declaration";
+
+ # Things not matching this are static. Otherwise, it is an external
+ # constant, initialized only under DOINIT.
+ #
+ # (Currently everything is static)
+ if ($in_file_pound_if !~ / PERL_IN_ (?: ) _C /x) {
+ $in_doinit = 0;
+ print $fh "\nstatic const $declaration = {$comment\n";
+ }
+ else {
+ $in_doinit = 1;
+ print $fh <<EOF;
+
+# ifndef DOINIT
+
+EXTCONST $declaration;
+
+# else
+
+EXTCONST $declaration = {$comment
+EOF
+ }
+ }
+
+ sub output_table_trailer() {
+
+ # Close out a table started by output_table_header()
+
+ print $fh "};\n";
+ if ($in_doinit) {
+ print $fh "\n# endif /* DOINIT */\n\n";
+ $in_doinit = 0;
+ }
+ }
+} # End closure
+
+
sub output_invlist ($$;$) {
my $name = shift;
my $invlist = shift; # Reference to inversion list array
unshift @$invlist, 0;
$zero_or_one = 1;
}
- my $count = @$invlist;
- switch_pound_if ($name, 'PERL_IN_UTF8_C');
+ $charset = "for $charset" if $charset;
+ output_table_header($out_fh, "UV", "${name}_invlist", $charset);
- print $out_fh "\nstatic const UV ${name}_invlist[] = {";
- print $out_fh " /* for $charset */" if $charset;
- print $out_fh "\n";
-
- print $out_fh "\t$count,\t/* Number of elements */\n";
- print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
- print $out_fh "\t", $zero_or_one,
- ",\t/* 0 if the list starts at 0;",
- "\n\t\t 1 if it starts at the element beyond 0 */\n";
+ my $count = @$invlist;
+ print $out_fh <<EOF;
+\t$count,\t/* Number of elements */
+\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */
+\t$zero_or_one,\t/* 0 if the list starts at 0;
+\t\t 1 if it starts at the element beyond 0 */
+EOF
# The main body are the UVs passed in to this routine. Do the final
# element separately
print $out_fh "\n";
}
- print $out_fh "};\n";
+ output_table_trailer();
}
sub output_invmap ($$$$$$$) {
}
# The internal enums come last, and in the order specified.
+ #
+ # The internal one named EDGE is also used a marker. Any ones that
+ # come after it are used in the algorithms below, and so must be
+ # defined, even if the release of Unicode this is being compiled for
+ # doesn't use them. But since no code points are assigned to them in
+ # such a release, those values will never be accessed. We collapse
+ # all of them into a single placholder row and a column. The
+ # algorithms below will fill in those cells with essentially garbage,
+ # but they are never read, so it doesn't matter. This allows the
+ # algorithm to remain the same from release to release.
+ #
+ # In one case, regexec.c also uses a placeholder which must be defined
+ # here, and we put it in the unused row and column as its value is
+ # never read.
+ #
my @enums = @input_enums;
my @extras;
+ my @unused_enums;
+ my $unused_enum_value = @enums;
if ($extra_enums ne "") {
@extras = split /,/, $extra_enums;
+ my $seen_EDGE = 0;
# Don't add if already there.
foreach my $this_extra (@extras) {
next if grep { $_ eq $this_extra } @enums;
-
- push @enums, $this_extra;
+ if ($this_extra eq 'EDGE') {
+ push @enums, $this_extra;
+ $seen_EDGE = 1;
+ }
+ elsif ($seen_EDGE) {
+ push @unused_enums, $this_extra;
+ }
+ else {
+ push @enums, $this_extra;
+ }
}
+
+ @unused_enums = sort @unused_enums;
+ $unused_enum_value = @enums; # All unused have the same value,
+ # one beyond the final used one
}
# Assign a value to each element of the enum type we are creating.
# all the tables
my $type = lc $prop_name;
- my $placeholder = "a";
-
# Skip if we've already done this code, which populated
# this hash
if (eval "! \%${type}_enums") {
#
# First are those enums that are not part of the
# property, but are defined by this code. By
- # convention these have all-caps names of at least
- # 4 characters. We use the lowercased name for
- # thse.
+ # convention these have all-caps names. We use
+ # the lowercased name for these.
#
- # Second are enums that are needed to get
- # regexec.c to compile, but don't exist in all
- # Unicode releases. To get here, we must be
- # compiling an earlier Unicode release that
- # doesn't have that enum, so just use a unique
- # anonymous name for it.
+ # Second are enums that are needed to get the
+ # algorithms below to work and/or to get regexec.c
+ # to compile, but don't exist in all Unicode
+ # releases. These are handled outside this loop
+ # as 'unused_enums'
if (grep { $_ eq $enum } @input_enums) {
$short = $enum
}
- elsif ($enum !~ / ^ [A-Z]{4,} $ /x) {
- $short = $placeholder++;
- }
else {
$short = lc $enum;
}
eval "\$${type}_short_enums[$value] = '$short'";
die $@ if $@;
}
- }
- }
- # Inversion map stuff is used only by regexec or utf-8 (if it is
- # for code points) , unless it is in the enum exception list
- my $where = (exists $where_to_define_enums{$name})
- ? $where_to_define_enums{$name}
- : ($input_format =~ /a/)
- ? 'PERL_IN_UTF8_C'
- : 'PERL_IN_REGEXEC_C';
+ # Each unused enum has the same value. They all are collapsed
+ # into one row and one column, named $unused_table_hdr.
+ if (@unused_enums) {
+ eval "\$${type}_short_enums['$unused_enum_value'] = '$unused_table_hdr'";
+ die $@ if $@;
- my $is_public_enum = exists $public_enums{$name};
- if ($is_public_enum) {
- end_file_pound_if;
- }
- else {
- switch_pound_if($name, $where);
+ foreach my $enum (@unused_enums) {
+ eval "\$${type}_enums{$enum} = $unused_enum_value";
+ die $@ if $@;
+ }
+ }
+ }
}
# The short names tend to be two lower case letters, but it looks
my $name = $enum_list[$i];
push @enum_definition, "\t${name_prefix}$name = $i";
}
+ if (@unused_enums) {
+ foreach my $unused (@unused_enums) {
+ push @enum_definition,
+ ",\n\t${name_prefix}$unused = $unused_enum_value";
+ }
+ }
# For an 'l' property, we need extra enums, because some of the
# elements are lists. Each such distinct list is placed in its own
$enum_declaration_type = "${name_prefix}enum";
- # Finished with the enum definition. If it only contains one element,
- # that is a dummy, default one
+ # Finished with the enum definition. Inversion map stuff is used only
+ # by regexec or utf-8 (if it is for code points) , unless it is in the
+ # enum exception list
+ my $where = (exists $where_to_define_enums{$name})
+ ? $where_to_define_enums{$name}
+ : ($input_format =~ /a/)
+ ? 'PERL_IN_UTF8_C'
+ : 'PERL_IN_REGEXEC_C';
+
+ if (! exists $public_enums{$name}) {
+ switch_pound_if($name, $where, $charset);
+ }
+ else {
+ end_charset_pound_if;
+ end_file_pound_if;
+ start_charset_pound_if($charset, 1);
+ }
+
+ # If the enum only contains one element, that is a dummy, default one
if (scalar @enum_definition > 1) {
# Currently unneeded
print $out_fh "} $enum_declaration_type;\n";
}
- switch_pound_if($name, $where) if $is_public_enum;
+ switch_pound_if($name, $where, $charset);
$invmap_declaration_type = ($input_format =~ /s/)
? $enum_declaration_type
# Output each aux table.
foreach my $table_number (@sorted_table_list) {
my $table = $inverted_mults{$table_number};
- print $out_fh "\nstatic const $aux_declaration_type $name_prefix$aux_table_prefix$table_number\[] = {\n";
+ output_table_header($out_fh,
+ $aux_declaration_type,
+ "$name_prefix$aux_table_prefix$table_number");
# Earlier, we joined the elements of this table together with a comma
my @elements = split ",", $table;
print $out_fh "\t${name_prefix}$elements[$i]";
}
}
- print $out_fh "\n};\n";
+
+ print $out_fh "\n";
+ output_table_trailer();
}
# Output the table that is indexed by the absolute value of the
# aux table enum and contains pointers to the tables output just
# above
- print $out_fh "\nstatic const $aux_declaration_type * const ${name_prefix}${aux_table_prefix}ptrs\[] = {\n";
+ output_table_header($out_fh, "$aux_declaration_type *",
+ "${name_prefix}${aux_table_prefix}ptrs");
print $out_fh "\tNULL,\t/* Placeholder */\n";
for my $i (1 .. @sorted_table_list) {
print $out_fh ",\n" if $i > 1;
print $out_fh "\t$name_prefix$aux_table_prefix$i";
}
- print $out_fh "\n};\n";
+ print $out_fh "\n";
+ output_table_trailer();
print $out_fh
"\n/* Parallel table to the above, giving the number of elements"
. " in each table\n * pointed to */\n";
- print $out_fh "static const U8 ${name_prefix}${aux_table_prefix}lengths\[] = {\n";
+ output_table_header($out_fh, "U8",
+ "${name_prefix}${aux_table_prefix}lengths");
print $out_fh "\t0,\t/* Placeholder */\n";
for my $i (1 .. @sorted_table_list) {
print $out_fh ",\n" if $i > 1;
print $out_fh "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */";
}
- print $out_fh "\n};\n";
+ print $out_fh "\n";
+ output_table_trailer();
} # End of outputting the auxiliary and associated tables
# The scx property used in regexec.c needs a specialized table which
# that.
for (my $i = 0; $i < @decimals_invlist; $i += 2) {
my $code_point = $decimals_invlist[$i];
- next if chr($code_point) !~ /\p{Nv=0}/;
+ next if num(chr($code_point)) ne '0';
# Turn the scripts this zero is in into a list.
my @scripts = split ",",
. " code point for that\n * script; 0 if the script has multiple"
. " digit sequences. Scripts without a\n * digit sequence use"
. " ASCII [0-9], hence are marked '0' */\n";
- print $out_fh "static const UV script_zeros[] = {\n";
+ output_table_header($out_fh, "UV", "script_zeros");
for my $i (0 .. @script_zeros - 1) {
my $code_point = $script_zeros[$i];
if (defined $code_point) {
print $out_fh "\t/* $enum_list[$i] */";
print $out_fh "\n";
}
- print $out_fh "};\n";
+ output_table_trailer();
} # End of special handling of scx
}
else {
&& $count;
# Now output the inversion map proper
- print $out_fh "\nstatic const $invmap_declaration_type ${name}_invmap[] = {";
- print $out_fh " /* for $charset */" if $charset;
- print $out_fh "\n";
+ $charset = "for $charset" if $charset;
+ output_table_header($out_fh, $invmap_declaration_type,
+ "${name}_invmap",
+ $charset);
# The main body are the scalars passed in to this routine.
for my $i (0 .. $count - 1) {
print $out_fh "," if $i < $count - 1;
print $out_fh "\n";
}
- print $out_fh "};\n";
+ output_table_trailer();
}
sub mk_invlist_from_sorted_cp_list {
die "Incorrect format '$format' for Case_Folding inversion map"
unless $format eq 'al'
|| $format eq 'a';
-my @has_multi_char_fold;
-my @is_non_final_fold;
-
-for my $i (0 .. @$folds_ref - 1) {
- next unless ref $folds_ref->[$i]; # Skip single-char folds
- push @has_multi_char_fold, $cp_ref->[$i];
-
- # Add to the non-finals list each code point that is in a non-final
- # position
- for my $j (0 .. @{$folds_ref->[$i]} - 2) {
- push @is_non_final_fold, $folds_ref->[$i][$j]
- unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
- }
-}
-
-sub _Perl_Non_Final_Folds {
- @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
- my @return = mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
- return \@return;
-}
-
sub _Perl_IVCF {
# This creates a map of the inversion of case folding. i.e., given a
# other. This situation happens in Unicode 3.0.1, but probably no
# other version.
foreach my $fold (keys %new) {
- my $folds_to_string = $fold =~ /\D/a;
+ my $folds_to_string = $fold =~ /\D/;
# If the bucket contains only one element, convert from an array to a
# scalar
# Now we have a hash that is the inversion of the case fold property.
- # Convert it to an inversion map.
+ # First find the maximum number of code points that fold to the same one.
+ foreach my $fold_to (keys %new) {
+ if (ref $new{$fold_to}) {
+ my $folders_count = scalar @{$new{$fold_to}};
+ $max_fold_froms = $folders_count if $folders_count > $max_fold_froms;
+ }
+ }
+ # Then convert the hash to an inversion map.
my @sorted_folds = sort { $a <=> $b } keys %new;
my (@invlist, @invmap);
push @invlist, $sorted_folds[-1] + 1;
push @invmap, 0;
+ push @invlist, 0x110000;
+ push @invmap, 0;
+
# All Unicode versions have some places where multiple code points map to
# the same one, so the format always has an 'l'
return \@invlist, \@invmap, 'al', $default;
return \@return;
}
+sub _Perl_CCC_non0_non230 {
+
+ # Create an inversion list of code points with non-zero canonical
+ # combining class that also don't have 230 as the class number. This is
+ # part of a Unicode Standard rule
+
+ my @nonzeros = prop_invlist("ccc=0");
+ shift @nonzeros; # Invert so is "ccc != 0"
+
+ my @return;
+
+ # Expand into list of code points, while excluding those with ccc == 230
+ for (my $i = 0; $i < @nonzeros; $i += 2) {
+ my $upper = ($i + 1) < @nonzeros
+ ? $nonzeros[$i+1] - 1 # In range
+ : $Unicode::UCD::MAX_CP; # To infinity.
+ for my $j ($nonzeros[$i] .. $upper) {
+ my @ccc_names = prop_value_aliases("ccc", charprop($j, "ccc"));
+
+ # Final element in @ccc_names will be all numeric
+ push @return, $j if $ccc_names[-1] != 230;
+ }
+ }
+
+ @return = sort { $a <=> $b } @return;
+ @return = mk_invlist_from_sorted_cp_list(\@return);
+ return \@return;
+}
+
sub output_table_common {
# Common subroutine to actually output the generated rules table.
my $column_width = 2; # We currently allow 2 digits for the number
- # If the maximum value in the table is 1, it can be a bool. (Being above
- # a U8 is not currently handled
- my $max_element = 0;
- for my $i (0 .. $size - 1) {
- for my $j (0 .. $size - 1) {
- next if $max_element >= $table_ref->[$i][$j];
- $max_element = $table_ref->[$i][$j];
- }
- }
- die "Need wider table column width given '$max_element"
- if length $max_element > $column_width;
-
- my $table_type = ($max_element == 1)
- ? 'bool'
- : 'U8';
+ # Being above a U8 is not currently handled
+ my $table_type = 'U8';
# If a name is longer than the width set aside for a column, its column
# needs to have increased spacing so that the name doesn't get truncated
# nor run into an adjacent column
my @spacers;
- # If we are being compiled on a Unicode version earlier than that which
- # this file was designed for, it may be that some of the property values
- # aren't in the current release, and so would be undefined if we didn't
- # define them ourselves. Earlier code has done this, making them
- # lowercase characters of length one. We look to see if any exist, so
- # that we can add an annotation to the output table
- my $has_placeholder = 0;
+ # Is there a row and column for unused values in this release?
+ my $has_unused = $names_ref->[$size-1] eq $unused_table_hdr;
for my $i (0 .. $size - 1) {
no warnings 'numeric';
- $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
$spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
}
- print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
+ output_table_header($out_fh, $table_type, "${property}_table", undef, $size, $size);
# Calculate the column heading line
my $header_line = "/* "
$header_line .= " */\n";
# If we have annotations, output it now.
- if ($has_placeholder || scalar %$abbreviations_ref) {
+ if ($has_unused || scalar %$abbreviations_ref) {
my $text = "";
foreach my $abbr (sort keys %$abbreviations_ref) {
$text .= "; " if $text;
$text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
}
- if ($has_placeholder) {
- $text .= "; other " if $text;
- $text .= "lowercase names are placeholders for"
- . " property values not defined until a later Unicode"
- . " release, so are irrelevant in this one, as they are"
- . " not assigned to any code points";
+ if ($has_unused) {
+ $text .= "; $unused_table_hdr stands for 'unused in this Unicode"
+ . " release (and the data in the row or column are garbage)"
}
my $indent = " " x 3;
print $out_fh "\n";
}
- print $out_fh "};\n";
+ output_table_trailer();
}
sub output_GCB_table() {
GCB_BREAKABLE => 1,
GCB_RI_then_RI => 2, # Rules 12 and 13
GCB_EX_then_EM => 3, # Rule 10
+ GCB_Maybe_Emoji_NonBreak => 4,
);
# The table is constructed in reverse order of the rules, to make the
$gcb_table[$gcb_enums{'Regional_Indicator'}]
[$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI};
+ # Post 11.0: GB11 \p{Extended_Pictographic} Extend* ZWJ
+ # × \p{Extended_Pictographic}
+ $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'XPG_XX'}] =
+ $gcb_actions{GCB_Maybe_Emoji_NonBreak};
+
+ # This and the rule GB10 obsolete starting with Unicode 11.0, can be left
+ # in as there are no code points that match, so the code won't ever get
+ # executed.
# Do not break within emoji modifier sequences or emoji zwj sequences.
- # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
+ # Pre 11.0: GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
$gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0;
$gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0;
}
}
- # LB8a Do not break between a zero width joiner and an ideograph, emoji
- # base or emoji modifier. This rule prevents breaks within emoji joiner
- # sequences.
- # ZWJ × (ID | EB | EM)
- $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
- = $lb_actions{'LB_NOBREAK'};
- $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
- = $lb_actions{'LB_NOBREAK'};
- $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
- = $lb_actions{'LB_NOBREAK'};
+ # LB8a Do not break after a zero width joiner
+ # ZWJ ×
+ for my $i (0 .. @lb_table - 1) {
+ $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_NOBREAK'};
+ }
# LB8 Break before any character following a zero-width space, even if one
# or more spaces intervene.
# algorithm stops at the earliest matching rule
my @wb_table;
- my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
- die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
+ my $table_size = @wb_short_enums;
# Otherwise, break everywhere (including around ideographs).
# WB99 Any ÷ Any
# WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
# WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
- # × # ExtendNumLet
+ # × ExtendNumLet
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'ExtendNumLet'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
# WB10 Numeric × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'Numeric'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
# WB9 (ALetter | Hebrew_Letter) × Numeric
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Numeric'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
# × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+ $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'XPG_LE'}]
+ += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+ $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'XPG_LE'}]
+ += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+ $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'XPG_LE'}]
+ += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
# | Single_Quote) (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'MidNumLet'}]
+ += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'MidLetter'}]
+ += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Single_Quote'}]
+ += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
# WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'ALetter'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Hebrew_Letter'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
# Ignore Format and Extend characters, except after sot, CR, LF, and
# Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
$wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
}
+ # Keep horizontal whitespace together
+ # Use perl's tailoring instead
+ # WB3d WSegSpace × WSegSpace
+ #$wb_table[$wb_enums{'WSegSpace'}][$wb_enums{'WSegSpace'}]
+ # = $wb_actions{'WB_NOBREAK'};
+
# Do not break within emoji zwj sequences.
# WB3c ZWJ × ( Glue_After_Zwj | EBG )
$wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'XPG_XX'}]
+ = $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
- # Break before and after white space
+ # Break before and after newlines
# WB3b ÷ (Newline | CR | LF)
# WB3a (Newline | CR | LF) ÷
# et. al.
\@wb_table, \@wb_short_enums, \%wb_abbreviations);
}
+sub sanitize_name ($) {
+ # Change the non-word characters in the input string to standardized word
+ # equivalents
+ #
+ my $sanitized = shift;
+ $sanitized =~ s/=/__/;
+ $sanitized =~ s/&/_AMP_/;
+ $sanitized =~ s/\./_DOT_/;
+ $sanitized =~ s/-/_MINUS_/;
+ $sanitized =~ s!/!_SLASH_!;
+
+ return $sanitized;
+}
+
+switch_pound_if ('ALL', 'PERL_IN_REGCOMP_C');
+
output_invlist("Latin1", [ 0, 256 ]);
output_invlist("AboveLatin1", [ 256 ]);
# An initial & means to use the subroutine from this file instead of an
# official inversion list.
-for my $charset (get_supported_code_pages()) {
- print $out_fh "\n" . get_conditional_compile_line_start($charset);
-
- @a2n = @{get_a2n($charset)};
- # Below is the list of property names to generate. '&' means to use the
- # subroutine to generate the inversion list instead of the generic code
- # below. Some properties have a comma-separated list after the name,
- # These are extra enums to add to those found in the Unicode tables.
- no warnings 'qw';
- # Ignore non-alpha in sort
- for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
- Assigned
- ASCII
- Cased
- VertSpace
- XPerlSpace
- XPosixAlnum
- XPosixAlpha
- XPosixBlank
- XPosixCntrl
- XPosixDigit
- XPosixGraph
- XPosixLower
- XPosixPrint
- XPosixPunct
- XPosixSpace
- XPosixUpper
- XPosixWord
- XPosixXDigit
- _Perl_Any_Folds
- &NonL1_Perl_Non_Final_Folds
- _Perl_Folds_To_Multi_Char
- &UpperLatin1
- _Perl_IDStart
- _Perl_IDCont
- _Perl_Charname_Begin
- _Perl_Charname_Continue
- _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
- _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
- _Perl_SB,SContinue,CR,Extend,LF,EDGE
- _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
- _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
- Lowercase_Mapping
- Titlecase_Mapping
- Uppercase_Mapping
- Simple_Case_Folding
- Case_Folding
- &_Perl_IVCF
- )
- # NOTE that the convention is that extra enum
- # values come after the property name, separated by
- # commas, with the enums that aren't ever defined
- # by Unicode coming last, at least 4 all-uppercase
- # characters. The others are enum names that are
- # needed by perl, but aren't in all Unicode
- # releases.
- ) {
-
- # For the Latin1 properties, we change to use the eXtended version of the
- # base property, then go through the result and get rid of everything not
- # in Latin1 (above 255). Actually, we retain the element for the range
- # that crosses the 255/256 boundary if it is one that matches the
- # property. For example, in the Word property, there is a range of code
- # points that start at U+00F8 and goes through U+02C1. Instead of
- # artificially cutting that off at 256 because 256 is the first code point
- # above Latin1, we let the range go to its natural ending. That gives us
- # extra information with no added space taken. But if the range that
- # crosses the boundary is one that doesn't match the property, we don't
- # start a new range above 255, as that could be construed as going to
- # infinity. For example, the Upper property doesn't include the character
- # at 255, but does include the one at 256. We don't include the 256 one.
- my $prop_name = $prop;
- my $is_local_sub = $prop_name =~ s/^&//;
- my $extra_enums = "";
- $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
- my $lookup_prop = $prop_name;
- my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
- or $lookup_prop =~ s/^L1//);
- my $nonl1_only = 0;
- $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
- ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
+# Below is the list of property names to generate. '&' means to use the
+# subroutine to generate the inversion list instead of the generic code
+# below. Some properties have a comma-separated list after the name,
+# These are extra enums to add to those found in the Unicode tables.
+no warnings 'qw';
+ # Ignore non-alpha in sort
+my @props;
+push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
+ &UpperLatin1
+ _Perl_GCB,EDGE,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,XPG_XX
+ _Perl_LB,EDGE,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner
+ _Perl_SB,EDGE,SContinue,CR,Extend,LF
+ _Perl_WB,Perl_Tailored_HSpace,EDGE,UNKNOWN,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,XPG_XX,XPG_LE
+ _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
+ Lowercase_Mapping
+ Titlecase_Mapping
+ Uppercase_Mapping
+ Simple_Case_Folding
+ Case_Folding
+ &_Perl_IVCF
+ &_Perl_CCC_non0_non230
+ );
+ # NOTE that the convention is that extra enum values come
+ # after the property name, separated by commas, with the enums
+ # that aren't ever defined by Unicode coming last, at least 4
+ # all-uppercase characters. The others are enum names that
+ # are needed by perl, but aren't in all Unicode releases.
+
+my @bin_props;
+my @perl_prop_synonyms;
+my %enums;
+my @deprecated_messages = ""; # Element [0] is a placeholder
+my %deprecated_tags;
+
+my $float_e_format = qr/ ^ -? \d \. \d+ e [-+] \d+ $ /x;
+
+# Create another hash that maps floating point x.yyEzz representation to what
+# %stricter_to_file_of does for the equivalent rational. A typical entry in
+# the latter hash is
+#
+# 'nv=1/2' => 'Nv/1_2',
+#
+# From that, this loop creates an entry
+#
+# 'nv=5.00e-01' => 'Nv/1_2',
+#
+# %stricter_to_file_of contains far more than just the rationals. Instead we
+# use %Unicode::UCD::nv_floating_to_rational which should have an entry for each
+# nv in the former hash.
+my %floating_to_file_of;
+foreach my $key (keys %Unicode::UCD::nv_floating_to_rational) {
+ my $value = $Unicode::UCD::nv_floating_to_rational{$key};
+ $floating_to_file_of{$key} = $Unicode::UCD::stricter_to_file_of{"nv=$value"};
+}
+
+# Properties that are specified with a prop=value syntax
+my @equals_properties;
+
+# Collect all the binary properties from data in lib/unicore
+# Sort so that complements come after the main table, and the shortest
+# names first, finally alphabetically. Also, sort together the tables we want
+# to be kept together, and prefer those with 'posix' in their names, which is
+# what the C code is expecting their names to be.
+foreach my $property (sort
+ { exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
+ or $b =~ /posix/i <=> $a =~ /posix/i
+ or $b =~ /perl/i <=> $a =~ /perl/i
+ or $a =~ $float_e_format <=> $b =~ $float_e_format
+ or $a =~ /!/ <=> $b =~ /!/
+ or length $a <=> length $b
+ or $a cmp $b
+ } keys %Unicode::UCD::loose_to_file_of,
+ keys %Unicode::UCD::stricter_to_file_of,
+ keys %floating_to_file_of
+) {
+
+ # These two hashes map properties to values that can be considered to
+ # be checksums. If two properties have the same checksum, they have
+ # identical entries. Otherwise they differ in some way.
+ my $tag = $Unicode::UCD::loose_to_file_of{$property};
+ $tag = $Unicode::UCD::stricter_to_file_of{$property} unless defined $tag;
+ $tag = $floating_to_file_of{$property} unless defined $tag;
+
+ # The tag may contain an '!' meaning it is identical to the one formed
+ # by removing the !, except that it is inverted.
+ my $inverted = $tag =~ s/!//;
+
+ # This hash is lacking the property name
+ $property = "nv=$property" if $property =~ $float_e_format;
+
+ # The list of 'prop=value' entries that this single entry expands to
+ my @this_entries;
+
+ # Split 'property=value' on the equals sign, with $lhs being the whole
+ # thing if there is no '='
+ my ($lhs, $rhs) = $property =~ / ( [^=]* ) ( =? .*) /x;
+
+ # $lhs then becomes the property name.
+ my $prop_value = $rhs =~ s/ ^ = //rx;
+
+ push @equals_properties, $lhs if $prop_value ne "";
+
+ # See if there are any synonyms for this property.
+ if (exists $prop_name_aliases{$lhs}) {
+
+ # If so, do the combinatorics so that a new entry is added for
+ # each legal property combined with the property value (which is
+ # $rhs)
+ foreach my $alias (@{$prop_name_aliases{$lhs}}) {
+
+ # But, there are some ambiguities, like 'script' is a synonym
+ # for 'sc', and 'sc' can stand alone, meaning something
+ # entirely different than 'script'. 'script' cannot stand
+ # alone. Don't add if the potential new lhs is in the hash of
+ # stand-alone properties.
+ no warnings 'once';
+ next if $rhs eq "" && grep { $alias eq $_ }
+ keys %Unicode::UCD::loose_property_to_file_of;
+
+ my $new_entry = $alias . $rhs;
+ push @this_entries, $new_entry;
+ }
+ }
+
+ # Above, we added the synonyms for the base entry we're now
+ # processing. But we haven't dealt with it yet. If we already have a
+ # property with the identical characteristics, this becomes just a
+ # synonym for it.
+
+ if (exists $enums{$tag}) {
+ push @this_entries, $property;
+ }
+ else { # Otherwise, create a new entry.
+
+ # Add to the list of properties to generate inversion lists for.
+ push @bin_props, uc $property;
+
+ # Create a rule for the parser
+ if (! exists $keywords{$property}) {
+ $keywords{$property} = token_name($property);
+ }
+
+ # And create an enum for it.
+ $enums{$tag} = $table_name_prefix . uc sanitize_name($property);
+
+ $perl_tags{$tag} = 1 if exists $keep_together{lc $property};
+
+ # Some properties are deprecated. This hash tells us so, and the
+ # warning message to raise if they are used.
+ if (exists $Unicode::UCD::why_deprecated{$tag}) {
+ $deprecated_tags{$enums{$tag}} = scalar @deprecated_messages;
+ push @deprecated_messages, $Unicode::UCD::why_deprecated{$tag};
+ }
+
+ # Our sort above should have made sure that we see the
+ # non-inverted version first, but this makes sure.
+ warn "$property is inverted!!!" if $inverted;
+ }
+
+ # Everything else is #defined to be the base enum, inversion is
+ # indicated by negating the value.
+ my $defined_to = "";
+ $defined_to .= "-" if $inverted;
+ $defined_to .= $enums{$tag};
+
+ # Go through the entries that evaluate to this.
+ @this_entries = uniques @this_entries;
+ foreach my $define (@this_entries) {
+
+ # There is a rule for the parser for each.
+ $keywords{$define} = $defined_to;
+
+ # And a #define for all simple names equivalent to a perl property,
+ # except those that begin with 'is' or 'in';
+ if (exists $perl_tags{$tag} && $property !~ / ^ i[ns] | = /x) {
+ push @perl_prop_synonyms, "#define "
+ . $table_name_prefix
+ . uc(sanitize_name($define))
+ . " $defined_to";
+ }
+ }
+}
+
+@bin_props = sort { exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
+ or $a cmp $b
+ } @bin_props;
+@perl_prop_synonyms = sort(uniques(@perl_prop_synonyms));
+push @props, @bin_props;
+
+foreach my $prop (@props) {
+
+ # For the Latin1 properties, we change to use the eXtended version of the
+ # base property, then go through the result and get rid of everything not
+ # in Latin1 (above 255). Actually, we retain the element for the range
+ # that crosses the 255/256 boundary if it is one that matches the
+ # property. For example, in the Word property, there is a range of code
+ # points that start at U+00F8 and goes through U+02C1. Instead of
+ # artificially cutting that off at 256 because 256 is the first code point
+ # above Latin1, we let the range go to its natural ending. That gives us
+ # extra information with no added space taken. But if the range that
+ # crosses the boundary is one that doesn't match the property, we don't
+ # start a new range above 255, as that could be construed as going to
+ # infinity. For example, the Upper property doesn't include the character
+ # at 255, but does include the one at 256. We don't include the 256 one.
+ my $prop_name = $prop;
+ my $is_local_sub = $prop_name =~ s/^&//;
+ my $extra_enums = "";
+ $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
+ my $lookup_prop = $prop_name;
+ $prop_name = sanitize_name($prop_name);
+ $prop_name = $table_name_prefix . $prop_name if grep { lc $lookup_prop eq lc $_ } @bin_props;
+ my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
+ or $lookup_prop =~ s/^L1//);
+ my $nonl1_only = 0;
+ $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
+ ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
+
+ for my $charset (get_supported_code_pages()) {
+ @a2n = @{get_a2n($charset)};
my @invlist;
my @invmap;
- my $map_format;
+ my $map_format = 0;;
my $map_default;
- my $maps_to_code_point;
- my $to_adjust;
+ my $maps_to_code_point = 0;
+ my $to_adjust = 0;
+ my $same_in_all_code_pages;
if ($is_local_sub) {
my @return = eval $lookup_prop;
die $@ if $@;
if (defined $count) {
# Short-circuit an empty inversion list.
output_invlist($prop_name, \@invlist, $charset);
- next;
+ last;
}
die "Could not find inversion list for '$lookup_prop'"
}
@invmap = @$map_ref;
$map_format = $format;
$map_default = $default;
- $maps_to_code_point = $map_format =~ / a ($ | [^r] ) /x;
- $to_adjust = $map_format =~ /a/;
}
}
}
+ if ($map_format) {
+ $maps_to_code_point = $map_format =~ / a ($ | [^r] ) /x;
+ $to_adjust = $map_format =~ /a/;
+ }
+
# Re-order the Unicode code points to native ones for this platform.
# This is only needed for code points below 256, because native code
# points are only in that range. For inversion maps of properties
# 255 because a re-ordering could cause 256 to need to be in the same
# range as 255.)
if ( (@invmap && $maps_to_code_point)
- || ( ($invlist[0] < 256
+ || ( @invlist
+ && $invlist[0] < 256
&& ( $invlist[0] != 0
- || (scalar @invlist != 1 && $invlist[1] < 256)))))
+ || (scalar @invlist != 1 && $invlist[1] < 256))))
{
+ $same_in_all_code_pages = 0;
if (! @invmap) { # Straight inversion list
# Look at all the ranges that start before 257.
my @latin1_list;
if (ref $invmap[0]) {
$bucket = join "\cK", map { a2n($_) } @{$invmap[0]};
}
- elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
+ elsif ( $maps_to_code_point
+ && $invmap[0] =~ $integer_or_float_re)
+ {
# Do convert to native for maps to single code points.
# There are some properties that have a few outlier
# maps that aren't code points, so the above test
- # skips those.
- $bucket = a2n($invmap[0]);
+ # skips those. 0 is never remapped.
+ $bucket = $invmap[0] == 0 ? 0 : a2n($invmap[0]);
} else {
$bucket = $invmap[0];
}
# Skip any non-numeric maps: these are outliers
# that aren't code points.
- && $base_map =~ $numeric_re
+ && $base_map =~ $integer_or_float_re
# 'ne' because the default can be a string
&& $base_map ne $map_default)
for my $i (0 .. @new_invlist - 1) {
next if $i > 0
&& $new_invlist[$i-1] + 1 == $new_invlist[$i]
- && $xlated{$new_invlist[$i-1]} =~ $numeric_re
- && $xlated{$new_invlist[$i]} =~ $numeric_re
- && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
+ && $xlated{$new_invlist[$i-1]}
+ =~ $integer_or_float_re
+ && $xlated{$new_invlist[$i]}
+ =~ $integer_or_float_re
+ && $xlated{$new_invlist[$i-1]} + 1
+ == $xlated{$new_invlist[$i]};
push @temp, $new_invlist[$i];
}
@new_invlist = @temp;
unshift @invlist, @new_invlist;
}
}
+ elsif (@invmap) { # inversion maps can't cope with this variable
+ # being true, even if it could be true
+ $same_in_all_code_pages = 0;
+ }
+ else {
+ $same_in_all_code_pages = 1;
+ }
# prop_invmap() returns an extra final entry, which we can now
# discard.
$found_nonl1 = 1;
last;
}
- die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
+ if (! $found_nonl1) {
+ warn "No non-Latin1 code points in $prop_name";
+ output_invlist($prop_name, []);
+ last;
+ }
}
- output_invlist($prop_name, \@invlist, $charset);
- output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
+ switch_pound_if ($prop_name, 'PERL_IN_REGCOMP_C');
+ start_charset_pound_if($charset, 1) unless $same_in_all_code_pages;
+
+ output_invlist($prop_name, \@invlist, ($same_in_all_code_pages)
+ ? $applies_to_all_charsets_text
+ : $charset);
+
+ if (@invmap) {
+ output_invmap($prop_name, \@invmap, $lookup_prop, $map_format,
+ $map_default, $extra_enums, $charset);
+ }
+
+ last if $same_in_all_code_pages;
+ end_charset_pound_if;
}
- end_file_pound_if;
- print $out_fh "\n" . get_conditional_compile_line_end();
}
+switch_pound_if ('binary_property_tables', 'PERL_IN_REGCOMP_C');
+
+print $out_fh "\nconst char * const deprecated_property_msgs[] = {\n\t";
+print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages;
+print $out_fh "\n};\n";
+
+my @enums = sort values %enums;
+
+# Save a copy of these before modification
+my @invlist_names = map { "${_}_invlist" } @enums;
+
+# Post-process the enums for deprecated properties.
+if (scalar keys %deprecated_tags) {
+ my $seen_deprecated = 0;
+ foreach my $enum (@enums) {
+ if (grep { $_ eq $enum } keys %deprecated_tags) {
+
+ # Change the enum name for this deprecated property to a
+ # munged one to act as a placeholder in the typedef. Then
+ # make the real name be a #define whose value is such that
+ # its modulus with the number of enums yields the index into
+ # the table occupied by the placeholder. And so that dividing
+ # the #define value by the table length gives an index into
+ # the table of deprecation messages for the corresponding
+ # warning.
+ my $revised_enum = "${enum}_perl_aux";
+ if (! $seen_deprecated) {
+ $seen_deprecated = 1;
+ print $out_fh "\n";
+ }
+ print $out_fh "#define $enum ($revised_enum + (MAX_UNI_KEYWORD_INDEX * $deprecated_tags{$enum}))\n";
+ $enum = $revised_enum;
+ }
+ }
+}
+
+print $out_fh "\ntypedef enum {\n\tPERL_BIN_PLACEHOLDER = 0, /* So no real value is zero */\n\t";
+print $out_fh join ",\n\t", @enums;
+print $out_fh "\n";
+print $out_fh "} binary_invlist_enum;\n";
+print $out_fh "\n#define MAX_UNI_KEYWORD_INDEX $enums[-1]\n";
+
+output_table_header($out_fh, "UV *", "uni_prop_ptrs");
+print $out_fh "\tNULL,\t/* Placeholder */\n";
+print $out_fh "\t";
+print $out_fh join ",\n\t", @invlist_names;
+print $out_fh "\n";
+
+output_table_trailer();
+
+print $out_fh join "\n", "\n",
+ #'# ifdef DOINIT',
+ #"\n",
+ "/* Synonyms for perl properties */",
+ @perl_prop_synonyms,
+ #"\n",
+ #"# endif /* DOINIT */",
+ "\n";
+
+switch_pound_if ('Valid property_values', 'PERL_IN_REGCOMP_C');
+
+# Each entry is a pointer to a table of property values for some property.
+# (Other properties may share this table. The next two data structures allow
+# this sharing to be implemented.)
+my @values_tables = "NULL /* Placeholder so zero index is an error */";
+
+# Keys are all the values of a property, strung together. The value of each
+# key is its index in @values_tables. This is because many properties have
+# the same values, and this allows the data to appear just once.
+my %joined_values;
+
+# #defines for indices into @values_tables, so can have synonyms resolved by
+# the C compiler.
+my @values_indices;
+
+# Go through each property which is specifiable by \p{prop=value}, and create
+# a hash with the keys being the canonicalized short property names, and the
+# values for each property being all possible values that it can take on.
+# Both the full value and its short, canonicalized into lc, sans punctuation
+# version are included.
+my %all_values;
+for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) }
+ uniques @equals_properties)
+{
+ # Get and canonicalize the short name for this property.
+ my ($short_name) = prop_aliases($property);
+ $short_name = lc $short_name;
+ $short_name =~ s/[ _-]//g;
+
+ # Now look at each value this property can take on
+ foreach my $value (prop_values($short_name)) {
+
+ # And for each value, look at each synonym for it
+ foreach my $alias (prop_value_aliases($short_name, $value)) {
+
+ # Add each synonym
+ push @{$all_values{$short_name}}, $alias;
+
+ # As well as its canonicalized name. khw made the decision to not
+ # support the grandfathered L_ Gc property value
+ $alias = lc $alias;
+ $alias =~ s/[ _-]//g unless $alias =~ $numeric_re;
+ push @{$all_values{$short_name}}, $alias;
+ }
+ }
+}
+
+# Also include the old style block names, using the recipe given in
+# Unicode::UCD
+foreach my $block (prop_values('block')) {
+ push @{$all_values{'blk'}}, charblock((prop_invlist("block=$block"))[0]);
+}
+
+# Now create output tables for each property in @equals_properties (the keys
+# in %all_values) each containing that property's possible values as computed
+# just above.
+PROPERTY:
+for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b)
+ or $a cmp $b } keys %all_values)
+{
+ @{$all_values{$property}} = uniques(@{$all_values{$property}});
+
+ # String together the values for this property, sorted. This string forms
+ # a list definition, with each value as an entry in it, indented on a new
+ # line. The sorting is used to find properties that take on the exact
+ # same values to share this string.
+ my $joined = "\t\"";
+ $joined .= join "\",\n\t\"",
+ sort { ($a =~ $numeric_re && $b =~ $numeric_re)
+ ? eval $a <=> eval $b
+ : prop_name_for_cmp($a) cmp prop_name_for_cmp($b)
+ or $a cmp $b
+ } @{$all_values{$property}};
+ # And add a trailing marker
+ $joined .= "\",\n\tNULL\n";
+
+ my $table_name = $table_name_prefix . $property . "_values";
+ my $index_name = "${table_name}_index";
+
+ # Add a rule for the parser that is just an empty value. It will need to
+ # know to look up empty things in the prop_value_ptrs[] table.
+
+ $keywords{"$property="} = $index_name;
+ if (exists $prop_name_aliases{$property}) {
+ foreach my $alias (@{$prop_name_aliases{$property}}) {
+ $keywords{"$alias="} = $index_name;
+ }
+ }
+
+ # Also create rules for the synonyms of this property to point to the same
+ # thing
+
+ # If this property's values are the same as one we've already computed,
+ # use that instead of creating a duplicate. But we add a #define to point
+ # to the proper one.
+ if (exists $joined_values{$joined}) {
+ push @values_indices, "#define $index_name $joined_values{$joined}\n";
+ next PROPERTY;
+ }
+
+ # And this property, now known to have unique values from any other seen
+ # so far is about to be pushed onto @values_tables. Its index is the
+ # current count.
+ push @values_indices, "#define $index_name "
+ . scalar @values_tables . "\n";
+ $joined_values{$joined} = $index_name;
+ push @values_tables, $table_name;
+
+ # Create the table for this set of values.
+ output_table_header($out_fh, "char *", $table_name);
+ print $out_fh $joined;
+ output_table_trailer();
+} # End of loop through the properties, and their values
+
+# We have completely determined the table of the unique property values
+output_table_header($out_fh, "char * const *",
+ "${table_name_prefix}prop_value_ptrs");
+print $out_fh join ",\n", @values_tables;
+print $out_fh "\n";
+output_table_trailer();
+
+# And the #defines for the indices in it
+print $out_fh "\n\n", join "", @values_indices;
+
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
output_GCB_table();
end_file_pound_if;
+print $out_fh <<"EOF";
+
+/* More than one code point may have the same code point as their fold. This
+ * gives the maximum number in the current Unicode release. (The folded-to
+ * code point is not included in this count.) For example, both 'S' and
+ * \\x{17F} fold to 's', so the number for that fold is 2. Another way to
+ * look at it is the maximum length of all the IVCF_AUX_TABLE's */
+#define MAX_FOLD_FROMS $max_fold_froms
+EOF
+
my $sources_list = "lib/unicore/mktables.lst";
my @sources = qw(regen/mk_invlists.pl
lib/unicore/mktables
lib/Unicode/UCD.pm
regen/charset_translations.pl
+ regen/mk_PL_charclass.pl
);
{
# Depend on mktables’ own sources. It’s a shorter list of files than
}
read_only_bottom_close_and_rename($out_fh, \@sources);
+
+my %name_to_index;
+for my $i (0 .. @enums - 1) {
+ my $loose_name = $enums[$i] =~ s/^$table_name_prefix//r;
+ $loose_name = lc $loose_name;
+ $loose_name =~ s/__/=/;
+ $loose_name =~ s/_dot_/./;
+ $loose_name =~ s/_slash_/\//g;
+ $name_to_index{$loose_name} = $i + 1;
+}
+# unsanitize, exclude &, maybe add these before sanitize
+for my $i (0 .. @perl_prop_synonyms - 1) {
+ my $loose_name_pair = $perl_prop_synonyms[$i] =~ s/#\s*define\s*//r;
+ $loose_name_pair =~ s/\b$table_name_prefix//g;
+ $loose_name_pair = lc $loose_name_pair;
+ $loose_name_pair =~ s/__/=/g;
+ $loose_name_pair =~ s/_dot_/./g;
+ $loose_name_pair =~ s/_slash_/\//g;
+ my ($synonym, $primary) = split / +/, $loose_name_pair;
+ $name_to_index{$synonym} = $name_to_index{$primary};
+}
+
+my $uni_pl = open_new('lib/unicore/uni_keywords.pl', '>',
+ {style => '*', by => 'regen/mk_invlists.pl',
+ from => "Unicode::UCD"});
+{
+ print $uni_pl "\%Unicode::UCD::uni_prop_ptrs_indices = (\n";
+ for my $name (sort keys %name_to_index) {
+ print $uni_pl " '$name' => $name_to_index{$name},\n";
+ }
+ print $uni_pl ");\n\n1;\n";
+}
+
+read_only_bottom_close_and_rename($uni_pl, \@sources);
+
+require './regen/mph.pl';
+
+sub token_name
+{
+ my $name = sanitize_name(shift);
+ warn "$name contains non-word" if $name =~ /\W/;
+
+ return "$table_name_prefix\U$name"
+}
+
+my $keywords_fh = open_new('uni_keywords.h', '>',
+ {style => '*', by => 'regen/mk_invlists.pl',
+ from => "mph.pl"});
+
+no warnings 'once';
+print $keywords_fh <<"EOF";
+/* The precision to use in "%.*e" formats */
+#define PL_E_FORMAT_PRECISION $Unicode::UCD::e_precision
+
+EOF
+
+my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows) = MinimalPerfectHash::make_mph_from_hash(\%keywords);
+print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1, $length_all_keys, $smart_blob, $rows, undef, undef, undef, 'match_uniprop' );
+
+push @sources, 'regen/mph.pl';
+read_only_bottom_close_and_rename($keywords_fh, \@sources);