require './regen/regen_lib.pl';
require './regen/charset_translations.pl';
require './lib/unicore/Heavy.pl';
+use re "/aa";
# This program outputs charclass_invlists.h, which contains various inversion
# lists in the form of C arrays that are to be used as-is for inversion lists.
# out-of-sync, or the wrong data structure being passed. Currently that
# random number is:
-# charclass_invlists.h now also has a partial implementation of inversion
-# maps; enough to generate tables for the line break properties, such as GCB
-
my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
+# charclass_invlists.h now also contains inversion maps and enum definitions
+# for those maps that have a finite number of possible values
+
# integer or float
-my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
+# The optional fractional part is a non-capturing group, so nothing but an
+# optional sign, digits, and an optional ".digits" suffix can match.
+my $numeric_re = qr/ ^ -? \d+ (?: \. \d+ )? $ /x;
+
+# More than one code point may have the same code point as their fold. This
+# gives the maximum number in the current Unicode release. (The folded-to
+# code point is not included in this count.) Most folds are pairs of code
+# points, like 'B' and 'b', so this number is at least one.
+my $max_fold_froms = 1;
my %keywords;
-my $table_name_prefix = "PL_";
+my $table_name_prefix = "UNI_";
# Matches valid C language enum names: begins with ASCII alphabetic, then any
# ASCII \w
);
my %perl_tags; # So can find synonyms of the above properties
+my $unused_table_hdr = 'u'; # Heading for row or column for unused values
+
sub uniques {
# Returns non-duplicated input values. From "Perl Best Practices:
# Encapsulated Cleverness". p. 455 in first edition.
}
foreach my $element (@new_pound_if) {
+
+ # regcomp.c is arranged so that the tables are not compiled in
+ # re_comp.c.  Remember whether this element names regcomp.c before
+ # $element is rewritten below.  (The declaration is unconditional
+ # because a 'my' combined with a statement modifier has undefined
+ # behavior.)
+ my $no_xsub = $element =~ / PERL_IN_ (?: REGCOMP ) _C /x;
$element = "defined($element)";
+ $element = "($element && ! defined(PERL_IN_XSUB_RE))" if $no_xsub;
}
$new_pound_if = join " || ", @new_pound_if;
print $out_fh "\n" . get_conditional_compile_line_start(shift, shift);
}
+{ # Closure
+ # State shared by output_table_header() and output_table_trailer(): the
+ # handle the header was written to, and whether that header opened a
+ # DOINIT-conditional section that the trailer must close.
+ my $fh;
+ my $in_doinit = 0;
+
+ sub output_table_header($$$;$@) {
+
+ # Output to $fh the heading for a table given by the other inputs.
+ # The handle is remembered so that output_table_trailer() writes to
+ # the same place.
+
+ $fh = shift;
+ my ($type, # typedef of table, like UV, UV*
+ $name, # name of table
+ $comment, # Optional comment to put on header line
+ @sizes # Optional sizes of each array index. If omitted,
+ # there is a single index whose size is computed by
+ # the C compiler.
+ ) = @_;
+
+ # Remove trailing white space so the pointer test below sees the '*'
+ $type =~ s/ \s+ $ //x;
+
+ # If the typedef is a pointer, add in an extra const, so that the
+ # pointer itself is constant as well as what it points to
+ $type .= " const" if $type =~ / \* $ /x;
+
+ # Turn a non-empty comment into a C comment for the header line
+ $comment = "" unless defined $comment;
+ $comment = " /* $comment */" if $comment;
+
+ # One bracketed size per dimension, or '[]' to let the C compiler
+ # compute the single dimension
+ my $array_declaration;
+ if (@sizes) {
+ $array_declaration = "";
+ $array_declaration .= "[$_]" for @sizes;
+ }
+ else {
+ $array_declaration = '[]';
+ }
+
+ my $declaration = "$type ${name}$array_declaration";
+
+ # Things not matching this are static. Otherwise, it is an external
+ # constant, initialized only under DOINIT.
+ #
+ # (Currently everything is static: the alternation in the pattern
+ # is empty, so under /x it can match only the literal 'PERL_IN__C'.
+ # $in_file_pound_if is not declared in this block; it is presumably
+ # a file-scoped variable maintained by switch_pound_if() -- confirm.)
+ if ($in_file_pound_if !~ / PERL_IN_ (?: ) _C /x) {
+ $in_doinit = 0;
+ print $fh "\nstatic const $declaration = {$comment\n";
+ }
+ else {
+ $in_doinit = 1;
+ print $fh <<EOF;
+
+# ifndef DOINIT
+
+EXTCONST $declaration;
+
+# else
+
+EXTCONST $declaration = {$comment
+EOF
+ }
+ }
+
+ sub output_table_trailer() {
+
+ # Close out a table started by output_table_header(). Writes to the
+ # handle saved by that call, and also ends the DOINIT conditional if
+ # the header began one.
+
+ print $fh "};\n";
+ if ($in_doinit) {
+ print $fh "\n# endif /* DOINIT */\n\n";
+ $in_doinit = 0;
+ }
+ }
+} # End closure
+
+
sub output_invlist ($$;$) {
my $name = shift;
my $invlist = shift; # Reference to inversion list array
unshift @$invlist, 0;
$zero_or_one = 1;
}
- my $count = @$invlist;
- print $out_fh "\nstatic const UV ${name}_invlist[] = {";
- print $out_fh " /* for $charset */" if $charset;
- print $out_fh "\n";
+ $charset = "for $charset" if $charset;
+ output_table_header($out_fh, "UV", "${name}_invlist", $charset);
- print $out_fh "\t$count,\t/* Number of elements */\n";
- print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
- print $out_fh "\t", $zero_or_one,
- ",\t/* 0 if the list starts at 0;",
- "\n\t\t 1 if it starts at the element beyond 0 */\n";
+ my $count = @$invlist;
+ print $out_fh <<EOF;
+\t$count,\t/* Number of elements */
+\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */
+\t$zero_or_one,\t/* 0 if the list starts at 0;
+\t\t 1 if it starts at the element beyond 0 */
+EOF
# The main body are the UVs passed in to this routine. Do the final
# element separately
print $out_fh "\n";
}
- print $out_fh "};\n";
+ output_table_trailer();
}
sub output_invmap ($$$$$$$) {
}
# The internal enums come last, and in the order specified.
+ #
+ # The internal one named EDGE is also used as a marker. Any ones that
+ # come after it are used in the algorithms below, and so must be
+ # defined, even if the release of Unicode this is being compiled for
+ # doesn't use them. But since no code points are assigned to them in
+ # such a release, those values will never be accessed. We collapse
+ # all of them into a single placeholder row and a column. The
+ # algorithms below will fill in those cells with essentially garbage,
+ # but they are never read, so it doesn't matter. This allows the
+ # algorithm to remain the same from release to release.
+ #
+ # In one case, regexec.c also uses a placeholder which must be defined
+ # here, and we put it in the unused row and column as its value is
+ # never read.
+ #
my @enums = @input_enums;
my @extras;
+ my @unused_enums;
+ my $unused_enum_value = @enums;
if ($extra_enums ne "") {
@extras = split /,/, $extra_enums;
+ my $seen_EDGE = 0;
# Don't add if already there.
foreach my $this_extra (@extras) {
next if grep { $_ eq $this_extra } @enums;
-
- push @enums, $this_extra;
+ if ($this_extra eq 'EDGE') {
+ push @enums, $this_extra;
+ $seen_EDGE = 1;
+ }
+ elsif ($seen_EDGE) {
+ push @unused_enums, $this_extra;
+ }
+ else {
+ push @enums, $this_extra;
+ }
}
+
+ @unused_enums = sort @unused_enums;
+ $unused_enum_value = @enums; # All unused have the same value,
+ # one beyond the final used one
}
# Assign a value to each element of the enum type we are creating.
# all the tables
my $type = lc $prop_name;
- my $placeholder = "a";
-
# Skip if we've already done this code, which populated
# this hash
if (eval "! \%${type}_enums") {
#
# First are those enums that are not part of the
# property, but are defined by this code. By
- # convention these have all-caps names of at least
- # 4 characters. We use the lowercased name for
- # thse.
+ # convention these have all-caps names. We use
+ # the lowercased name for these.
#
- # Second are enums that are needed to get
- # regexec.c to compile, but don't exist in all
- # Unicode releases. To get here, we must be
- # compiling an earlier Unicode release that
- # doesn't have that enum, so just use a unique
- # anonymous name for it.
+ # Second are enums that are needed to get the
+ # algorithms below to work and/or to get regexec.c
+ # to compile, but don't exist in all Unicode
+ # releases. These are handled outside this loop
+ # as 'unused_enums'
if (grep { $_ eq $enum } @input_enums) {
$short = $enum
}
- elsif ($enum !~ / ^ [A-Z]{4,} $ /x) {
- $short = $placeholder++;
- }
else {
$short = lc $enum;
}
eval "\$${type}_short_enums[$value] = '$short'";
die $@ if $@;
}
+
+ # Each unused enum has the same value. They all are collapsed
+ # into one row and one column, named $unused_table_hdr.
+ if (@unused_enums) {
+ eval "\$${type}_short_enums['$unused_enum_value'] = '$unused_table_hdr'";
+ die $@ if $@;
+
+ foreach my $enum (@unused_enums) {
+ eval "\$${type}_enums{$enum} = $unused_enum_value";
+ die $@ if $@;
+ }
+ }
}
}
my $name = $enum_list[$i];
push @enum_definition, "\t${name_prefix}$name = $i";
}
+ if (@unused_enums) {
+ foreach my $unused (@unused_enums) {
+ push @enum_definition,
+ ",\n\t${name_prefix}$unused = $unused_enum_value";
+ }
+ }
# For an 'l' property, we need extra enums, because some of the
# elements are lists. Each such distinct list is placed in its own
# Output each aux table.
foreach my $table_number (@sorted_table_list) {
my $table = $inverted_mults{$table_number};
- print $out_fh "\nstatic const $aux_declaration_type $name_prefix$aux_table_prefix$table_number\[] = {\n";
+ output_table_header($out_fh,
+ $aux_declaration_type,
+ "$name_prefix$aux_table_prefix$table_number");
# Earlier, we joined the elements of this table together with a comma
my @elements = split ",", $table;
print $out_fh "\t${name_prefix}$elements[$i]";
}
}
- print $out_fh "\n};\n";
+
+ print $out_fh "\n";
+ output_table_trailer();
}
# Output the table that is indexed by the absolute value of the
# aux table enum and contains pointers to the tables output just
# above
- print $out_fh "\nstatic const $aux_declaration_type * const ${name_prefix}${aux_table_prefix}ptrs\[] = {\n";
+ output_table_header($out_fh, "$aux_declaration_type *",
+ "${name_prefix}${aux_table_prefix}ptrs");
print $out_fh "\tNULL,\t/* Placeholder */\n";
for my $i (1 .. @sorted_table_list) {
print $out_fh ",\n" if $i > 1;
print $out_fh "\t$name_prefix$aux_table_prefix$i";
}
- print $out_fh "\n};\n";
+ print $out_fh "\n";
+ output_table_trailer();
print $out_fh
"\n/* Parallel table to the above, giving the number of elements"
. " in each table\n * pointed to */\n";
- print $out_fh "static const U8 ${name_prefix}${aux_table_prefix}lengths\[] = {\n";
+ output_table_header($out_fh, "U8",
+ "${name_prefix}${aux_table_prefix}lengths");
print $out_fh "\t0,\t/* Placeholder */\n";
for my $i (1 .. @sorted_table_list) {
print $out_fh ",\n" if $i > 1;
print $out_fh "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */";
}
- print $out_fh "\n};\n";
+ print $out_fh "\n";
+ output_table_trailer();
} # End of outputting the auxiliary and associated tables
# The scx property used in regexec.c needs a specialized table which
. " code point for that\n * script; 0 if the script has multiple"
. " digit sequences. Scripts without a\n * digit sequence use"
. " ASCII [0-9], hence are marked '0' */\n";
- print $out_fh "static const UV script_zeros[] = {\n";
+ output_table_header($out_fh, "UV", "script_zeros");
for my $i (0 .. @script_zeros - 1) {
my $code_point = $script_zeros[$i];
if (defined $code_point) {
print $out_fh "\t/* $enum_list[$i] */";
print $out_fh "\n";
}
- print $out_fh "};\n";
+ output_table_trailer();
} # End of special handling of scx
}
else {
&& $count;
# Now output the inversion map proper
- print $out_fh "\nstatic const $invmap_declaration_type ${name}_invmap[] = {";
- print $out_fh " /* for $charset */" if $charset;
- print $out_fh "\n";
+ $charset = "for $charset" if $charset;
+ output_table_header($out_fh, $invmap_declaration_type,
+ "${name}_invmap",
+ $charset);
# The main body are the scalars passed in to this routine.
for my $i (0 .. $count - 1) {
print $out_fh "," if $i < $count - 1;
print $out_fh "\n";
}
- print $out_fh "};\n";
+ output_table_trailer();
}
sub mk_invlist_from_sorted_cp_list {
# other. This situation happens in Unicode 3.0.1, but probably no
# other version.
foreach my $fold (keys %new) {
- my $folds_to_string = $fold =~ /\D/a;
+ my $folds_to_string = $fold =~ /\D/;
# If the bucket contains only one element, convert from an array to a
# scalar
# Now we have a hash that is the inversion of the case fold property.
- # Convert it to an inversion map.
+ # First find the maximum number of code points that fold to the same one.
+ foreach my $fold_to (keys %new) {
+ if (ref $new{$fold_to}) {
+ my $folders_count = scalar @{$new{$fold_to}};
+ $max_fold_froms = $folders_count if $folders_count > $max_fold_froms;
+ }
+ }
+ # Then convert the hash to an inversion map.
my @sorted_folds = sort { $a <=> $b } keys %new;
my (@invlist, @invmap);
# nor run into an adjacent column
my @spacers;
- # If we are being compiled on a Unicode version earlier than that which
- # this file was designed for, it may be that some of the property values
- # aren't in the current release, and so would be undefined if we didn't
- # define them ourselves. Earlier code has done this, making them
- # lowercase characters of length one. We look to see if any exist, so
- # that we can add an annotation to the output table
- my $has_placeholder = 0;
+ # Is there a row and column for unused values in this release?
+ my $has_unused = $names_ref->[$size-1] eq $unused_table_hdr;
for my $i (0 .. $size - 1) {
no warnings 'numeric';
- $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
$spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
}
- print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
+ output_table_header($out_fh, $table_type, "${property}_table", undef, $size, $size);
# Calculate the column heading line
my $header_line = "/* "
$header_line .= " */\n";
# If we have annotations, output it now.
- if ($has_placeholder || scalar %$abbreviations_ref) {
+ if ($has_unused || scalar %$abbreviations_ref) {
my $text = "";
foreach my $abbr (sort keys %$abbreviations_ref) {
$text .= "; " if $text;
$text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
}
- if ($has_placeholder) {
- $text .= "; other " if $text;
- $text .= "lowercase names are placeholders for"
- . " property values not defined until a later Unicode"
- . " release, so are irrelevant in this one, as they are"
- . " not assigned to any code points";
+ if ($has_unused) {
+ # Separate this note from any preceding abbreviation notes, but
+ # don't begin the annotation with a dangling separator when there
+ # are none.
+ $text .= "; " if $text;
+ $text .= "'$unused_table_hdr' stands for 'unused in this Unicode"
+ . " release' (and the data in the row or column are garbage)";
}
my $indent = " " x 3;
print $out_fh "\n";
}
- print $out_fh "};\n";
+ output_table_trailer();
}
sub output_GCB_table() {
GCB_BREAKABLE => 1,
GCB_RI_then_RI => 2, # Rules 12 and 13
GCB_EX_then_EM => 3, # Rule 10
+ GCB_Maybe_Emoji_NonBreak => 4,
);
# The table is constructed in reverse order of the rules, to make the
$gcb_table[$gcb_enums{'Regional_Indicator'}]
[$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI};
+ # Post 11.0: GB11 \p{Extended_Pictographic} Extend* ZWJ
+ # × \p{Extended_Pictographic}
+ $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'XPG_XX'}] =
+ $gcb_actions{GCB_Maybe_Emoji_NonBreak};
+
+ # This and the rule GB10 are obsolete starting with Unicode 11.0, but can
+ # be left in, as there are no code points that match, so the code won't
+ # ever get executed.
# Do not break within emoji modifier sequences or emoji zwj sequences.
- # GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
+ # Pre 11.0: GB11 ZWJ × ( Glue_After_Zwj | E_Base_GAZ )
$gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0;
$gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0;
}
}
- # LB8a Do not break between a zero width joiner and an ideograph, emoji
- # base or emoji modifier. This rule prevents breaks within emoji joiner
- # sequences.
- # ZWJ × (ID | EB | EM)
- $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
- = $lb_actions{'LB_NOBREAK'};
- $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
- = $lb_actions{'LB_NOBREAK'};
- $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
- = $lb_actions{'LB_NOBREAK'};
+ # LB8a Do not break after a zero width joiner
+ # ZWJ ×
+ for my $i (0 .. @lb_table - 1) {
+ $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_NOBREAK'};
+ }
# LB8 Break before any character following a zero-width space, even if one
# or more spaces intervene.
# algorithm stops at the earliest matching rule
my @wb_table;
- my $table_size = @wb_short_enums - 1; # -1 because we don't use UNKNOWN
- die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
+ my $table_size = @wb_short_enums;
# Otherwise, break everywhere (including around ideographs).
# WB99 Any ÷ Any
# WB13b ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
# WB13a (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
- # × # ExtendNumLet
+ # × ExtendNumLet
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'ExtendNumLet'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
# WB10 Numeric × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'Numeric'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
# WB9 (ALetter | Hebrew_Letter) × Numeric
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Numeric'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
= $wb_actions{'WB_NOBREAK'};
# × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+ $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'XPG_LE'}]
+ += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+ $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'XPG_LE'}]
+ += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+ $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'XPG_LE'}]
+ += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
$wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
+= $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
# | Single_Quote) (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'MidNumLet'}]
+ += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'MidLetter'}]
+ += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Single_Quote'}]
+ += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
+= $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
# WB5 (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'ALetter'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Hebrew_Letter'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
# Ignore Format and Extend characters, except after sot, CR, LF, and
# Newline. This also has the effect of: Any × (Format | Extend | ZWJ)
$wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
}
+ # Keep horizontal whitespace together
+ # Use perl's tailoring instead
+ # WB3d WSegSpace × WSegSpace
+ #$wb_table[$wb_enums{'WSegSpace'}][$wb_enums{'WSegSpace'}]
+ # = $wb_actions{'WB_NOBREAK'};
+
# Do not break within emoji zwj sequences.
# WB3c ZWJ × ( Glue_After_Zwj | EBG )
$wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
= $wb_actions{'WB_NOBREAK'};
$wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
= $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'XPG_XX'}]
+ = $wb_actions{'WB_NOBREAK'};
+ $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'XPG_LE'}]
+ = $wb_actions{'WB_NOBREAK'};
- # Break before and after white space
+ # Break before and after newlines
# WB3b ÷ (Newline | CR | LF)
# WB3a (Newline | CR | LF) ÷
# et. al.
return $sanitized;
}
-switch_pound_if ('ALL', 'PERL_IN_UTF8_C');
+switch_pound_if ('ALL', 'PERL_IN_REGCOMP_C');
output_invlist("Latin1", [ 0, 256 ]);
output_invlist("AboveLatin1", [ 256 ]);
push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
&NonL1_Perl_Non_Final_Folds
&UpperLatin1
- _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
- _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
- _Perl_SB,SContinue,CR,Extend,LF,EDGE
- _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
+ _Perl_GCB,EDGE,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,XPG_XX
+ _Perl_LB,EDGE,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner
+ _Perl_SB,EDGE,SContinue,CR,Extend,LF
+ _Perl_WB,Perl_Tailored_HSpace,EDGE,UNKNOWN,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,XPG_XX,XPG_LE
_Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
Lowercase_Mapping
Titlecase_Mapping
}
}
- switch_pound_if ($prop_name, 'PERL_IN_UTF8_C');
+ switch_pound_if ($prop_name, 'PERL_IN_REGCOMP_C');
start_charset_pound_if($charset, 1) unless $same_in_all_code_pages;
output_invlist($prop_name, \@invlist, ($same_in_all_code_pages)
}
}
-switch_pound_if ('binary_property_tables', 'PERL_IN_UTF8_C');
+switch_pound_if ('binary_property_tables', 'PERL_IN_REGCOMP_C');
print $out_fh "\nconst char * deprecated_property_msgs[] = {\n\t";
print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages;
print $out_fh "} binary_invlist_enum;\n";
print $out_fh "\n#define MAX_UNI_KEYWORD_INDEX $enums[-1]\n";
-print $out_fh "\n/* Synonyms for perl properties */\n";
-print $out_fh join "\n", @perl_prop_synonyms, "\n";
-
-print $out_fh "\nstatic const UV * const PL_uni_prop_ptrs\[] = {\n";
+output_table_header($out_fh, "UV *", "uni_prop_ptrs");
print $out_fh "\tNULL,\t/* Placeholder */\n\t";
+print $out_fh "\t";
print $out_fh join ",\n\t", @invlist_names;
print $out_fh "\n";
-print $out_fh "};\n";
+
+output_table_trailer();
+
+print $out_fh join "\n", "\n",
+ #'# ifdef DOINIT',
+ #"\n",
+ "/* Synonyms for perl properties */",
+ @perl_prop_synonyms,
+ #"\n",
+ #"# endif /* DOINIT */",
+ "\n";
switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
end_file_pound_if;
+print $out_fh <<"EOF";
+
+/* More than one code point may have the same code point as their fold. This
+ * gives the maximum number in the current Unicode release. (The folded-to
+ * code point is not included in this count.) For example, both 'S' and
+ * \\x{17F} fold to 's', so the number for that fold is 2. Another way to
+ * look at it is the maximum length of all the IVCF_AUX_TABLE's */
+#define MAX_FOLD_FROMS $max_fold_froms
+EOF
+
my $sources_list = "lib/unicore/mktables.lst";
my @sources = qw(regen/mk_invlists.pl
lib/unicore/mktables
sub token_name
{
+ # Returns the token for the property name passed as the only argument:
+ # the name is first run through sanitize_name() (defined elsewhere in
+ # this file), then uppercased and prefixed with $table_name_prefix.
+ # Warns, but still returns a value, if the sanitized name contains a
+ # character that doesn't match \w.
my $name = sanitize_name(shift);
-warn "$name contains non-word" if $name =~ /\W/a;
+# No /a modifier is needed here; 'use re "/aa"' is in effect for the file
+warn "$name contains non-word" if $name =~ /\W/;
return "$table_name_prefix\U$name"
}
no warnings 'once';
print $keywords_fh <<"EOF";
-/* The precisionn to use in "%.*e" formats */
+/* The precision to use in "%.*e" formats */
#define PL_E_FORMAT_PRECISION $utf8::e_precision
EOF