-for my $charset (get_supported_code_pages()) {
- print $out_fh "\n" . get_conditional_compile_line_start($charset);
-
- @a2n = @{get_a2n($charset)};
- # Below is the list of property names to generate. '&' means to use the
- # subroutine to generate the inversion list instead of the generic code
- # below. Some properties have a comma-separated list after the name,
- # These are extra enums to add to those found in the Unicode tables.
- no warnings 'qw';
- # Ignore non-alpha in sort
- for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
- Assigned
- ASCII
- Cased
- VertSpace
- XPerlSpace
- XPosixAlnum
- XPosixAlpha
- XPosixBlank
- XPosixCntrl
- XPosixDigit
- XPosixGraph
- XPosixLower
- XPosixPrint
- XPosixPunct
- XPosixSpace
- XPosixUpper
- XPosixWord
- XPosixXDigit
- _Perl_Any_Folds
- &NonL1_Perl_Non_Final_Folds
- _Perl_Folds_To_Multi_Char
- &UpperLatin1
- _Perl_IDStart
- _Perl_IDCont
- _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
- _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
- _Perl_SB,SContinue,CR,Extend,LF,EDGE
- _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
- _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
- Lowercase_Mapping
- Titlecase_Mapping
- Uppercase_Mapping
- Simple_Case_Folding
- Case_Folding
- )
- # NOTE that the convention is that extra enum
- # values come after the property name, separated by
- # commas, with the enums that aren't ever defined
- # by Unicode coming last, at least 4 all-uppercase
- # characters. The others are enum names that are
- # needed by perl, but aren't in all Unicode
- # releases.
- ) {
-
- # For the Latin1 properties, we change to use the eXtended version of the
- # base property, then go through the result and get rid of everything not
- # in Latin1 (above 255). Actually, we retain the element for the range
- # that crosses the 255/256 boundary if it is one that matches the
- # property. For example, in the Word property, there is a range of code
- # points that start at U+00F8 and goes through U+02C1. Instead of
- # artificially cutting that off at 256 because 256 is the first code point
- # above Latin1, we let the range go to its natural ending. That gives us
- # extra information with no added space taken. But if the range that
- # crosses the boundary is one that doesn't match the property, we don't
- # start a new range above 255, as that could be construed as going to
- # infinity. For example, the Upper property doesn't include the character
- # at 255, but does include the one at 256. We don't include the 256 one.
- my $prop_name = $prop;
- my $is_local_sub = $prop_name =~ s/^&//;
- my $extra_enums = "";
- $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
- my $lookup_prop = $prop_name;
- my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
- or $lookup_prop =~ s/^L1//);
- my $nonl1_only = 0;
- $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
- ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
+# Below is the list of property names to generate. A leading '&' means to use
+# the subroutine to generate the inversion list instead of the generic code
+# below. Some properties have a comma-separated list after the name; these
+# are extra enums to add to those found in the Unicode tables.
+no warnings 'qw';
+ # Ignore non-alpha in sort
+my @props;
+push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
+ &NonL1_Perl_Non_Final_Folds
+ &UpperLatin1
+ _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
+ _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
+ _Perl_SB,SContinue,CR,Extend,LF,EDGE
+ _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
+ _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
+ Lowercase_Mapping
+ Titlecase_Mapping
+ Uppercase_Mapping
+ Simple_Case_Folding
+ Case_Folding
+ &_Perl_IVCF
+ );
+ # NOTE: by convention, extra enum values come after the property
+ # name, separated by commas. The enums that aren't ever defined by
+ # Unicode come last, and are spelled with at least 4 all-uppercase
+ # characters. The others are enum names that are needed by perl,
+ # but aren't in all Unicode releases.
+
+my @bin_props;
+my @perl_prop_synonyms;
+my %enums;
+my @deprecated_messages = ""; # Element [0] is a placeholder
+my %deprecated_tags;
+
+my $float_e_format = qr/ ^ -? \d \. \d+ e [-+] \d+ $ /x;
+
+# Create another hash that maps floating point x.yyEzz representation to what
+# %stricter_to_file_of does for the equivalent rational. A typical entry in
+# the latter hash is
+#
+# 'nv=1/2' => 'Nv/1_2',
+#
+# From that, this loop creates an entry keyed by just the floating point value
+#
+# '5.00e-01' => 'Nv/1_2',
+#
+# (the 'nv=' is prepended later). %stricter_to_file_of contains far more than
+# just the rationals, so instead of looping over it, we loop over
+# %utf8::nv_floating_to_rational, which should have an entry for each nv.
+my %floating_to_file_of;
+foreach my $key (keys %utf8::nv_floating_to_rational) {
+ my $value = $utf8::nv_floating_to_rational{$key};
+ $floating_to_file_of{$key} = $utf8::stricter_to_file_of{"nv=$value"};
+}
+
+# Collect all the binary properties from data in lib/unicore.
+# Sort so that complements come after the main table, with the shortest
+# names first, and finally alphabetically. Also, sort together the tables we
+# want to be kept together, and prefer those with 'posix' in their names,
+# which is what the C code is expecting their names to be.
+foreach my $property (sort
+ { exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
+ or $b =~ /posix/i <=> $a =~ /posix/i
+ or $b =~ /perl/i <=> $a =~ /perl/i
+ or $a =~ $float_e_format <=> $b =~ $float_e_format
+ or $a =~ /!/ <=> $b =~ /!/
+ or length $a <=> length $b
+ or $a cmp $b
+ } keys %utf8::loose_to_file_of,
+ keys %utf8::stricter_to_file_of,
+ keys %floating_to_file_of
+) {
+
+ # These hashes map properties to values that can be considered to be
+ # checksums. If two properties have the same checksum, they have
+ # identical entries. Otherwise they differ in some way.
+ my $tag = $utf8::loose_to_file_of{$property};
+ $tag = $utf8::stricter_to_file_of{$property} unless defined $tag;
+ $tag = $floating_to_file_of{$property} unless defined $tag;
+
+ # The tag may contain an '!' meaning it is identical to the one formed
+ # by removing the !, except that it is inverted.
+ my $inverted = $tag =~ s/!//;
+
+ # The keys of %floating_to_file_of lack the property name, so add it
+ $property = "nv=$property" if $property =~ $float_e_format;
+
+ # The list of 'prop=value' entries that this single entry expands to
+ my @this_entries;
+
+ # Split 'property=value' on the equals sign, with $lhs being the whole
+ # thing if there is no '='
+ my ($lhs, $rhs) = $property =~ / ( [^=]* ) ( =? .*) /x;
+
+ # $lhs then becomes the property name. See if there are any synonyms
+ # for this property.
+ if (exists $prop_name_aliases{$lhs}) {
+
+ # If so, do the combinatorics so that a new entry is added for
+ # each legal property combined with the property value (which is
+ # $rhs)
+ foreach my $alias (@{$prop_name_aliases{$lhs}}) {
+
+ # But, there are some ambiguities, like 'script' is a synonym
+ # for 'sc', and 'sc' can stand alone, meaning something
+ # entirely different than 'script'. 'script' cannot stand
+ # alone. Don't add if the potential new lhs is in the hash of
+ # stand-alone properties.
+ no warnings 'once';
+ next if $rhs eq "" && grep { $alias eq $_ }
+ keys %utf8::loose_property_to_file_of;
+
+ my $new_entry = $alias . $rhs;
+ push @this_entries, $new_entry;
+ }
+ }
+
+ # Above, we added the synonyms for the base entry we're now
+ # processing. But we haven't dealt with it yet. If we already have a
+ # property with the identical characteristics, this becomes just a
+ # synonym for it.
+ if (exists $enums{$tag}) {
+ push @this_entries, $property;
+ }
+ else { # Otherwise, create a new entry.
+
+ # Add to the list of properties to generate inversion lists for.
+ push @bin_props, uc $property;
+
+ # Create a rule for the parser
+ if (! exists $keywords{$property}) {
+ $keywords{$property} = token_name($property);
+ }
+
+ # And create an enum for it.
+ $enums{$tag} = $table_name_prefix . uc sanitize_name($property);
+
+ $perl_tags{$tag} = 1 if exists $keep_together{lc $property};
+
+ # Some properties are deprecated. This hash tells us so, and the
+ # warning message to raise if they are used.
+ if (exists $utf8::why_deprecated{$tag}) {
+ $deprecated_tags{$enums{$tag}} = scalar @deprecated_messages;
+ push @deprecated_messages, $utf8::why_deprecated{$tag};
+ }
+
+ # Our sort above should have made sure that we see the
+ # non-inverted version first, but this makes sure.
+ warn "$property is inverted!!!" if $inverted;
+ }
+
+ # Everything else is #defined to be the base enum; inversion is
+ # indicated by negating the value.
+ my $defined_to = "";
+ $defined_to .= "-" if $inverted;
+ $defined_to .= $enums{$tag};
+
+ # Go through the entries that evaluate to this.
+ @this_entries = uniques @this_entries;
+ foreach my $define (@this_entries) {
+
+ # There is a rule for the parser for each.
+ $keywords{$define} = $defined_to;
+
+ # And a #define for all simple names (no '=') equivalent to a perl
+ # property, except those that begin with 'is' or 'in'.
+ if (exists $perl_tags{$tag} && $property !~ / ^ i[ns] | = /x) {
+ push @perl_prop_synonyms, "#define "
+ . $table_name_prefix
+ . uc(sanitize_name($define))
+ . " $defined_to";
+ }
+ }
+}
+
+@bin_props = sort { exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
+ or $a cmp $b
+ } @bin_props;
+@perl_prop_synonyms = sort(uniques(@perl_prop_synonyms));
+push @props, @bin_props;
+
+foreach my $prop (@props) {
+
+ # For the Latin1 properties, we change to use the eXtended version of the
+ # base property, then go through the result and get rid of everything not
+ # in Latin1 (above 255). Actually, we retain the element for the range
+ # that crosses the 255/256 boundary if it is one that matches the
+ # property. For example, in the Word property, there is a range of code
+ # points that start at U+00F8 and goes through U+02C1. Instead of
+ # artificially cutting that off at 256 because 256 is the first code point
+ # above Latin1, we let the range go to its natural ending. That gives us
+ # extra information with no added space taken. But if the range that
+ # crosses the boundary is one that doesn't match the property, we don't
+ # start a new range above 255, as that could be construed as going to
+ # infinity. For example, the Upper property doesn't include the character
+ # at 255, but does include the one at 256. We don't include the 256 one.
+ my $prop_name = $prop;
+ my $is_local_sub = $prop_name =~ s/^&//;
+ my $extra_enums = "";
+ $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
+ my $lookup_prop = $prop_name;
+ $prop_name = sanitize_name($prop_name);
+ $prop_name = $table_name_prefix . $prop_name if grep { lc $lookup_prop eq lc $_ } @bin_props;
+ my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
+ or $lookup_prop =~ s/^L1//);
+ my $nonl1_only = 0;
+ $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
+ ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
+
+ for my $charset (get_supported_code_pages()) {
+ @a2n = @{get_a2n($charset)};