Upgrade podlators from version 4.10 to 4.11

[perl5.git] / regen / mk_invlists.pl
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl

index c9e05a8..fbec6c7 100644 (file)
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -8,10 +8,12 @@ use Unicode::UCD qw(prop_aliases
                      prop_invlist
                      prop_invmap search_invlist
                      charprop
+                    num
                     );
  require './regen/regen_lib.pl';
  require './regen/charset_translations.pl';
  require './lib/unicore/Heavy.pl';
+use re "/aa";
  
  # This program outputs charclass_invlists.h, which contains various inversion
  # lists in the form of C arrays that are to be used as-is for inversion lists.
@@ -24,15 +26,15 @@ require './lib/unicore/Heavy.pl';
  # out-of-sync, or the wrong data structure being passed.  Currently that
  # random number is:
  
-# charclass_invlists.h now also has a partial implementation of inversion
-# maps; enough to generate tables for the line break properties, such as GCB
+# charclass_invlists.h now also contains inversion maps and enum definitions
+# for those maps that have a finite number of possible values
  
  my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
  
  # integer or float
-my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
+my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /x;
  
-my @keywords;
+my %keywords;
  my $table_name_prefix = "PL_";
  
  # Matches valid C language enum names: begins with ASCII alphabetic, then any
@@ -51,7 +53,7 @@ print $out_fh "/* See the generating file for comments */\n\n";
  
  # enums that should be made public
  my %public_enums = (
-                    #_Perl_SCX => 1
+                    _Perl_SCX => 1
                      );
  
  # The symbols generated by this program are all currently defined only in a
@@ -87,11 +89,19 @@ foreach my $name (sort keys %utf8::loose_property_name_of) {
  }
  
  # Output these tables in the same vicinity as each other, so that will get
-# paged in at about the same time
+# paged in at about the same time.  These are also assumed to be the exact
+# same list as those properties used internally by perl.
  my %keep_together = (
                          assigned => 1,
                          ascii => 1,
+                        upper => 1,
+                        lower => 1,
+                        title => 1,
                          cased => 1,
+                        uppercaseletter => 1,
+                        lowercaseletter => 1,
+                        titlecaseletter => 1,
+                        casedletter => 1,
                          vertspace => 1,
                          xposixalnum => 1,
                          xposixalpha => 1,
@@ -129,6 +139,7 @@ my %keep_together = (
                          _perl_problematic_locale_folds => 1,
                          _perl_quotemeta => 1,
                      );
+my %perl_tags;  # So can find synonyms of the above properties
  
  sub uniques {
      # Returns non-duplicated input values.  From "Perl Best Practices:
@@ -158,9 +169,11 @@ sub end_charset_pound_if {
      print $out_fh "\n" . get_conditional_compile_line_end();
  }
  
-sub switch_pound_if ($$) {
+sub switch_pound_if ($$;$) {
      my $name = shift;
      my $new_pound_if = shift;
+    my $charset = shift;
+
      my @new_pound_if = ref ($new_pound_if)
                         ? sort @$new_pound_if
                         : $new_pound_if;
@@ -168,6 +181,10 @@ sub switch_pound_if ($$) {
      # Switch to new #if given by the 2nd argument.  If there is an override
      # for this, it instead switches to that.  The 1st argument is the
      # static's name, used only to check if there is an override for this
+    #
+    # The 'charset' parmameter, if present, is used to first end the charset
+    # #if if we actually do a switch, and then restart it afterwards.  This
+    # code, then assumes that the charset #if's are enclosed in the file ones.
  
      if (exists $exceptions_to_where_to_define{$name}) {
          @new_pound_if = $exceptions_to_where_to_define{$name};
@@ -181,6 +198,8 @@ sub switch_pound_if ($$) {
      # Change to the new one if different from old
      if ($in_file_pound_if ne $new_pound_if) {
  
+        end_charset_pound_if() if defined $charset;
+
          # Exit any current #if
          if ($in_file_pound_if) {
              end_file_pound_if;
@@ -188,6 +207,8 @@ sub switch_pound_if ($$) {
  
          $in_file_pound_if = $new_pound_if;
          print $out_fh "\n#if $in_file_pound_if\n";
+
+        start_charset_pound_if ($charset, 1) if defined $charset;
      }
  }
  
@@ -491,10 +512,14 @@ sub output_invmap ($$$$$$$) {
                         ? 'PERL_IN_UTF8_C'
                         : 'PERL_IN_REGEXEC_C';
  
-        end_charset_pound_if;
-        end_file_pound_if;
-        switch_pound_if($name, $where) unless exists $public_enums{$name};
-        start_charset_pound_if($charset, 1);
+        if (! exists $public_enums{$name}) {
+            switch_pound_if($name, $where, $charset);
+        }
+        else {
+            end_charset_pound_if;
+            end_file_pound_if;
+            start_charset_pound_if($charset, 1);
+        }
  
          # If the enum only contains one element, that is a dummy, default one
          if (scalar @enum_definition > 1) {
@@ -520,9 +545,7 @@ sub output_invmap ($$$$$$$) {
              print $out_fh "} $enum_declaration_type;\n";
          }
  
-        end_charset_pound_if;
-        switch_pound_if($name, $where);
-        start_charset_pound_if($charset, 1);
+        switch_pound_if($name, $where, $charset);
  
          $invmap_declaration_type = ($input_format =~ /s/)
                                   ? $enum_declaration_type
@@ -607,7 +630,7 @@ sub output_invmap ($$$$$$$) {
              # that.
              for (my $i = 0; $i < @decimals_invlist; $i += 2) {
                  my $code_point = $decimals_invlist[$i];
-                next if chr($code_point) !~ /\p{Nv=0}/;
+                next if num(chr($code_point)) ne '0';
  
                  # Turn the scripts this zero is in into a list.
                  my @scripts = split ",",
@@ -739,9 +762,9 @@ for my $i (0 .. @$folds_ref - 1) {
      # Add to the non-finals list each code point that is in a non-final
      # position
      for my $j (0 .. @{$folds_ref->[$i]} - 2) {
-        push @is_non_final_fold, $folds_ref->[$i][$j]
-                unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
+        push @is_non_final_fold, $folds_ref->[$i][$j];
      }
+    @is_non_final_fold = uniques @is_non_final_fold;
  }
  
  sub _Perl_Non_Final_Folds {
@@ -818,7 +841,7 @@ sub _Perl_IVCF {
      #    other.  This situation happens in Unicode 3.0.1, but probably no
      #    other version.
      foreach my $fold (keys %new) {
-        my $folds_to_string = $fold =~ /\D/a;
+        my $folds_to_string = $fold =~ /\D/;
  
          # If the bucket contains only one element, convert from an array to a
          # scalar
@@ -1016,7 +1039,7 @@ sub output_table_common {
  
      for my $i (0 .. $size - 1) {
          no warnings 'numeric';
-        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
+        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /x;
          $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
      }
  
@@ -2163,21 +2186,48 @@ push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
                  # are needed by perl, but aren't in all Unicode releases.
  
  my @bin_props;
-my @bin_prop_defines;
+my @perl_prop_synonyms;
  my %enums;
  my @deprecated_messages = "";   # Element [0] is a placeholder
  my %deprecated_tags;
  
+my $float_e_format = qr/ ^ -? \d \. \d+ e [-+] \d+ $ /x;
+
+# Create another hash that maps floating point x.yyEzz representation to what
+# %stricter_to_file_of does for the equivalent rational.  A typical entry in
+# the latter hash is
+#
+#    'nv=1/2' => 'Nv/1_2',
+#
+# From that, this loop creates an entry
+#
+#    'nv=5.00e-01' => 'Nv/1_2',
+#
+# %stricter_to_file_of contains far more than just the rationals.  Instead we
+# use %utf8::nv_floating_to_rational which should have an entry for each
+# nv in the former hash.
+my %floating_to_file_of;
+foreach my $key (keys %utf8::nv_floating_to_rational) {
+    my $value = $utf8::nv_floating_to_rational{$key};
+    $floating_to_file_of{$key} = $utf8::stricter_to_file_of{"nv=$value"};
+}
+
  # Collect all the binary properties from data in lib/unicore
  # Sort so that complements come after the main table, and the shortest
-# names first, finally alphabetically.
+# names first, finally alphabetically.  Also, sort together the tables we want
+# to be kept together, and prefer those with 'posix' in their names, which is
+# what the C code is expecting their names to be.
  foreach my $property (sort
          {   exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
+         or $b =~ /posix/i <=> $a =~ /posix/i
+         or $b =~ /perl/i <=> $a =~ /perl/i
+         or $a =~ $float_e_format <=> $b =~ $float_e_format
           or $a =~ /!/ <=> $b =~ /!/
           or length $a <=> length $b
           or $a cmp $b
          }   keys %utf8::loose_to_file_of,
-            keys %utf8::stricter_to_file_of
+            keys %utf8::stricter_to_file_of,
+            keys %floating_to_file_of
  ) {
  
      # These two hashes map properties to values that can be considered to
@@ -2185,11 +2235,15 @@ foreach my $property (sort
      # identical entries.  Otherwise they differ in some way.
      my $tag = $utf8::loose_to_file_of{$property};
      $tag = $utf8::stricter_to_file_of{$property} unless defined $tag;
+    $tag = $floating_to_file_of{$property} unless defined $tag;
  
      # The tag may contain an '!' meaning it is identical to the one formed
      # by removing the !, except that it is inverted.
      my $inverted = $tag =~ s/!//;
  
+    # This hash is lacking the property name
+    $property = "nv=$property" if $property =~ $float_e_format;
+
      # The list of 'prop=value' entries that this single entry expands to
      my @this_entries;
  
@@ -2216,8 +2270,7 @@ foreach my $property (sort
                                      keys %utf8::loose_property_to_file_of;
  
              my $new_entry = $alias . $rhs;
-            push @this_entries, $new_entry
-                            unless grep { $_ eq $new_entry } @this_entries;
+            push @this_entries, $new_entry;
          }
      }
  
@@ -2234,11 +2287,15 @@ foreach my $property (sort
          push @bin_props, uc $property;
  
          # Create a rule for the parser
-        push @keywords, $property unless grep { $property eq $_ } @keywords;
+        if (! exists $keywords{$property}) {
+            $keywords{$property} = token_name($property);
+        }
  
          # And create an enum for it.
          $enums{$tag} = $table_name_prefix . uc sanitize_name($property);
  
+        $perl_tags{$tag} = 1 if exists $keep_together{lc $property};
+
          # Some properties are deprecated.  This hash tells us so, and the
          # warning message to raise if they are used.
          if (exists $utf8::why_deprecated{$tag}) {
@@ -2258,23 +2315,27 @@ foreach my $property (sort
      $defined_to .= $enums{$tag};
  
      # Go through the entries that evaluate to this.
+    @this_entries = uniques @this_entries;
      foreach my $define (@this_entries) {
  
          # There is a rule for the parser for each.
-        push @keywords, $define unless grep { $define eq $_ } @keywords;
-
-        # And a #define for each to this.
-        push @bin_prop_defines, "#define "
-                                . $table_name_prefix
-                                . uc(sanitize_name($define))
-                                . "   $defined_to";
+        $keywords{$define} = $defined_to;
+
+        # And a #define for all simple names equivalent to a perl property,
+        # except those that begin with 'is' or 'in';
+        if (exists $perl_tags{$tag} && $property !~ / ^ i[ns] | = /x) {
+            push @perl_prop_synonyms, "#define "
+                                    . $table_name_prefix
+                                    . uc(sanitize_name($define))
+                                    . "   $defined_to";
+        }
      }
  }
  
  @bin_props = sort {  exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
                     or $a cmp $b
                    } @bin_props;
-@bin_prop_defines = sort @bin_prop_defines;
+@perl_prop_synonyms = sort(uniques(@perl_prop_synonyms));
  push @props, @bin_props;
  
  foreach my $prop (@props) {
@@ -2383,9 +2444,10 @@ foreach my $prop (@props) {
          # 255 because a re-ordering could cause 256 to need to be in the same
          # range as 255.)
          if (       (@invmap && $maps_to_code_point)
-            || (   ($invlist[0] < 256
+            || (    @invlist
+                &&  $invlist[0] < 256
                  && (    $invlist[0] != 0
-                    || (scalar @invlist != 1 && $invlist[1] < 256)))))
+                    || (scalar @invlist != 1 && $invlist[1] < 256))))
          {
              $same_in_all_code_pages = 0;
              if (! @invmap) {    # Straight inversion list
@@ -2591,6 +2653,10 @@ foreach my $prop (@props) {
                  unshift @invlist, @new_invlist;
              }
          }
+        elsif (@invmap) {   # inversion maps can't cope with this variable
+                            # being true, even if it could be true
+            $same_in_all_code_pages = 0;
+        }
          else {
              $same_in_all_code_pages = 1;
          }
@@ -2648,7 +2714,11 @@ foreach my $prop (@props) {
                  $found_nonl1 = 1;
                  last;
              }
-            die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
+            if (! $found_nonl1) {
+                warn "No non-Latin1 code points in $prop_name";
+                output_invlist($prop_name, []);
+                last;
+            }
          }
  
          switch_pound_if ($prop_name, 'PERL_IN_UTF8_C');
@@ -2674,10 +2744,6 @@ print $out_fh "\nconst char * deprecated_property_msgs[] = {\n\t";
  print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages;
  print $out_fh "\n};\n";
  
-switch_pound_if ('binary_property_tables', [ 'PERL_IN_UTF8_C',
-                                             'PERL_IN_UNI_KEYWORDS_C',
-                                            ]);
-
  my @enums = sort values %enums;
  
  # Save a copy of these before modification
@@ -2713,9 +2779,9 @@ print $out_fh join ",\n\t", @enums;
  print $out_fh "\n";
  print $out_fh "} binary_invlist_enum;\n";
  print $out_fh "\n#define MAX_UNI_KEYWORD_INDEX $enums[-1]\n";
-print $out_fh "\n", join "\n", @bin_prop_defines, "\n";
  
-switch_pound_if ('binary_property_index_table', 'PERL_IN_UTF8_C' );
+print $out_fh "\n/* Synonyms for perl properties */\n";
+print $out_fh join "\n", @perl_prop_synonyms, "\n";
  
  print $out_fh "\nstatic const UV * const PL_uni_prop_ptrs\[] = {\n";
  print $out_fh "\tNULL,\t/* Placeholder */\n\t";
@@ -2723,8 +2789,6 @@ print $out_fh join ",\n\t", @invlist_names;
  print $out_fh "\n";
  print $out_fh "};\n";
  
-end_file_pound_if;
-
  switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
  
  output_GCB_table();
@@ -2738,6 +2802,7 @@ my @sources = qw(regen/mk_invlists.pl
                   lib/unicore/mktables
                   lib/Unicode/UCD.pm
                   regen/charset_translations.pl
+                 regen/mk_PL_charclass.pl
                 );
  {
      # Depend on mktables’ own sources.  It’s a shorter list of files than
@@ -2758,49 +2823,29 @@ my @sources = qw(regen/mk_invlists.pl
  
  read_only_bottom_close_and_rename($out_fh, \@sources);
  
-use Devel::Tokenizer::C;
+require './regen/mph.pl';
  
  sub token_name
  {
      my $name = sanitize_name(shift);
-    warn "$name contains non-word" if $name =~ /\W/a;
+    warn "$name contains non-word" if $name =~ /\W/;
  
-    return "return $table_name_prefix\U$name;\n"
+    return "$table_name_prefix\U$name"
  }
  
-my $t = Devel::Tokenizer::C->new(TokenFunc => \&token_name,
-                                 StringLength  => 'len',
-                                 Strategy      => 'narrow',
-                                 TokenEnd      =>  undef,
-                                 UnknownCode   => 'return 0;',
-        );
-
-$t->add_tokens(lc $_) for @keywords;
-
-my $keywords_fh = open_new('uni_keywords.c', '>',
+my $keywords_fh = open_new('uni_keywords.h', '>',
                   {style => '*', by => 'regen/mk_invlists.pl',
-                  from => "Unicode::UCD"});
-
-print $keywords_fh <<EOF;
-
-#define PERL_IN_UNI_KEYWORDS_C
-
-#include "EXTERN.h"
-#include "perl.h"
-
-int
-Perl_uniprop_lookup(const char * tokstr, const Size_t len)
-{
+                  from => "mph.pl"});
  
-    PERL_ARGS_ASSERT_UNIPROP_LOOKUP;
+no warnings 'once';
+print $keywords_fh <<"EOF";
+/* The precisionn to use in "%.*e" formats */
+#define PL_E_FORMAT_PRECISION $utf8::e_precision
  
  EOF
  
-print $keywords_fh $t->generate;
-
-print $keywords_fh <<EOF;
-
-}
-EOF
+my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows) = MinimalPerfectHash::make_mph_from_hash(\%keywords);
+print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1, $length_all_keys, $smart_blob, $rows, undef, undef, undef, 'match_uniprop' );
  
+push @sources, 'regen/mph.pl';
  read_only_bottom_close_and_rename($keywords_fh, \@sources);