pat_advanced.t: Update test

[perl5.git] / regen / mk_invlists.pl
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl

index 7f0a4d2..8136f43 100644 (file)
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -8,9 +8,13 @@ use Unicode::UCD qw(prop_aliases
                      prop_invlist
                      prop_invmap search_invlist
                      charprop
+                    num
+                    charblock
                     );
  require './regen/regen_lib.pl';
  require './regen/charset_translations.pl';
+require './lib/unicore/UCD.pl';
+use re "/aa";
  
  # This program outputs charclass_invlists.h, which contains various inversion
  # lists in the form of C arrays that are to be used as-is for inversion lists.
@@ -23,13 +27,25 @@ require './regen/charset_translations.pl';
  # out-of-sync, or the wrong data structure being passed.  Currently that
  # random number is:
  
-# charclass_invlists.h now also has a partial implementation of inversion
-# maps; enough to generate tables for the line break properties, such as GCB
-
  my $VERSION_DATA_STRUCTURE_TYPE = 148565664;
  
-# integer or float
-my $numeric_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /ax;
+# charclass_invlists.h now also contains inversion maps and enum definitions
+# for those maps that have a finite number of possible values
+
+# integer or float (no exponent)
+my $integer_or_float_re = qr/ ^ -? \d+ (:? \. \d+ )? $ /x;
+
+# Also includes rationals
+my $numeric_re = qr! $integer_or_float_re | ^ -? \d+ / \d+ $ !x;
+
+# More than one code point may have the same code point as their fold.  This
+# gives the maximum number in the current Unicode release.  (The folded-to
+# code point is not included in this count.)  Most folds are pairs of code
+# points, like 'B' and 'b', so this number is at least one.
+my $max_fold_froms = 1;
+
+my %keywords;
+my $table_name_prefix = "UNI_";
  
  # Matches valid C language enum names: begins with ASCII alphabetic, then any
  # ASCII \w
@@ -47,7 +63,7 @@ print $out_fh "/* See the generating file for comments */\n\n";
  
  # enums that should be made public
  my %public_enums = (
-                    #_Perl_SCX => 1
+                    _Perl_SCX => 1
                      );
  
  # The symbols generated by this program are all currently defined only in a
@@ -57,8 +73,11 @@ my %exceptions_to_where_to_define =
                          (
                              #_Perl_IVCF => 'PERL_IN_REGCOMP_C',
                          );
+
  my %where_to_define_enums = ();
  
+my $applies_to_all_charsets_text = "all charsets";
+
  my %gcb_enums;
  my @gcb_short_enums;
  my %gcb_abbreviations;
@@ -71,6 +90,71 @@ my %wb_abbreviations;
  
  my @a2n;
  
+my %prop_name_aliases;
+# Invert this hash so that for each canonical name, we get a list of things
+# that map to it (excluding itself)
+foreach my $name (sort keys %Unicode::UCD::loose_property_name_of) {
+    my $canonical = $Unicode::UCD::loose_property_name_of{$name};
+    push @{$prop_name_aliases{$canonical}},  $name if $canonical ne $name;
+}
+
+# Output these tables in the same vicinity as each other, so that will get
+# paged in at about the same time.  These are also assumed to be the exact
+# same list as those properties used internally by perl.
+my %keep_together = (
+                        assigned => 1,
+                        ascii => 1,
+                        upper => 1,
+                        lower => 1,
+                        title => 1,
+                        cased => 1,
+                        uppercaseletter => 1,
+                        lowercaseletter => 1,
+                        titlecaseletter => 1,
+                        casedletter => 1,
+                        vertspace => 1,
+                        xposixalnum => 1,
+                        xposixalpha => 1,
+                        xposixblank => 1,
+                        xposixcntrl => 1,
+                        xposixdigit => 1,
+                        xposixgraph => 1,
+                        xposixlower => 1,
+                        xposixprint => 1,
+                        xposixpunct => 1,
+                        xposixspace => 1,
+                        xposixupper => 1,
+                        xposixword => 1,
+                        xposixxdigit => 1,
+                        posixalnum => 1,
+                        posixalpha => 1,
+                        posixblank => 1,
+                        posixcntrl => 1,
+                        posixdigit => 1,
+                        posixgraph => 1,
+                        posixlower => 1,
+                        posixprint => 1,
+                        posixpunct => 1,
+                        posixspace => 1,
+                        posixupper => 1,
+                        posixword => 1,
+                        posixxdigit => 1,
+                        _perl_any_folds => 1,
+                        _perl_folds_to_multi_char => 1,
+                        _perl_is_in_multi_char_fold => 1,
+                        _perl_non_final_folds => 1,
+                        _perl_idstart => 1,
+                        _perl_idcont => 1,
+                        _perl_charname_begin => 1,
+                        _perl_charname_continue => 1,
+                        _perl_problematic_locale_foldeds_start => 1,
+                        _perl_problematic_locale_folds => 1,
+                        _perl_quotemeta => 1,
+                    );
+my %perl_tags;  # So can find synonyms of the above properties
+
+my $unused_table_hdr = 'u';     # Heading for row or column for unused values
+
  sub uniques {
      # Returns non-duplicated input values.  From "Perl Best Practices:
      # Encapsulated Cleverness".  p. 455 in first edition.
@@ -84,7 +168,7 @@ sub a2n($) {
  
      # Returns the input Unicode code point translated to native.
  
-    return $cp if $cp !~ $numeric_re || $cp > 255;
+    return $cp if $cp !~ $integer_or_float_re || $cp > 255;
      return $a2n[$cp];
  }
  
@@ -95,9 +179,15 @@ sub end_file_pound_if {
      }
  }
  
-sub switch_pound_if ($$) {
+sub end_charset_pound_if {
+    print $out_fh "\n" . get_conditional_compile_line_end();
+}
+
+sub switch_pound_if ($$;$) {
      my $name = shift;
      my $new_pound_if = shift;
+    my $charset = shift;
+
      my @new_pound_if = ref ($new_pound_if)
                         ? sort @$new_pound_if
                         : $new_pound_if;
@@ -105,19 +195,30 @@ sub switch_pound_if ($$) {
      # Switch to new #if given by the 2nd argument.  If there is an override
      # for this, it instead switches to that.  The 1st argument is the
      # static's name, used only to check if there is an override for this
+    #
+    # The 'charset' parmameter, if present, is used to first end the charset
+    # #if if we actually do a switch, and then restart it afterwards.  This
+    # code, then assumes that the charset #if's are enclosed in the file ones.
  
      if (exists $exceptions_to_where_to_define{$name}) {
          @new_pound_if = $exceptions_to_where_to_define{$name};
      }
  
      foreach my $element (@new_pound_if) {
+
+        # regcomp.c is arranged so that the tables are not compiled in
+        # re_comp.c */
+        my $no_xsub = 1 if $element =~ / PERL_IN_ (?: REGCOMP ) _C /x;
          $element = "defined($element)";
+        $element = "($element && ! defined(PERL_IN_XSUB_RE))" if $no_xsub;
      }
      $new_pound_if = join " || ", @new_pound_if;
  
      # Change to the new one if different from old
      if ($in_file_pound_if ne $new_pound_if) {
  
+        end_charset_pound_if() if defined $charset;
+
          # Exit any current #if
          if ($in_file_pound_if) {
              end_file_pound_if;
@@ -125,9 +226,87 @@ sub switch_pound_if ($$) {
  
          $in_file_pound_if = $new_pound_if;
          print $out_fh "\n#if $in_file_pound_if\n";
+
+        start_charset_pound_if ($charset, 1) if defined $charset;
      }
  }
  
+sub start_charset_pound_if ($;$) {
+    print $out_fh "\n" . get_conditional_compile_line_start(shift, shift);
+}
+
+{   # Closure
+    my $fh;
+    my $in_doinit = 0;
+
+    sub output_table_header($$$;$@) {
+
+        # Output to $fh the heading for a table given by the other inputs
+
+        $fh = shift;
+        my ($type,      # typedef of table, like UV, UV*
+            $name,      # name of table
+            $comment,   # Optional comment to put on header line
+            @sizes      # Optional sizes of each array index.  If omitted,
+                        # there is a single index whose size is computed by
+                        # the C compiler.
+            ) = @_;
+
+        $type =~ s/ \s+ $ //x;
+
+        # If a the typedef is a ptr, add in an extra const
+        $type .= " const" if $type =~ / \* $ /x;
+
+        $comment = "" unless defined $comment;
+        $comment = "  /* $comment */" if $comment;
+
+        my $array_declaration;
+        if (@sizes) {
+            $array_declaration = "";
+            $array_declaration .= "[$_]" for @sizes;
+        }
+        else {
+            $array_declaration = '[]';
+        }
+
+        my $declaration = "$type ${name}$array_declaration";
+
+        # Things not matching this are static.  Otherwise, it is an external
+        # constant, initialized only under DOINIT.
+        #
+        # (Currently everything is static)
+        if ($in_file_pound_if !~ / PERL_IN_ (?: ) _C /x) {
+            $in_doinit = 0;
+            print $fh "\nstatic const $declaration = {$comment\n";
+        }
+        else {
+            $in_doinit = 1;
+            print $fh <<EOF;
+
+#    ifndef DOINIT
+
+EXTCONST $declaration;
+
+#    else
+
+EXTCONST $declaration = {$comment
+EOF
+        }
+    }
+
+    sub output_table_trailer() {
+
+        # Close out a table started by output_table_header()
+
+        print $fh "};\n";
+        if ($in_doinit) {
+            print $fh "\n#    endif  /* DOINIT */\n\n";
+            $in_doinit = 0;
+        }
+    }
+} # End closure
+
+
  sub output_invlist ($$;$) {
      my $name = shift;
      my $invlist = shift;     # Reference to inversion list array
@@ -145,19 +324,17 @@ sub output_invlist ($$;$) {
          unshift @$invlist, 0;
          $zero_or_one = 1;
      }
-    my $count = @$invlist;
  
-    switch_pound_if ($name, 'PERL_IN_UTF8_C');
+    $charset = "for $charset" if $charset;
+    output_table_header($out_fh, "UV", "${name}_invlist", $charset);
  
-    print $out_fh "\nstatic const UV ${name}_invlist[] = {";
-    print $out_fh " /* for $charset */" if $charset;
-    print $out_fh "\n";
-
-    print $out_fh "\t$count,\t/* Number of elements */\n";
-    print $out_fh "\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */\n";
-    print $out_fh "\t", $zero_or_one,
-                  ",\t/* 0 if the list starts at 0;",
-                  "\n\t\t   1 if it starts at the element beyond 0 */\n";
+    my $count = @$invlist;
+    print $out_fh <<EOF;
+\t$count,\t/* Number of elements */
+\t$VERSION_DATA_STRUCTURE_TYPE, /* Version and data structure type */
+\t$zero_or_one,\t/* 0 if the list starts at 0;
+\t\t   1 if it starts at the element beyond 0 */
+EOF
  
      # The main body are the UVs passed in to this routine.  Do the final
      # element separately
@@ -167,7 +344,7 @@ sub output_invlist ($$;$) {
          print $out_fh "\n";
      }
  
-    print $out_fh "};\n";
+    output_table_trailer();
  }
  
  sub output_invmap ($$$$$$$) {
@@ -220,17 +397,47 @@ sub output_invmap ($$$$$$$) {
          }
  
          # The internal enums come last, and in the order specified.
+        #
+        # The internal one named EDGE is also used a marker.  Any ones that
+        # come after it are used in the algorithms below, and so must be
+        # defined, even if the release of Unicode this is being compiled for
+        # doesn't use them.   But since no code points are assigned to them in
+        # such a release, those values will never be accessed.  We collapse
+        # all of them into a single placholder row and a column.  The
+        # algorithms below will fill in those cells with essentially garbage,
+        # but they are never read, so it doesn't matter.  This allows the
+        # algorithm to remain the same from release to release.
+        #
+        # In one case, regexec.c also uses a placeholder which must be defined
+        # here, and we put it in the unused row and column as its value is
+        # never read.
+        #
          my @enums = @input_enums;
          my @extras;
+        my @unused_enums;
+        my $unused_enum_value = @enums;
          if ($extra_enums ne "") {
              @extras = split /,/, $extra_enums;
+            my $seen_EDGE = 0;
  
              # Don't add if already there.
              foreach my $this_extra (@extras) {
                  next if grep { $_ eq $this_extra } @enums;
-
-                push @enums, $this_extra;
+                if ($this_extra eq 'EDGE') {
+                    push @enums, $this_extra;
+                    $seen_EDGE = 1;
+                }
+                elsif ($seen_EDGE) {
+                    push @unused_enums, $this_extra;
+                }
+                else {
+                    push @enums, $this_extra;
+                }
              }
+
+            @unused_enums = sort @unused_enums;
+            $unused_enum_value = @enums;    # All unused have the same value,
+                                            # one beyond the final used one
          }
  
          # Assign a value to each element of the enum type we are creating.
@@ -255,8 +462,6 @@ sub output_invmap ($$$$$$$) {
              # all the tables
              my $type = lc $prop_name;
  
-            my $placeholder = "a";
-
              # Skip if we've already done this code, which populated
              # this hash
              if (eval "! \%${type}_enums") {
@@ -287,22 +492,17 @@ sub output_invmap ($$$$$$$) {
                              #
                              # First are those enums that are not part of the
                              # property, but are defined by this code.  By
-                            # convention these have all-caps names of at least
-                            # 4 characters.  We use the lowercased name for
-                            # thse.
+                            # convention these have all-caps names.  We use
+                            # the lowercased name for these.
                              #
-                            # Second are enums that are needed to get
-                            # regexec.c to compile, but don't exist in all
-                            # Unicode releases.  To get here, we must be
-                            # compiling an earlier Unicode release that
-                            # doesn't have that enum, so just use a unique
-                            # anonymous name for it.
+                            # Second are enums that are needed to get the
+                            # algorithms below to work and/or to get regexec.c
+                            # to compile, but don't exist in all Unicode
+                            # releases.  These are handled outside this loop
+                            # as 'unused_enums'
                              if (grep { $_ eq $enum } @input_enums) {
                                  $short = $enum
                              }
-                            elsif ($enum !~ / ^ [A-Z]{4,} $ /x) {
-                                $short = $placeholder++;
-                            }
                              else {
                                  $short = lc $enum;
                              }
@@ -350,23 +550,19 @@ sub output_invmap ($$$$$$$) {
                      eval "\$${type}_short_enums[$value] = '$short'";
                      die $@ if $@;
                  }
-            }
-        }
  
-        # Inversion map stuff is used only by regexec or utf-8 (if it is
-        # for code points) , unless it is in the enum exception list
-        my $where = (exists $where_to_define_enums{$name})
-                    ? $where_to_define_enums{$name}
-                    : ($input_format =~ /a/)
-                       ? 'PERL_IN_UTF8_C'
-                       : 'PERL_IN_REGEXEC_C';
+                # Each unused enum has the same value.  They all are collapsed
+                # into one row and one column, named $unused_table_hdr.
+                if (@unused_enums) {
+                    eval "\$${type}_short_enums['$unused_enum_value'] = '$unused_table_hdr'";
+                    die $@ if $@;
  
-        my $is_public_enum = exists $public_enums{$name};
-        if ($is_public_enum) {
-            end_file_pound_if;
-        }
-        else {
-            switch_pound_if($name, $where);
+                    foreach my $enum (@unused_enums) {
+                        eval "\$${type}_enums{$enum} = $unused_enum_value";
+                        die $@ if $@;
+                    }
+                }
+            }
          }
  
          # The short names tend to be two lower case letters, but it looks
@@ -387,6 +583,12 @@ sub output_invmap ($$$$$$$) {
              my $name = $enum_list[$i];
              push @enum_definition, "\t${name_prefix}$name = $i";
          }
+        if (@unused_enums) {
+            foreach my $unused (@unused_enums) {
+                push @enum_definition,
+                            ",\n\t${name_prefix}$unused = $unused_enum_value";
+            }
+        }
  
          # For an 'l' property, we need extra enums, because some of the
          # elements are lists.  Each such distinct list is placed in its own
@@ -433,8 +635,25 @@ sub output_invmap ($$$$$$$) {
  
          $enum_declaration_type = "${name_prefix}enum";
  
-        # Finished with the enum definition.  If it only contains one element,
-        # that is a dummy, default one
+        # Finished with the enum definition.  Inversion map stuff is used only
+        # by regexec or utf-8 (if it is for code points) , unless it is in the
+        # enum exception list
+        my $where = (exists $where_to_define_enums{$name})
+                    ? $where_to_define_enums{$name}
+                    : ($input_format =~ /a/)
+                       ? 'PERL_IN_UTF8_C'
+                       : 'PERL_IN_REGEXEC_C';
+
+        if (! exists $public_enums{$name}) {
+            switch_pound_if($name, $where, $charset);
+        }
+        else {
+            end_charset_pound_if;
+            end_file_pound_if;
+            start_charset_pound_if($charset, 1);
+        }
+
+        # If the enum only contains one element, that is a dummy, default one
          if (scalar @enum_definition > 1) {
  
              # Currently unneeded
@@ -458,7 +677,7 @@ sub output_invmap ($$$$$$$) {
              print $out_fh "} $enum_declaration_type;\n";
          }
  
-        switch_pound_if($name, $where) if $is_public_enum;
+        switch_pound_if($name, $where, $charset);
  
          $invmap_declaration_type = ($input_format =~ /s/)
                                   ? $enum_declaration_type
@@ -489,7 +708,9 @@ sub output_invmap ($$$$$$$) {
              # Output each aux table.
              foreach my $table_number (@sorted_table_list) {
                  my $table = $inverted_mults{$table_number};
-                print $out_fh "\nstatic const $aux_declaration_type $name_prefix$aux_table_prefix$table_number\[] = {\n";
+                output_table_header($out_fh,
+                                       $aux_declaration_type,
+                                       "$name_prefix$aux_table_prefix$table_number");
  
                  # Earlier, we joined the elements of this table together with a comma
                  my @elements = split ",", $table;
@@ -504,30 +725,36 @@ sub output_invmap ($$$$$$$) {
                          print $out_fh "\t${name_prefix}$elements[$i]";
                      }
                  }
-                print $out_fh "\n};\n";
+
+                print $out_fh "\n";
+                output_table_trailer();
              }
  
              # Output the table that is indexed by the absolute value of the
              # aux table enum and contains pointers to the tables output just
              # above
-            print $out_fh "\nstatic const $aux_declaration_type * const ${name_prefix}${aux_table_prefix}ptrs\[] = {\n";
+            output_table_header($out_fh, "$aux_declaration_type *",
+                                   "${name_prefix}${aux_table_prefix}ptrs");
              print $out_fh "\tNULL,\t/* Placeholder */\n";
              for my $i (1 .. @sorted_table_list) {
                  print $out_fh  ",\n" if $i > 1;
                  print $out_fh  "\t$name_prefix$aux_table_prefix$i";
              }
-            print $out_fh "\n};\n";
+            print $out_fh "\n";
+            output_table_trailer();
  
              print $out_fh
                "\n/* Parallel table to the above, giving the number of elements"
              . " in each table\n * pointed to */\n";
-            print $out_fh "static const U8 ${name_prefix}${aux_table_prefix}lengths\[] = {\n";
+            output_table_header($out_fh, "U8",
+                                   "${name_prefix}${aux_table_prefix}lengths");
              print $out_fh "\t0,\t/* Placeholder */\n";
              for my $i (1 .. @sorted_table_list) {
                  print $out_fh  ",\n" if $i > 1;
                  print $out_fh  "\t$aux_counts[$i]\t/* $name_prefix$aux_table_prefix$i */";
              }
-            print $out_fh "\n};\n";
+            print $out_fh "\n";
+            output_table_trailer();
          } # End of outputting the auxiliary and associated tables
  
          # The scx property used in regexec.c needs a specialized table which
@@ -543,7 +770,7 @@ sub output_invmap ($$$$$$$) {
              # that.
              for (my $i = 0; $i < @decimals_invlist; $i += 2) {
                  my $code_point = $decimals_invlist[$i];
-                next if chr($code_point) !~ /\p{Nv=0}/;
+                next if num(chr($code_point)) ne '0';
  
                  # Turn the scripts this zero is in into a list.
                  my @scripts = split ",",
@@ -579,7 +806,7 @@ sub output_invmap ($$$$$$$) {
            . " code point for that\n * script; 0 if the script has multiple"
            . " digit sequences.  Scripts without a\n * digit sequence use"
            . " ASCII [0-9], hence are marked '0' */\n";
-            print $out_fh "static const UV script_zeros[] = {\n";
+            output_table_header($out_fh, "UV", "script_zeros");
              for my $i (0 .. @script_zeros - 1) {
                  my $code_point = $script_zeros[$i];
                  if (defined $code_point) {
@@ -597,7 +824,7 @@ sub output_invmap ($$$$$$$) {
                  print $out_fh "\t/* $enum_list[$i] */";
                  print $out_fh "\n";
              }
-            print $out_fh "};\n";
+            output_table_trailer();
          } # End of special handling of scx
      }
      else {
@@ -609,9 +836,10 @@ sub output_invmap ($$$$$$$) {
                                               && $count;
  
      # Now output the inversion map proper
-    print $out_fh "\nstatic const $invmap_declaration_type ${name}_invmap[] = {";
-    print $out_fh " /* for $charset */" if $charset;
-    print $out_fh "\n";
+    $charset = "for $charset" if $charset;
+    output_table_header($out_fh, $invmap_declaration_type,
+                                    "${name}_invmap",
+                                    $charset);
  
      # The main body are the scalars passed in to this routine.
      for my $i (0 .. $count - 1) {
@@ -630,7 +858,7 @@ sub output_invmap ($$$$$$$) {
          print $out_fh "," if $i < $count - 1;
          print $out_fh  "\n";
      }
-    print $out_fh "};\n";
+    output_table_trailer();
  }
  
  sub mk_invlist_from_sorted_cp_list {
@@ -665,27 +893,6 @@ die "Could not find inversion map for Case_Folding" unless defined $format;
  die "Incorrect format '$format' for Case_Folding inversion map"
                                                      unless $format eq 'al'
                                                             || $format eq 'a';
-my @has_multi_char_fold;
-my @is_non_final_fold;
-
-for my $i (0 .. @$folds_ref - 1) {
-    next unless ref $folds_ref->[$i];   # Skip single-char folds
-    push @has_multi_char_fold, $cp_ref->[$i];
-
-    # Add to the non-finals list each code point that is in a non-final
-    # position
-    for my $j (0 .. @{$folds_ref->[$i]} - 2) {
-        push @is_non_final_fold, $folds_ref->[$i][$j]
-                unless grep { $folds_ref->[$i][$j] == $_ } @is_non_final_fold;
-    }
-}
-
-sub _Perl_Non_Final_Folds {
-    @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
-    my @return = mk_invlist_from_sorted_cp_list(\@is_non_final_fold);
-    return \@return;
-}
-
  sub _Perl_IVCF {
  
      # This creates a map of the inversion of case folding. i.e., given a
@@ -754,7 +961,7 @@ sub _Perl_IVCF {
      #    other.  This situation happens in Unicode 3.0.1, but probably no
      #    other version.
      foreach my $fold (keys %new) {
-        my $folds_to_string = $fold =~ /\D/a;
+        my $folds_to_string = $fold =~ /\D/;
  
          # If the bucket contains only one element, convert from an array to a
          # scalar
@@ -812,8 +1019,15 @@ sub _Perl_IVCF {
  
  
      # Now we have a hash that is the inversion of the case fold property.
-    # Convert it to an inversion map.
+    # First find the maximum number of code points that fold to the same one.
+    foreach my $fold_to (keys %new) {
+        if (ref $new{$fold_to}) {
+            my $folders_count = scalar @{$new{$fold_to}};
+            $max_fold_froms = $folders_count if $folders_count > $max_fold_froms;
+        }
+    }
  
+    # Then convert the hash to an inversion map.
      my @sorted_folds = sort { $a <=> $b } keys %new;
      my (@invlist, @invmap);
  
@@ -861,6 +1075,9 @@ sub _Perl_IVCF {
      push @invlist, $sorted_folds[-1] + 1;
      push @invmap, 0;
  
+    push @invlist, 0x110000;
+    push @invmap, 0;
+
      # All Unicode versions have some places where multiple code points map to
      # the same one, so the format always has an 'l'
      return \@invlist, \@invmap, 'al', $default;
@@ -882,6 +1099,35 @@ sub UpperLatin1 {
      return \@return;
  }
  
+sub _Perl_CCC_non0_non230 {
+
+    # Create an inversion list of code points with non-zero canonical
+    # combining class that also don't have 230 as the class number.  This is
+    # part of a Unicode Standard rule
+
+    my @nonzeros = prop_invlist("ccc=0");
+    shift @nonzeros;    # Invert so is "ccc != 0"
+
+    my @return;
+
+    # Expand into list of code points, while excluding those with ccc == 230
+    for (my $i = 0; $i < @nonzeros; $i += 2) {
+        my $upper = ($i + 1) < @nonzeros
+                    ? $nonzeros[$i+1] - 1      # In range
+                    : $Unicode::UCD::MAX_CP;  # To infinity.
+        for my $j ($nonzeros[$i] .. $upper) {
+            my @ccc_names = prop_value_aliases("ccc", charprop($j, "ccc"));
+
+            # Final element in @ccc_names will be all numeric
+            push @return, $j if $ccc_names[-1] != 230;
+        }
+    }
+
+    @return = sort { $a <=> $b } @return;
+    @return = mk_invlist_from_sorted_cp_list(\@return);
+    return \@return;
+}
+
  sub output_table_common {
  
      # Common subroutine to actually output the generated rules table.
@@ -921,42 +1167,23 @@ sub output_table_common {
  
      my $column_width = 2;   # We currently allow 2 digits for the number
  
-    # If the maximum value in the table is 1, it can be a bool.  (Being above
-    # a U8 is not currently handled
-    my $max_element = 0;
-    for my $i (0 .. $size - 1) {
-        for my $j (0 .. $size - 1) {
-            next if $max_element >= $table_ref->[$i][$j];
-            $max_element = $table_ref->[$i][$j];
-        }
-    }
-    die "Need wider table column width given '$max_element"
-                                    if length $max_element > $column_width;
-
-    my $table_type = ($max_element == 1)
-                     ? 'bool'
-                     : 'U8';
+    # Being above a U8 is not currently handled
+    my $table_type = 'U8';
  
      # If a name is longer than the width set aside for a column, its column
      # needs to have increased spacing so that the name doesn't get truncated
      # nor run into an adjacent column
      my @spacers;
  
-    # If we are being compiled on a Unicode version earlier than that which
-    # this file was designed for, it may be that some of the property values
-    # aren't in the current release, and so would be undefined if we didn't
-    # define them ourselves.  Earlier code has done this, making them
-    # lowercase characters of length one.  We look to see if any exist, so
-    # that we can add an annotation to the output table
-    my $has_placeholder = 0;
+    # Is there a row and column for unused values in this release?
+    my $has_unused = $names_ref->[$size-1] eq $unused_table_hdr;
  
      for my $i (0 .. $size - 1) {
          no warnings 'numeric';
-        $has_placeholder = 1 if $names_ref->[$i] =~ / ^ [[:lower:]] $ /ax;
          $spacers[$i] = " " x (length($names_ref->[$i]) - $column_width);
      }
  
-    print $out_fh "\nstatic const $table_type ${property}_table[$size][$size] = {\n";
+    output_table_header($out_fh, $table_type, "${property}_table", undef, $size, $size);
  
      # Calculate the column heading line
      my $header_line = "/* "
@@ -974,18 +1201,15 @@ sub output_table_common {
      $header_line .= " */\n";
  
      # If we have annotations, output it now.
-    if ($has_placeholder || scalar %$abbreviations_ref) {
+    if ($has_unused || scalar %$abbreviations_ref) {
          my $text = "";
          foreach my $abbr (sort keys %$abbreviations_ref) {
              $text .= "; " if $text;
              $text .= "'$abbr' stands for '$abbreviations_ref->{$abbr}'";
          }
-        if ($has_placeholder) {
-            $text .= "; other " if $text;
-            $text .= "lowercase names are placeholders for"
-                  .  " property values not defined until a later Unicode"
-                  .  " release, so are irrelevant in this one, as they are"
-                  .  " not assigned to any code points";
+        if ($has_unused) {
+            $text .= "; $unused_table_hdr stands for 'unused in this Unicode"
+                   . " release (and the data in the row or column are garbage)"
          }
  
          my $indent = " " x 3;
@@ -1036,7 +1260,7 @@ sub output_table_common {
          print $out_fh "\n";
      }
  
-    print $out_fh "};\n";
+    output_table_trailer();
  }
  
  sub output_GCB_table() {
@@ -1048,6 +1272,7 @@ sub output_GCB_table() {
          GCB_BREAKABLE                    => 1,
          GCB_RI_then_RI                   => 2,   # Rules 12 and 13
          GCB_EX_then_EM                   => 3,   # Rule 10
+        GCB_Maybe_Emoji_NonBreak         => 4,
      );
  
      # The table is constructed in reverse order of the rules, to make the
@@ -1074,8 +1299,16 @@ sub output_GCB_table() {
      $gcb_table[$gcb_enums{'Regional_Indicator'}]
                [$gcb_enums{'Regional_Indicator'}] = $gcb_actions{GCB_RI_then_RI};
  
+    # Post 11.0: GB11  \p{Extended_Pictographic} Extend* ZWJ
+    #                                               × \p{Extended_Pictographic}
+    $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'XPG_XX'}] =
+                                         $gcb_actions{GCB_Maybe_Emoji_NonBreak};
+
+    # This and the rule GB10 obsolete starting with Unicode 11.0, can be left
+    # in as there are no code points that match, so the code won't ever get
+    # executed.
      # Do not break within emoji modifier sequences or emoji zwj sequences.
-    # GB11  ZWJ  × ( Glue_After_Zwj | E_Base_GAZ )
+    # Pre 11.0: GB11  ZWJ  × ( Glue_After_Zwj | E_Base_GAZ )
      $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'Glue_After_Zwj'}] = 0;
      $gcb_table[$gcb_enums{'ZWJ'}][$gcb_enums{'E_Base_GAZ'}] = 0;
  
@@ -1682,16 +1915,11 @@ sub output_LB_table() {
          }
      }
  
-    # LB8a Do not break between a zero width joiner and an ideograph, emoji
-    # base or emoji modifier. This rule prevents breaks within emoji joiner
-    # sequences.
-    # ZWJ × (ID | EB | EM)
-    $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'Ideographic'}]
-                                                    = $lb_actions{'LB_NOBREAK'};
-    $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Base'}]
-                                                    = $lb_actions{'LB_NOBREAK'};
-    $lb_table[$lb_enums{'ZWJ'}][$lb_enums{'E_Modifier'}]
-                                                    = $lb_actions{'LB_NOBREAK'};
+    # LB8a Do not break after a zero width joiner
+    # ZWJ ×
+    for my $i (0 .. @lb_table - 1) {
+        $lb_table[$lb_enums{'ZWJ'}][$i] = $lb_actions{'LB_NOBREAK'};
+    }
  
      # LB8 Break before any character following a zero-width space, even if one
      # or more spaces intervene.
@@ -1805,8 +2033,7 @@ sub output_WB_table() {
      # algorithm stops at the earliest matching rule
  
      my @wb_table;
-    my $table_size = @wb_short_enums - 1;   # -1 because we don't use UNKNOWN
-    die "UNKNOWN must be final WB enum" unless $wb_short_enums[-1] =~ /unk/i;
+    my $table_size = @wb_short_enums;
  
      # Otherwise, break everywhere (including around ideographs).
      # WB99  Any  ÷  Any
@@ -1835,6 +2062,8 @@ sub output_WB_table() {
      # WB13b  ExtendNumLet  ×  (ALetter | Hebrew_Letter | Numeric | Katakana)
      $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'ALetter'}]
                                                  = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'XPG_LE'}]
+                                                = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Hebrew_Letter'}]
                                                  = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'ExtendNumLet'}][$wb_enums{'Numeric'}]
@@ -1843,9 +2072,11 @@ sub output_WB_table() {
                                                  = $wb_actions{'WB_NOBREAK'};
  
      # WB13a  (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet)
-    #        × # ExtendNumLet
+    #        × ExtendNumLet
      $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ExtendNumLet'}]
                                                  = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'ExtendNumLet'}]
+                                                = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ExtendNumLet'}]
                                                  = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ExtendNumLet'}]
@@ -1882,12 +2113,16 @@ sub output_WB_table() {
      # WB10  Numeric  ×  (ALetter | Hebrew_Letter)
      $wb_table[$wb_enums{'Numeric'}][$wb_enums{'ALetter'}]
                                                  = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'Numeric'}][$wb_enums{'XPG_LE'}]
+                                                = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'Numeric'}][$wb_enums{'Hebrew_Letter'}]
                                                  = $wb_actions{'WB_NOBREAK'};
  
      # WB9  (ALetter | Hebrew_Letter)  ×  Numeric
      $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Numeric'}]
                                                  = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Numeric'}]
+                                                = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Numeric'}]
                                                  = $wb_actions{'WB_NOBREAK'};
  
@@ -1912,14 +2147,20 @@ sub output_WB_table() {
      #       × (ALetter | Hebrew_Letter)
      $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'ALetter'}]
                              += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+    $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'XPG_LE'}]
+                            += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
      $wb_table[$wb_enums{'MidNumLet'}][$wb_enums{'Hebrew_Letter'}]
                              += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
      $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'ALetter'}]
                              += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+    $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'XPG_LE'}]
+                            += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
      $wb_table[$wb_enums{'MidLetter'}][$wb_enums{'Hebrew_Letter'}]
                              += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
      $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'ALetter'}]
                              += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
+    $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'XPG_LE'}]
+                            += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
      $wb_table[$wb_enums{'Single_Quote'}][$wb_enums{'Hebrew_Letter'}]
                              += $wb_actions{'WB_MB_or_ML_or_SQ_then_LE_or_HL'};
  
@@ -1927,14 +2168,20 @@ sub output_WB_table() {
      #       | Single_Quote) (ALetter | Hebrew_Letter)
      $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidNumLet'}]
                              += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+    $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'MidNumLet'}]
+                            += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
      $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidNumLet'}]
                              += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
      $wb_table[$wb_enums{'ALetter'}][$wb_enums{'MidLetter'}]
                              += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+    $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'MidLetter'}]
+                            += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
      $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'MidLetter'}]
                              += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
      $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Single_Quote'}]
                              += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
+    $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Single_Quote'}]
+                            += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
      $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Single_Quote'}]
                              += $wb_actions{'WB_LE_or_HL_then_MB_or_ML_or_SQ'};
  
@@ -1942,12 +2189,20 @@ sub output_WB_table() {
      # WB5  (ALetter | Hebrew_Letter)  ×  (ALetter | Hebrew_Letter)
      $wb_table[$wb_enums{'ALetter'}][$wb_enums{'ALetter'}]
                                                      = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'ALetter'}]
+                                                    = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'ALetter'}][$wb_enums{'Hebrew_Letter'}]
                                                      = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'Hebrew_Letter'}]
+                                                    = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'ALetter'}]
                                                      = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'XPG_LE'}]
+                                                    = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'Hebrew_Letter'}][$wb_enums{'Hebrew_Letter'}]
                                                      = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'XPG_LE'}][$wb_enums{'XPG_LE'}]
+                                                    = $wb_actions{'WB_NOBREAK'};
  
      # Ignore Format and Extend characters, except after sot, CR, LF, and
      # Newline.  This also has the effect of: Any × (Format | Extend | ZWJ)
@@ -1975,14 +2230,24 @@ sub output_WB_table() {
          $wb_table[$i][$wb_enums{'Format'}] = $wb_actions{'WB_NOBREAK'};
      }
  
+    # Keep horizontal whitespace together
+    # Use perl's tailoring instead
+    # WB3d WSegSpace × WSegSpace
+    #$wb_table[$wb_enums{'WSegSpace'}][$wb_enums{'WSegSpace'}]
+    #                                               = $wb_actions{'WB_NOBREAK'};
+
      # Do not break within emoji zwj sequences.
      # WB3c ZWJ × ( Glue_After_Zwj | EBG )
      $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'Glue_After_Zwj'}]
                                                  = $wb_actions{'WB_NOBREAK'};
      $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'E_Base_GAZ'}]
                                                  = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'XPG_XX'}]
+                                                = $wb_actions{'WB_NOBREAK'};
+    $wb_table[$wb_enums{'ZWJ'}][$wb_enums{'XPG_LE'}]
+                                                = $wb_actions{'WB_NOBREAK'};
  
-    # Break before and after white space
+    # Break before and after newlines
      # WB3b     ÷  (Newline | CR | LF)
      # WB3a  (Newline | CR | LF)  ÷
      # et. al.
@@ -2026,6 +2291,22 @@ sub output_WB_table() {
                          \@wb_table, \@wb_short_enums, \%wb_abbreviations);
  }
  
+sub sanitize_name ($) {
+    # Change the non-word characters in the input string to standardized word
+    # equivalents
+    #
+    my $sanitized = shift;
+    $sanitized =~ s/=/__/;
+    $sanitized =~ s/&/_AMP_/;
+    $sanitized =~ s/\./_DOT_/;
+    $sanitized =~ s/-/_MINUS_/;
+    $sanitized =~ s!/!_SLASH_!;
+
+    return $sanitized;
+}
+
+switch_pound_if ('ALL', 'PERL_IN_REGCOMP_C');
+
  output_invlist("Latin1", [ 0, 256 ]);
  output_invlist("AboveLatin1", [ 256 ]);
  
@@ -2054,94 +2335,233 @@ end_file_pound_if;
  # An initial & means to use the subroutine from this file instead of an
  # official inversion list.
  
-for my $charset (get_supported_code_pages()) {
-    print $out_fh "\n" . get_conditional_compile_line_start($charset);
-
-    @a2n = @{get_a2n($charset)};
-    # Below is the list of property names to generate.  '&' means to use the
-    # subroutine to generate the inversion list instead of the generic code
-    # below.  Some properties have a comma-separated list after the name,
-    # These are extra enums to add to those found in the Unicode tables.
-    no warnings 'qw';
-                         # Ignore non-alpha in sort
-    for my $prop (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
-                             Assigned
-                             ASCII
-                             Cased
-                             VertSpace
-                             XPerlSpace
-                             XPosixAlnum
-                             XPosixAlpha
-                             XPosixBlank
-                             XPosixCntrl
-                             XPosixDigit
-                             XPosixGraph
-                             XPosixLower
-                             XPosixPrint
-                             XPosixPunct
-                             XPosixSpace
-                             XPosixUpper
-                             XPosixWord
-                             XPosixXDigit
-                             _Perl_Any_Folds
-                             &NonL1_Perl_Non_Final_Folds
-                             _Perl_Folds_To_Multi_Char
-                             &UpperLatin1
-                             _Perl_IDStart
-                             _Perl_IDCont
-                             _Perl_Charname_Begin
-                             _Perl_Charname_Continue
-                             _Perl_GCB,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,EDGE
-                             _Perl_LB,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner,EDGE,
-                             _Perl_SB,SContinue,CR,Extend,LF,EDGE
-                             _Perl_WB,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,EDGE,UNKNOWN
-                             _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
-                             Lowercase_Mapping
-                             Titlecase_Mapping
-                             Uppercase_Mapping
-                             Simple_Case_Folding
-                             Case_Folding
-                            &_Perl_IVCF
-                           )
-                           # NOTE that the convention is that extra enum
-                           # values come after the property name, separated by
-                           # commas, with the enums that aren't ever defined
-                           # by Unicode coming last, at least 4 all-uppercase
-                           # characters.  The others are enum names that are
-                           # needed by perl, but aren't in all Unicode
-                           # releases.
-    ) {
-
-        # For the Latin1 properties, we change to use the eXtended version of the
-        # base property, then go through the result and get rid of everything not
-        # in Latin1 (above 255).  Actually, we retain the element for the range
-        # that crosses the 255/256 boundary if it is one that matches the
-        # property.  For example, in the Word property, there is a range of code
-        # points that start at U+00F8 and goes through U+02C1.  Instead of
-        # artificially cutting that off at 256 because 256 is the first code point
-        # above Latin1, we let the range go to its natural ending.  That gives us
-        # extra information with no added space taken.  But if the range that
-        # crosses the boundary is one that doesn't match the property, we don't
-        # start a new range above 255, as that could be construed as going to
-        # infinity.  For example, the Upper property doesn't include the character
-        # at 255, but does include the one at 256.  We don't include the 256 one.
-        my $prop_name = $prop;
-        my $is_local_sub = $prop_name =~ s/^&//;
-        my $extra_enums = "";
-        $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
-        my $lookup_prop = $prop_name;
-        my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
-                       or $lookup_prop =~ s/^L1//);
-        my $nonl1_only = 0;
-        $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
-        ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
+# Below is the list of property names to generate.  '&' means to use the
+# subroutine to generate the inversion list instead of the generic code
+# below.  Some properties have a comma-separated list after the name,
+# These are extra enums to add to those found in the Unicode tables.
+no warnings 'qw';
+                        # Ignore non-alpha in sort
+my @props;
+push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
+                    &UpperLatin1
+                    _Perl_GCB,EDGE,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,XPG_XX
+                    _Perl_LB,EDGE,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner
+                    _Perl_SB,EDGE,SContinue,CR,Extend,LF
+                    _Perl_WB,Perl_Tailored_HSpace,EDGE,UNKNOWN,CR,Double_Quote,E_Base,E_Base_GAZ,E_Modifier,Extend,Glue_After_Zwj,Hebrew_Letter,LF,MidNumLet,Newline,Regional_Indicator,Single_Quote,ZWJ,XPG_XX,XPG_LE
+                    _Perl_SCX,Latin,Inherited,Unknown,Kore,Jpan,Hanb,INVALID
+                    Lowercase_Mapping
+                    Titlecase_Mapping
+                    Uppercase_Mapping
+                    Simple_Case_Folding
+                    Case_Folding
+                    &_Perl_IVCF
+                    &_Perl_CCC_non0_non230
+                );
+                # NOTE that the convention is that extra enum values come
+                # after the property name, separated by commas, with the enums
+                # that aren't ever defined by Unicode coming last, at least 4
+                # all-uppercase characters.  The others are enum names that
+                # are needed by perl, but aren't in all Unicode releases.
+
+my @bin_props;
+my @perl_prop_synonyms;
+my %enums;
+my @deprecated_messages = "";   # Element [0] is a placeholder
+my %deprecated_tags;
+
+my $float_e_format = qr/ ^ -? \d \. \d+ e [-+] \d+ $ /x;
+
+# Create another hash that maps floating point x.yyEzz representation to what
+# %stricter_to_file_of does for the equivalent rational.  A typical entry in
+# the latter hash is
+#
+#    'nv=1/2' => 'Nv/1_2',
+#
+# From that, this loop creates an entry
+#
+#    'nv=5.00e-01' => 'Nv/1_2',
+#
+# %stricter_to_file_of contains far more than just the rationals.  Instead we
+# use %Unicode::UCD::nv_floating_to_rational which should have an entry for each
+# nv in the former hash.
+my %floating_to_file_of;
+foreach my $key (keys %Unicode::UCD::nv_floating_to_rational) {
+    my $value = $Unicode::UCD::nv_floating_to_rational{$key};
+    $floating_to_file_of{$key} = $Unicode::UCD::stricter_to_file_of{"nv=$value"};
+}
+
+# Properties that are specified with a prop=value syntax
+my @equals_properties;
+
+# Collect all the binary properties from data in lib/unicore
+# Sort so that complements come after the main table, and the shortest
+# names first, finally alphabetically.  Also, sort together the tables we want
+# to be kept together, and prefer those with 'posix' in their names, which is
+# what the C code is expecting their names to be.
+foreach my $property (sort
+        {   exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
+         or $b =~ /posix/i <=> $a =~ /posix/i
+         or $b =~ /perl/i <=> $a =~ /perl/i
+         or $a =~ $float_e_format <=> $b =~ $float_e_format
+         or $a =~ /!/ <=> $b =~ /!/
+         or length $a <=> length $b
+         or $a cmp $b
+        }   keys %Unicode::UCD::loose_to_file_of,
+            keys %Unicode::UCD::stricter_to_file_of,
+            keys %floating_to_file_of
+) {
+
+    # These two hashes map properties to values that can be considered to
+    # be checksums.  If two properties have the same checksum, they have
+    # identical entries.  Otherwise they differ in some way.
+    my $tag = $Unicode::UCD::loose_to_file_of{$property};
+    $tag = $Unicode::UCD::stricter_to_file_of{$property} unless defined $tag;
+    $tag = $floating_to_file_of{$property} unless defined $tag;
+
+    # The tag may contain an '!' meaning it is identical to the one formed
+    # by removing the !, except that it is inverted.
+    my $inverted = $tag =~ s/!//;
+
+    # This hash is lacking the property name
+    $property = "nv=$property" if $property =~ $float_e_format;
+
+    # The list of 'prop=value' entries that this single entry expands to
+    my @this_entries;
+
+    # Split 'property=value' on the equals sign, with $lhs being the whole
+    # thing if there is no '='
+    my ($lhs, $rhs) = $property =~ / ( [^=]* ) ( =? .*) /x;
+
+    # $lhs then becomes the property name.
+    my $prop_value = $rhs =~ s/ ^ = //rx;
+
+    push @equals_properties, $lhs if $prop_value ne "";
+
+    # See if there are any synonyms for this property.
+    if (exists $prop_name_aliases{$lhs}) {
+
+        # If so, do the combinatorics so that a new entry is added for
+        # each legal property combined with the property value (which is
+        # $rhs)
+        foreach my $alias (@{$prop_name_aliases{$lhs}}) {
+
+            # But, there are some ambiguities, like 'script' is a synonym
+            # for 'sc', and 'sc' can stand alone, meaning something
+            # entirely different than 'script'.  'script' cannot stand
+            # alone.  Don't add if the potential new lhs is in the hash of
+            # stand-alone properties.
+            no warnings 'once';
+            next if $rhs eq "" &&  grep { $alias eq $_ }
+                                    keys %Unicode::UCD::loose_property_to_file_of;
+
+            my $new_entry = $alias . $rhs;
+            push @this_entries, $new_entry;
+        }
+    }
+
+    # Above, we added the synonyms for the base entry we're now
+    # processing.  But we haven't dealt with it yet.  If we already have a
+    # property with the identical characteristics, this becomes just a
+    # synonym for it.
+
+    if (exists $enums{$tag}) {
+        push @this_entries, $property;
+    }
+    else { # Otherwise, create a new entry.
+
+        # Add to the list of properties to generate inversion lists for.
+        push @bin_props, uc $property;
+
+        # Create a rule for the parser
+        if (! exists $keywords{$property}) {
+            $keywords{$property} = token_name($property);
+        }
+
+        # And create an enum for it.
+        $enums{$tag} = $table_name_prefix . uc sanitize_name($property);
+
+        $perl_tags{$tag} = 1 if exists $keep_together{lc $property};
+
+        # Some properties are deprecated.  This hash tells us so, and the
+        # warning message to raise if they are used.
+        if (exists $Unicode::UCD::why_deprecated{$tag}) {
+            $deprecated_tags{$enums{$tag}} = scalar @deprecated_messages;
+            push @deprecated_messages, $Unicode::UCD::why_deprecated{$tag};
+        }
+
+        # Our sort above should have made sure that we see the
+        # non-inverted version first, but this makes sure.
+        warn "$property is inverted!!!" if $inverted;
+    }
+
+    # Everything else is #defined to be the base enum, inversion is
+    # indicated by negating the value.
+    my $defined_to = "";
+    $defined_to .= "-" if $inverted;
+    $defined_to .= $enums{$tag};
+
+    # Go through the entries that evaluate to this.
+    @this_entries = uniques @this_entries;
+    foreach my $define (@this_entries) {
+
+        # There is a rule for the parser for each.
+        $keywords{$define} = $defined_to;
+
+        # And a #define for all simple names equivalent to a perl property,
+        # except those that begin with 'is' or 'in';
+        if (exists $perl_tags{$tag} && $property !~ / ^ i[ns] | = /x) {
+            push @perl_prop_synonyms, "#define "
+                                    . $table_name_prefix
+                                    . uc(sanitize_name($define))
+                                    . "   $defined_to";
+        }
+    }
+}
+
+@bin_props = sort {  exists $keep_together{lc $b} <=> exists $keep_together{lc $a}
+                   or $a cmp $b
+                  } @bin_props;
+@perl_prop_synonyms = sort(uniques(@perl_prop_synonyms));
+push @props, @bin_props;
+
+foreach my $prop (@props) {
+
+    # For the Latin1 properties, we change to use the eXtended version of the
+    # base property, then go through the result and get rid of everything not
+    # in Latin1 (above 255).  Actually, we retain the element for the range
+    # that crosses the 255/256 boundary if it is one that matches the
+    # property.  For example, in the Word property, there is a range of code
+    # points that start at U+00F8 and goes through U+02C1.  Instead of
+    # artificially cutting that off at 256 because 256 is the first code point
+    # above Latin1, we let the range go to its natural ending.  That gives us
+    # extra information with no added space taken.  But if the range that
+    # crosses the boundary is one that doesn't match the property, we don't
+    # start a new range above 255, as that could be construed as going to
+    # infinity.  For example, the Upper property doesn't include the character
+    # at 255, but does include the one at 256.  We don't include the 256 one.
+    my $prop_name = $prop;
+    my $is_local_sub = $prop_name =~ s/^&//;
+    my $extra_enums = "";
+    $extra_enums = $1 if $prop_name =~ s/, ( .* ) //x;
+    my $lookup_prop = $prop_name;
+    $prop_name = sanitize_name($prop_name);
+    $prop_name = $table_name_prefix . $prop_name if grep { lc $lookup_prop eq lc $_ } @bin_props;
+    my $l1_only = ($lookup_prop =~ s/^L1Posix/XPosix/
+                    or $lookup_prop =~ s/^L1//);
+    my $nonl1_only = 0;
+    $nonl1_only = $lookup_prop =~ s/^NonL1// unless $l1_only;
+    ($lookup_prop, my $has_suffixes) = $lookup_prop =~ / (.*) ( , .* )? /x;
+
+    for my $charset (get_supported_code_pages()) {
+        @a2n = @{get_a2n($charset)};
  
          my @invlist;
          my @invmap;
-        my $map_format;
+        my $map_format = 0;;
          my $map_default;
-        my $maps_to_code_point;
-        my $to_adjust;
+        my $maps_to_code_point = 0;
+        my $to_adjust = 0;
+        my $same_in_all_code_pages;
          if ($is_local_sub) {
              my @return = eval $lookup_prop;
              die $@ if $@;
@@ -2172,7 +2592,7 @@ for my $charset (get_supported_code_pages()) {
                      if (defined $count) {
                          # Short-circuit an empty inversion list.
                          output_invlist($prop_name, \@invlist, $charset);
-                        next;
+                        last;
                      }
                      die "Could not find inversion list for '$lookup_prop'"
                  }
@@ -2181,12 +2601,15 @@ for my $charset (get_supported_code_pages()) {
                      @invmap = @$map_ref;
                      $map_format = $format;
                      $map_default = $default;
-                    $maps_to_code_point = $map_format =~ / a ($ | [^r] ) /x;
-                    $to_adjust = $map_format =~ /a/;
                  }
              }
          }
  
+        if ($map_format) {
+            $maps_to_code_point = $map_format =~ / a ($ | [^r] ) /x;
+            $to_adjust = $map_format =~ /a/;
+        }
+
          # Re-order the Unicode code points to native ones for this platform.
          # This is only needed for code points below 256, because native code
          # points are only in that range.  For inversion maps of properties
@@ -2210,10 +2633,12 @@ for my $charset (get_supported_code_pages()) {
          # 255 because a re-ordering could cause 256 to need to be in the same
          # range as 255.)
          if (       (@invmap && $maps_to_code_point)
-            || (   ($invlist[0] < 256
+            || (    @invlist
+                &&  $invlist[0] < 256
                  && (    $invlist[0] != 0
-                    || (scalar @invlist != 1 && $invlist[1] < 256)))))
+                    || (scalar @invlist != 1 && $invlist[1] < 256))))
          {
+            $same_in_all_code_pages = 0;
              if (! @invmap) {    # Straight inversion list
                  # Look at all the ranges that start before 257.
                  my @latin1_list;
@@ -2286,13 +2711,15 @@ for my $charset (get_supported_code_pages()) {
                      if (ref $invmap[0]) {
                          $bucket = join "\cK", map { a2n($_) }  @{$invmap[0]};
                      }
-                    elsif ($maps_to_code_point && $invmap[0] =~ $numeric_re) {
+                    elsif (   $maps_to_code_point
+                           && $invmap[0] =~ $integer_or_float_re)
+                    {
  
                          # Do convert to native for maps to single code points.
                          # There are some properties that have a few outlier
                          # maps that aren't code points, so the above test
-                        # skips those.
-                        $bucket = a2n($invmap[0]);
+                        # skips those.  0 is never remapped.
+                        $bucket = $invmap[0] == 0 ? 0 : a2n($invmap[0]);
                      } else {
                          $bucket = $invmap[0];
                      }
@@ -2309,7 +2736,7 @@ for my $charset (get_supported_code_pages()) {
  
                                 # Skip any non-numeric maps: these are outliers
                                 # that aren't code points.
-                            && $base_map =~ $numeric_re
+                            && $base_map =~ $integer_or_float_re
  
                                 #  'ne' because the default can be a string
                              && $base_map ne $map_default)
@@ -2397,9 +2824,12 @@ for my $charset (get_supported_code_pages()) {
                      for my $i (0 .. @new_invlist - 1) {
                          next if $i > 0
                                  && $new_invlist[$i-1] + 1 == $new_invlist[$i]
-                                && $xlated{$new_invlist[$i-1]} =~ $numeric_re
-                                && $xlated{$new_invlist[$i]} =~ $numeric_re
-                                && $xlated{$new_invlist[$i-1]} + 1 == $xlated{$new_invlist[$i]};
+                                && $xlated{$new_invlist[$i-1]}
+                                                        =~ $integer_or_float_re
+                                && $xlated{$new_invlist[$i]}
+                                                        =~ $integer_or_float_re
+                                && $xlated{$new_invlist[$i-1]} + 1
+                                                 == $xlated{$new_invlist[$i]};
                          push @temp, $new_invlist[$i];
                      }
                      @new_invlist = @temp;
@@ -2417,6 +2847,13 @@ for my $charset (get_supported_code_pages()) {
                  unshift @invlist, @new_invlist;
              }
          }
+        elsif (@invmap) {   # inversion maps can't cope with this variable
+                            # being true, even if it could be true
+            $same_in_all_code_pages = 0;
+        }
+        else {
+            $same_in_all_code_pages = 1;
+        }
  
          # prop_invmap() returns an extra final entry, which we can now
          # discard.
@@ -2471,16 +2908,214 @@ for my $charset (get_supported_code_pages()) {
                  $found_nonl1 = 1;
                  last;
              }
-            die "No non-Latin1 code points in $lookup_prop" unless $found_nonl1;
+            if (! $found_nonl1) {
+                warn "No non-Latin1 code points in $prop_name";
+                output_invlist($prop_name, []);
+                last;
+            }
          }
  
-        output_invlist($prop_name, \@invlist, $charset);
-        output_invmap($prop_name, \@invmap, $lookup_prop, $map_format, $map_default, $extra_enums, $charset) if @invmap;
+        switch_pound_if ($prop_name, 'PERL_IN_REGCOMP_C');
+        start_charset_pound_if($charset, 1) unless $same_in_all_code_pages;
+
+        output_invlist($prop_name, \@invlist, ($same_in_all_code_pages)
+                                              ? $applies_to_all_charsets_text
+                                              : $charset);
+
+        if (@invmap) {
+            output_invmap($prop_name, \@invmap, $lookup_prop, $map_format,
+                          $map_default, $extra_enums, $charset);
+        }
+
+        last if $same_in_all_code_pages;
+        end_charset_pound_if;
      }
-    end_file_pound_if;
-    print $out_fh "\n" . get_conditional_compile_line_end();
  }
  
+switch_pound_if ('binary_property_tables', 'PERL_IN_REGCOMP_C');
+
+print $out_fh "\nconst char * const deprecated_property_msgs[] = {\n\t";
+print $out_fh join ",\n\t", map { "\"$_\"" } @deprecated_messages;
+print $out_fh "\n};\n";
+
+my @enums = sort values %enums;
+
+# Save a copy of these before modification
+my @invlist_names = map { "${_}_invlist" } @enums;
+
+# Post-process the enums for deprecated properties.
+if (scalar keys %deprecated_tags) {
+    my $seen_deprecated = 0;
+    foreach my $enum (@enums) {
+        if (grep { $_ eq $enum } keys %deprecated_tags) {
+
+            # Change the enum name for this deprecated property to a
+            # munged one to act as a placeholder in the typedef.  Then
+            # make the real name be a #define whose value is such that
+            # its modulus with the number of enums yields the index into
+            # the table occupied by the placeholder.  And so that dividing
+            # the #define value by the table length gives an index into
+            # the table of deprecation messages for the corresponding
+            # warning.
+            my $revised_enum = "${enum}_perl_aux";
+            if (! $seen_deprecated) {
+                $seen_deprecated = 1;
+                print $out_fh "\n";
+            }
+            print $out_fh "#define $enum ($revised_enum + (MAX_UNI_KEYWORD_INDEX * $deprecated_tags{$enum}))\n";
+            $enum = $revised_enum;
+        }
+    }
+}
+
+print $out_fh "\ntypedef enum {\n\tPERL_BIN_PLACEHOLDER = 0,  /* So no real value is zero */\n\t";
+print $out_fh join ",\n\t", @enums;
+print $out_fh "\n";
+print $out_fh "} binary_invlist_enum;\n";
+print $out_fh "\n#define MAX_UNI_KEYWORD_INDEX $enums[-1]\n";
+
+output_table_header($out_fh, "UV *", "uni_prop_ptrs");
+print $out_fh "\tNULL,\t/* Placeholder */\n";
+print $out_fh "\t";
+print $out_fh join ",\n\t", @invlist_names;
+print $out_fh "\n";
+
+output_table_trailer();
+
+print $out_fh join "\n", "\n",
+                         #'#    ifdef DOINIT',
+                         #"\n",
+                         "/* Synonyms for perl properties */",
+                         @perl_prop_synonyms,
+                         #"\n",
+                         #"#    endif  /* DOINIT */",
+                         "\n";
+
+switch_pound_if ('Valid property_values', 'PERL_IN_REGCOMP_C');
+
+# Each entry is a pointer to a table of property values for some property.
+# (Other properties may share this table.  The next two data structures allow
+# this sharing to be implemented.)
+my @values_tables = "NULL /* Placeholder so zero index is an error */";
+
+# Keys are all the values of a property, strung together.  The value of each
+# key is its index in @values_tables.  This is because many properties have
+# the same values, and this allows the data to appear just once.
+my %joined_values;
+
+# #defines for indices into @values_tables, so can have synonyms resolved by
+# the C compiler.
+my @values_indices;
+
+# Go through each property which is specifiable by \p{prop=value}, and create
+# a hash with the keys being the canonicalized short property names, and the
+# values for each property being all possible values that it can take on.
+# Both the full value and its short, canonicalized into lc, sans punctuation
+# version are included.
+my %all_values;
+for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) }
+                 uniques @equals_properties)
+{
+    # Get and canonicalize the short name for this property.
+    my ($short_name) = prop_aliases($property);
+    $short_name = lc $short_name;
+    $short_name =~ s/[ _-]//g;
+
+    # Now look at each value this property can take on
+    foreach my $value (prop_values($short_name)) {
+
+        # And for each value, look at each synonym for it
+        foreach my $alias (prop_value_aliases($short_name, $value)) {
+
+            # Add each synonym
+            push @{$all_values{$short_name}}, $alias;
+
+            # As well as its canonicalized name.  khw made the decision to not
+            # support the grandfathered L_ Gc property value
+            $alias = lc $alias;
+            $alias =~ s/[ _-]//g unless $alias =~ $numeric_re;
+            push @{$all_values{$short_name}}, $alias;
+        }
+    }
+}
+
+# Also include the old style block names, using the recipe given in
+# Unicode::UCD
+foreach my $block (prop_values('block')) {
+    push @{$all_values{'blk'}}, charblock((prop_invlist("block=$block"))[0]);
+}
+
+# Now create output tables for each property in @equals_properties (the keys
+# in %all_values) each containing that property's possible values as computed
+# just above.
+PROPERTY:
+for my $property (sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b)
+                         or $a cmp $b } keys %all_values)
+{
+    @{$all_values{$property}} = uniques(@{$all_values{$property}});
+
+    # String together the values for this property, sorted.  This string forms
+    # a list definition, with each value as an entry in it, indented on a new
+    # line.  The sorting is used to find properties that take on the exact
+    # same values to share this string.
+    my $joined = "\t\"";
+    $joined .= join "\",\n\t\"",
+                sort { ($a =~ $numeric_re && $b =~ $numeric_re)
+                        ? eval $a <=> eval $b
+                        :    prop_name_for_cmp($a) cmp prop_name_for_cmp($b)
+                          or $a cmp $b
+                        } @{$all_values{$property}};
+    # And add a trailing marker
+    $joined .= "\",\n\tNULL\n";
+
+    my $table_name = $table_name_prefix . $property . "_values";
+    my $index_name = "${table_name}_index";
+
+    # Add a rule for the parser that is just an empty value.  It will need to
+    # know to look up empty things in the prop_value_ptrs[] table.
+
+    $keywords{"$property="} = $index_name;
+    if (exists $prop_name_aliases{$property}) {
+        foreach my $alias (@{$prop_name_aliases{$property}}) {
+            $keywords{"$alias="} = $index_name;
+        }
+    }
+
+    # Also create rules for the synonyms of this property to point to the same
+    # thing
+
+    # If this property's values are the same as one we've already computed,
+    # use that instead of creating a duplicate.  But we add a #define to point
+    # to the proper one.
+    if (exists $joined_values{$joined}) {
+        push @values_indices, "#define $index_name  $joined_values{$joined}\n";
+        next PROPERTY;
+    }
+
+    # And this property, now known to have unique values from any other seen
+    # so far is about to be pushed onto @values_tables.  Its index is the
+    # current count.
+    push @values_indices, "#define $index_name  "
+                         . scalar @values_tables . "\n";
+    $joined_values{$joined} = $index_name;
+    push @values_tables, $table_name;
+
+    # Create the table for this set of values.
+    output_table_header($out_fh, "char *", $table_name);
+    print $out_fh $joined;
+    output_table_trailer();
+} # End of loop through the properties, and their values
+
+# We have completely determined the table of the unique property values
+output_table_header($out_fh, "char * const *",
+                             "${table_name_prefix}prop_value_ptrs");
+print $out_fh join ",\n", @values_tables;
+print $out_fh "\n";
+output_table_trailer();
+
+# And the #defines for the indices in it
+print $out_fh "\n\n", join "", @values_indices;
+
  switch_pound_if('Boundary_pair_tables', 'PERL_IN_REGEXEC_C');
  
  output_GCB_table();
@@ -2489,11 +3124,22 @@ output_WB_table();
  
  end_file_pound_if;
  
+print $out_fh <<"EOF";
+
+/* More than one code point may have the same code point as their fold.  This
+ * gives the maximum number in the current Unicode release.  (The folded-to
+ * code point is not included in this count.)  For example, both 'S' and
+ * \\x{17F} fold to 's', so the number for that fold is 2.  Another way to
+ * look at it is the maximum length of all the IVCF_AUX_TABLE's */
+#define MAX_FOLD_FROMS $max_fold_froms
+EOF
+
  my $sources_list = "lib/unicore/mktables.lst";
  my @sources = qw(regen/mk_invlists.pl
                   lib/unicore/mktables
                   lib/Unicode/UCD.pm
                   regen/charset_translations.pl
+                 regen/mk_PL_charclass.pl
                 );
  {
      # Depend on mktables’ own sources.  It’s a shorter list of files than
@@ -2513,3 +3159,64 @@ my @sources = qw(regen/mk_invlists.pl
  }
  
  read_only_bottom_close_and_rename($out_fh, \@sources);
+
+my %name_to_index;
+for my $i (0 .. @enums - 1) {
+    my $loose_name = $enums[$i] =~ s/^$table_name_prefix//r;
+    $loose_name = lc $loose_name;
+    $loose_name =~ s/__/=/;
+    $loose_name =~ s/_dot_/./;
+    $loose_name =~ s/_slash_/\//g;
+    $name_to_index{$loose_name} = $i + 1;
+}
+# unsanitize, exclude &, maybe add these before sanitize
+for my $i (0 .. @perl_prop_synonyms - 1) {
+    my $loose_name_pair = $perl_prop_synonyms[$i] =~ s/#\s*define\s*//r;
+    $loose_name_pair =~ s/\b$table_name_prefix//g;
+    $loose_name_pair = lc $loose_name_pair;
+    $loose_name_pair =~ s/__/=/g;
+    $loose_name_pair =~ s/_dot_/./g;
+    $loose_name_pair =~ s/_slash_/\//g;
+    my ($synonym, $primary) = split / +/, $loose_name_pair;
+    $name_to_index{$synonym} = $name_to_index{$primary};
+}
+
+my $uni_pl = open_new('lib/unicore/uni_keywords.pl', '>',
+                     {style => '*', by => 'regen/mk_invlists.pl',
+                      from => "Unicode::UCD"});
+{
+    print $uni_pl "\%Unicode::UCD::uni_prop_ptrs_indices = (\n";
+    for my $name (sort keys %name_to_index) {
+        print $uni_pl "    '$name' => $name_to_index{$name},\n";
+    }
+    print $uni_pl ");\n\n1;\n";
+}
+
+read_only_bottom_close_and_rename($uni_pl, \@sources);
+
+require './regen/mph.pl';
+
+sub token_name
+{
+    my $name = sanitize_name(shift);
+    warn "$name contains non-word" if $name =~ /\W/;
+
+    return "$table_name_prefix\U$name"
+}
+
+my $keywords_fh = open_new('uni_keywords.h', '>',
+                 {style => '*', by => 'regen/mk_invlists.pl',
+                  from => "mph.pl"});
+
+no warnings 'once';
+print $keywords_fh <<"EOF";
+/* The precision to use in "%.*e" formats */
+#define PL_E_FORMAT_PRECISION $Unicode::UCD::e_precision
+
+EOF
+
+my ($second_level, $seed1, $length_all_keys, $smart_blob, $rows) = MinimalPerfectHash::make_mph_from_hash(\%keywords);
+print $keywords_fh MinimalPerfectHash::make_algo($second_level, $seed1, $length_all_keys, $smart_blob, $rows, undef, undef, undef, 'match_uniprop' );
+
+push @sources, 'regen/mph.pl';
+read_only_bottom_close_and_rename($keywords_fh, \@sources);