Unicode::UCD: Add charprops_all() public function

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index f3d5c83..4a16d83 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -528,6 +528,9 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
  # Without this change, there are casing problems for this character.
  #
+# Search for $string_compare_versions to see how to compare changes to
+# properties between Unicode versions
+#
  ##############################################################################
  
  my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
@@ -953,7 +956,7 @@ my %why_obsolete;    # Documentation only
          # contains the same information, but without the algorithmically
          # determinable Hangul syllables'.  This file is not published, so it's
          # existence is not noted in the comment.
-        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
  
          'Indic_Matra_Category' => "Provisional",
          'Indic_Syllabic_Category' => "Provisional",
@@ -962,14 +965,14 @@ my %why_obsolete;    # Documentation only
          # to differentiate between it and gc=c, which can be written as 'isc',
          # which is the same characters as ISO_Comment's short name.
  
-        'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
+        'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
  
-        'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
-        'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
-        'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
-        'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+        'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
+        'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+        'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+        'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
  
-        FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
+        FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
      );
  
      foreach my $property (
@@ -1200,6 +1203,18 @@ my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
  my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
  my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
  
+# We work with above-Unicode code points, up to UV_MAX.   But when you get
+# that high, above IV_MAX, some operations don't work, and you can easily get
+# overflow.  Therefore for internal use, we use a much smaller number,
+# translating it to UV_MAX only for output.  The exact number is immaterial
+# (all Unicode code points are treated exactly the same), but the algorithm
+# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
+my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
+my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
+my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
+
+my $MAX_PLATFORM_CODEPOINT = ~0;
+
  # Matches legal code point.  4-6 hex numbers, If there are 6, the first
  # two must be 10; if there are 5, the first must not be a 0.  Written this way
  # to decrease backtracking.  The first regex allows the code point to be at
@@ -1353,12 +1368,18 @@ my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
  my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
  my $AUXILIARY = 'auxiliary';
  
-# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
-# and into UCD.pl for the use of UCD.pm
+# Hashes and arrays that will eventually go into Heavy.pl for the use of
+# utf8_heavy.pl and into UCD.pl for the use of UCD.pm
  my %loose_to_file_of;       # loosely maps table names to their respective
                              # files
  my %stricter_to_file_of;    # same; but for stricter mapping.
  my %loose_property_to_file_of; # Maps a loose property name to its map file
+my @inline_definitions = "V0"; # Each element gives a definition of a unique
+                            # inversion list.  When a definition is inlined,
+                            # its value in the hash it's in (one of the two
+                            # defined just above) will include an index into
+                            # this array.  The 0th element is initialized to
+                            # the definition for a zero length invwersion list
  my %file_to_swash_name;     # Maps the file name to its corresponding key name
                              # in the hash %utf8::SwashInfo
  my %nv_floating_to_rational; # maps numeric values floating point numbers to
@@ -1531,7 +1552,8 @@ my $UNASSIGNED_TYPE = -2;
  my $PRIVATE_USE_TYPE = -3;
  my $NONCHARACTER_TYPE = -4;
  my $CONTROL_TYPE = -5;
-my $UNKNOWN_TYPE = -6;  # Used only if there is a bug in this program
+my $ABOVE_UNICODE_TYPE = -6;
+my $UNKNOWN_TYPE = -7;  # Used only if there is a bug in this program
  
  sub populate_char_info ($) {
      # Used only with the $annotate option.  Populates the arrays with the
@@ -1562,7 +1584,13 @@ sub populate_char_info ($) {
      my $end;
      if (! $viacode[$i]) {
          my $nonchar;
-        if ($gc-> table('Private_use')->contains($i)) {
+        if ($i > $MAX_UNICODE_CODEPOINT) {
+            $viacode[$i] = 'Above-Unicode';
+            $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
+            $printable[$i] = 0;
+            $end = $MAX_WORKING_CODEPOINT;
+        }
+        elsif ($gc-> table('Private_use')->contains($i)) {
              $viacode[$i] = 'Private Use';
              $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
              $printable[$i] = 0;
@@ -1715,7 +1743,15 @@ sub clarify_code_point_count ($) {
      # This is like clarify_number(), but the input is assumed to be a count of
      # code points, rather than a generic number.
  
-    return clarify_number(shift);
+    my $append = "";
+
+    my $number = shift;
+    if ($number > $MAX_UNICODE_CODEPOINTS) {
+        $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
+        return "All above-Unicode code points" if $number == 0;
+        $append = " + all above-Unicode code points";
+    }
+    return clarify_number($number) . $append;
  }
  
  package Carp;
@@ -1824,7 +1860,8 @@ package main;
  { # Closure
  
      # This program uses the inside-out method for objects, as recommended in
-    # "Perl Best Practices".  This closure aids in generating those.  There
+    # "Perl Best Practices".  (This is the best solution still, since this has
+    # to run under miniperl.)  This closure aids in generating those.  There
      # are two routines.  setup_package() is called once per package to set
      # things up, and then set_access() is called for each hash representing a
      # field in the object.  These routines arrange for the object to be
@@ -2476,11 +2513,26 @@ END
              $handle{$addr} = $file_handle; # Cache the open file handle
  
              if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
-                $_ = <$file_handle>;
-                if ($_ !~ / - $string_version \. /x) {
-                    chomp;
-                    $_ =~ s/^#\s*//;
-                    die Carp::my_carp("File '$file' is version '$_'.  It should be version $string_version");
+                if ($file !~ /^Unihan/i) {
+                    $_ = <$file_handle>;
+                    if ($_ !~ / - $string_version \. /x) {
+                        chomp;
+                        $_ =~ s/^#\s*//;
+                        die Carp::my_carp("File '$file' is version '$_'.  It should be version $string_version");
+                    }
+                }
+                else {
+                    while (<$file_handle>) {
+                        if ($_ !~ /^#/) {
+                            Carp::my_carp_bug("Could not find the expected version info in file '$file'");
+                            last;
+                        }
+                        chomp;
+                        $_ =~ s/^#\s*//;
+                        next if $_ !~ / version: /x;
+                        last if $_ =~ /$string_version/;
+                        die Carp::my_carp("File '$file' is '$_'.  It should be version $string_version");
+                    }
                  }
              }
          }
@@ -3450,7 +3502,7 @@ sub trace { return main::trace(@_); }
  
          # If the range list is empty, return a large value that isn't adjacent
          # to any that could be in the range list, for simpler tests
-        return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
+        return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
          return $ranges{$addr}->[0]->start;
      }
  
@@ -3729,9 +3781,6 @@ sub trace { return main::trace(@_); }
              Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
              return;
          }
-        if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
-            Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ".  Adding it anyway");
-        }
          #local $to_trace = 1 if main::DEBUG;
  
          if ($operation eq '-') {
@@ -4529,8 +4578,8 @@ sub trace { return main::trace(@_); }
  
          # And finally, add the gap from the end of the table to the max
          # possible code point
-        if ($max < $MAX_UNICODE_CODEPOINT) {
-            $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
+        if ($max < $MAX_WORKING_CODEPOINT) {
+            $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
          }
          return $new;
      }
@@ -4819,6 +4868,7 @@ sub trace { return main::trace(@_); }
                  # range.
                  my $end = $set->end;
                  return $end if is_code_point_usable($end, $try_hard);
+                $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
  
                  # End point didn't, work.  Start at the beginning and try
                  # every one until find one that does work.
@@ -4994,6 +5044,11 @@ sub trace { return main::trace(@_); }
      # The constructor can override the global flag of the same name.
      main::set_access('output_range_counts', \%output_range_counts, 'r');
  
+    my %write_as_invlist;
+    # A boolean set iff the output file for this table is to be in the form of
+    # an inversion list/map.
+    main::set_access('write_as_invlist', \%write_as_invlist, 'r');
+
      my %format;
      # The format of the entries of the table.  This is calculated from the
      # data in the table (or passed in the constructor).  This is an enum e.g.,
@@ -5030,6 +5085,7 @@ sub trace { return main::trace(@_); }
          $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
          $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
          $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
+        $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
          my $ucd = delete $args{'UCD'};
  
          my $description = delete $args{'Description'};
@@ -5547,6 +5603,7 @@ END
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
  
          my $addr = do { no overloading; pack 'J', $self; };
+        my $write_as_invlist = $write_as_invlist{$addr};
  
          # Start with the header
          my @HEADER = $self->header;
@@ -5599,16 +5656,23 @@ END
          else {
              my $range_size_1 = $range_size_1{$addr};
  
+            # To make it more readable, use a minimum indentation
+            my $comment_indent;
+
              # These are used only in $annotate option
              my $format;         # e.g. $HEX_ADJUST_FORMAT
              my $include_name;   # ? Include the character's name in the
                                  # annotation?
              my $include_cp;     # ? Include its code point
  
-            # To make it more readable, use a minimum indentation
-            my $comment_indent = 16;
-
-            if ($annotate) {
+            if (! $annotate) {
+                $comment_indent = ($self->isa('Map_Table'))
+                                  ? 24
+                                  : ($write_as_invlist)
+                                    ? 8
+                                    : 16;
+            }
+            else {
                  $format = $self->format;
  
                  # The name of the character is output only for tables that
@@ -5626,7 +5690,11 @@ END
                  # the first column
                  $include_cp = ! $range_size_1;
  
-                if ($self->isa('Map_Table')) {
+                if (! $self->isa('Map_Table')) {
+                    $comment_indent = ($write_as_invlist) ? 8 : 16;
+                }
+                else {
+                    $comment_indent = 16;
  
                      # There are just a few short ranges in this table, so no
                      # need to include the code point in the annotation.
@@ -5722,6 +5790,7 @@ END
              my $next_end;
              my $next_value;
              my $offset = 0;
+            my $invlist_count = 0;
  
              my $output_value_in_hex = $self->isa('Map_Table')
                                  && ($self->format eq $HEX_ADJUST_FORMAT
@@ -5853,11 +5922,36 @@ END
                          $previous_value = $value;
                      }
  
-                    # If there is a range
-                    if ($start != $end) {
-                        push @OUT, sprintf "$hex_format\t$hex_format",
-                                             $start,       $end;
-                        if ($value ne "") {
+                    if ($write_as_invlist) {
+
+                        # Inversion list format has a single number per line,
+                        # the starting code point of a range that matches the
+                        # property
+                        push @OUT, $start, "\n";
+                        $invlist_count++;
+
+                        # Add a comment with the size of the range, if
+                        # requested.
+                        if ($output_range_counts{$addr}) {
+                            $OUT[-1] = merge_single_annotation_line(
+                                    $OUT[-1],
+                                    "# ["
+                                      . main::clarify_code_point_count($end - $start + 1)
+                                      . "]\n",
+                                    $comment_indent);
+                        }
+                    }
+                    elsif ($start != $end) { # If there is a range
+                        if ($end == $MAX_WORKING_CODEPOINT) {
+                            push @OUT, sprintf "$hex_format\t$hex_format",
+                                                $start,
+                                                $MAX_PLATFORM_CODEPOINT;
+                        }
+                        else {
+                            push @OUT, sprintf "$hex_format\t$hex_format",
+                                                $start,       $end;
+                        }
+                        if (length $value) {
                              if ($convert_map_to_from_hex) {
                                  $OUT[-1] .= sprintf "\t$hex_format\n", $value;
                              }
@@ -5958,8 +6052,15 @@ END
                                  }
  
                                  if ($i != $start || $range_end < $end) {
-                                    $annotation = sprintf "%04X..%04X",
-                                                           $i,   $range_end;
+                                    if ($range_end < $MAX_WORKING_CODEPOINT)
+                                    {
+                                        $annotation = sprintf "%04X..%04X",
+                                                              $i,   $range_end;
+                                    }
+                                    else {
+                                        $annotation = sprintf "%04X..INFINITY",
+                                                               $i;
+                                    }
                                  }
                                  else { # Indent if not displaying code points
                                      $annotation = " " x 4;
@@ -6076,6 +6177,15 @@ END
                          }
                      }
  
+                    # Add the beginning of the range that doesn't match the
+                    # property, except if the just added match range extends
+                    # to infinity.  We do this after any annotations for the
+                    # match range.
+                    if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
+                        push @OUT, $end + 1, "\n";
+                        $invlist_count++;
+                    }
+
                      # If we split the range, set up so the next time through
                      # we get the remainder, and redo.
                      if ($next_start) {
@@ -6089,6 +6199,8 @@ END
              } # End of loop through all the table's ranges
  
              push @OUT, @annotation; # Add orphaned annotation, if any
+
+            splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
          }
  
          # Add anything that goes after the main body, but within the here
@@ -6368,6 +6480,7 @@ sub trace { return main::trace(@_); }
                                      Full_Name => $full_name,
                                      _Property => $property,
                                      _Range_List => $range_list,
+                                    Write_As_Invlist => 0,
                                      %args);
  
          my $addr = do { no overloading; pack 'J', $self; };
@@ -6614,7 +6727,8 @@ sub trace { return main::trace(@_); }
  # but its format and even its name or existence are subject to change without
  # notice in a future Perl version.  Don't use it directly.  Instead, its
  # contents are now retrievable through a stable API in the Unicode::UCD
-# module: Unicode::UCD::prop_invmap('$property_name').
+# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
+# code points can be retrieved via Unicode::UCD::charprop());
  END
          }
          return $return;
@@ -6741,7 +6855,7 @@ END
          }
          $comment .= "\nwhere 'cp' is $cp.";
          if ($ucd_accessible_name) {
-            $comment .= "  Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
+            $comment .= "  Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
          }
  
          # And append any commentary already set from the actual property.
@@ -6896,11 +7010,14 @@ END
                      }
                  }
  
-                # I (khw) have never waded through this line to
-                # understand it well enough to comment it.
+                # The unpack yields a list of the bytes that comprise the
+                # UTF-8 of $code_point, which are each placed in \xZZ format
+                # and output in the %s to map to $tostr, so the result looks
+                # like:
+                # "\xC4\xB0" => "\x{0069}\x{0307}",
                  my $utf8 = sprintf(qq["%s" => "$tostr",],
                          join("", map { sprintf "\\x%02X", $_ }
-                            unpack("U0C*", pack("U", $code_point))));
+                            unpack("U0C*", chr $code_point)));
  
                  # Add a comment so that a human reader can more easily
                  # see what's going on.
@@ -7290,6 +7407,7 @@ sub trace { return main::trace(@_); }
                                        _Property => $property,
                                        _Range_List => $range_list,
                                        Format => $EMPTY_FORMAT,
+                                      Write_As_Invlist => 1,
                                        );
          my $addr = do { no overloading; pack 'J', $self; };
  
@@ -7696,7 +7814,18 @@ END
          # Get the number of code points matched by each of the tables in this
          # file, and add underscores for clarity.
          my $count = $leader->count;
-        my $string_count = main::clarify_code_point_count($count);
+        my $unicode_count;
+        my $non_unicode_string;
+        if ($count > $MAX_UNICODE_CODEPOINTS) {
+            $unicode_count = $count - ($MAX_WORKING_CODEPOINT
+                                       - $MAX_UNICODE_CODEPOINT);
+            $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
+        }
+        else {
+            $unicode_count = $count;
+            $non_unicode_string = "";
+        }
+        my $string_count = main::clarify_code_point_count($unicode_count);
  
          my $loose_count = 0;        # how many aliases loosely matched
          my $compound_name = "";     # ? Are any names compound?, and if so, an
@@ -7894,11 +8023,13 @@ END
              }
          } # End of looping through all tables
  
+        $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
+
  
          my $code_points;
          my $match;
          my $any_of_these;
-        if ($count == 1) {
+        if ($unicode_count == 1) {
              $match = 'matches';
              $code_points = 'single code point';
          }
@@ -7998,14 +8129,25 @@ END
  
          if ($count) {   # The format differs if no code points, and needs no
                          # explanation in that case
+            if ($leader->write_as_invlist) {
                  $comment.= <<END;
  
-The format of the lines of this file is:
+The first data line of this file begins with the letter V to indicate it is in
+inversion list format.  The number following the V gives the number of lines
+remaining.  Each of those remaining lines is a single number representing the
+starting code point of a range which goes up to but not including the number
+on the next line; The 0th, 2nd, 4th... ranges are for code points that match
+the property; the 1st, 3rd, 5th... are ranges of code points that don't match
+the property.  The final line's range extends to the platform's infinity.
  END
-            $comment.= <<END;
+            }
+            else {
+                $comment.= <<END;
+The format of the lines of this file is:
  START\\tSTOP\\twhere START is the starting code point of the range, in hex;
  STOP is the ending point, or if omitted, the range has just one code point.
  END
+            }
              if ($leader->output_range_counts) {
                  $comment .= <<END;
  Numbers in comments in [brackets] indicate how many code points are in the
@@ -9132,7 +9274,7 @@ sub utf8_heavy_name ($$) {
  {   # Closure
  
      my $indent_increment = " " x (($debugging_build) ? 2 : 0);
-    my %already_output;
+    %main::already_output = ();
  
      $main::simple_dumper_nesting = 0;
  
@@ -9152,7 +9294,7 @@ sub utf8_heavy_name ($$) {
          # nesting level is localized, so that as the call stack pops, it goes
          # back to the prior value.
          local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
-        undef %already_output if $main::simple_dumper_nesting == 0;
+        local %main::already_output = %main::already_output;
          $main::simple_dumper_nesting++;
          #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
  
@@ -9183,10 +9325,10 @@ sub utf8_heavy_name ($$) {
  
              # Keep track of cycles in the input, and refuse to infinitely loop
              my $addr = do { no overloading; pack 'J', $item; };
-            if (defined $already_output{$addr}) {
+            if (defined $main::already_output{$addr}) {
                  return "${indent}ALREADY OUTPUT: $item\n";
              }
-            $already_output{$addr} = $item;
+            $main::already_output{$addr} = $item;
  
              if (ref $item eq 'ARRAY') {
                  my $using_brackets;
@@ -12999,7 +13141,7 @@ END
                  # This fills in any missing values with the default.  It's not
                  # necessary to do this with binary properties, as the default
                  # is defined completely in terms of the Y table.
-                $property->add_map(0, $MAX_UNICODE_CODEPOINT,
+                $property->add_map(0, $MAX_WORKING_CODEPOINT,
                                     $default_map, Replace => $NO);
              }
          }
@@ -13211,8 +13353,9 @@ sub compile_perl() {
      # 'All' is all code points.  As an error check, instead of just setting it
      # to be that, construct it to be the union of all the major categories
      $All = $perl->add_match_table('All',
-            Description  => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
-            Matches_All => 1);
+      Description
+        => "All code points, including those above Unicode.  Same as qr/./s",
+      Matches_All => 1);
  
      foreach my $major_table ($gc->tables) {
  
@@ -13222,10 +13365,10 @@ sub compile_perl() {
          $All += $major_table;
      }
  
-    if ($All->max != $MAX_UNICODE_CODEPOINT) {
+    if ($All->max != $MAX_WORKING_CODEPOINT) {
          Carp::my_carp_bug("Generated highest code point ("
             . sprintf("%X", $All->max)
-           . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
+           . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
      }
      if ($All->range_count != 1 || $All->min != 0) {
       Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
@@ -13276,7 +13419,7 @@ sub compile_perl() {
      # have Uppercase and Lowercase defined, so use the general category
      # instead for them, modified by hard-coding in the code points each is
      # missing.
-    my $Lower = $perl->add_match_table('Lower');
+    my $Lower = $perl->add_match_table('XPosixLower');
      my $Unicode_Lower = property_ref('Lowercase');
      if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
          $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
@@ -13314,13 +13457,12 @@ sub compile_perl() {
              $Lower += $code_point;
          }
      }
-    $Lower->add_alias('XPosixLower');
      my $Posix_Lower = $perl->add_match_table("PosixLower",
                              Description => "[a-z]",
                              Initialize => $Lower & $ASCII,
                              );
  
-    my $Upper = $perl->add_match_table('Upper');
+    my $Upper = $perl->add_match_table("XPosixUpper");
      my $Unicode_Upper = property_ref('Uppercase');
      if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
          $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
@@ -13333,7 +13475,6 @@ sub compile_perl() {
          $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
          $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
      }
-    $Upper->add_alias('XPosixUpper');
      my $Posix_Upper = $perl->add_match_table("PosixUpper",
                              Description => "[A-Z]",
                              Initialize => $Upper & $ASCII,
@@ -13455,7 +13596,7 @@ sub compile_perl() {
      # one whose name generally begins with Posix that is posix-compliant, and
      # one that matches Unicode characters beyond the Posix, ASCII range
  
-    my $Alpha = $perl->add_match_table('Alpha');
+    my $Alpha = $perl->add_match_table('XPosixAlpha');
  
      # Alphabetic was not present in early releases
      my $Alphabetic = property_ref('Alphabetic');
@@ -13542,7 +13683,6 @@ sub compile_perl() {
          $Alpha->add_description('Alphabetic');
          $Alpha->add_alias('Alphabetic');
      }
-    $Alpha->add_alias('XPosixAlpha');
      my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
                              Description => "[A-Za-z]",
                              Initialize => $Alpha & $ASCII,
@@ -13550,22 +13690,20 @@ sub compile_perl() {
      $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
      $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
  
-    my $Alnum = $perl->add_match_table('Alnum',
+    my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
                          Description => 'Alphabetic and (decimal) Numeric',
                          Initialize => $Alpha + $gc->table('Decimal_Number'),
                          );
-    $Alnum->add_alias('XPosixAlnum');
      $perl->add_match_table("PosixAlnum",
                              Description => "[A-Za-z0-9]",
                              Initialize => $Alnum & $ASCII,
                              );
  
-    my $Word = $perl->add_match_table('Word',
+    my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
                                  Description => '\w, including beyond ASCII;'
                                              . ' = \p{Alnum} + \pM + \p{Pc}',
                                  Initialize => $Alnum + $gc->table('Mark'),
                                  );
-    $Word->add_alias('XPosixWord');
      my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
      if (defined $Pc) {
          $Word += $Pc;
@@ -13582,13 +13720,13 @@ sub compile_perl() {
      }
  
      # This is a Perl extension, so the name doesn't begin with Posix.
-    my $PerlWord = $perl->add_match_table('PerlWord',
+    my $PerlWord = $perl->add_match_table('PosixWord',
                      Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
                      Initialize => $Word & $ASCII,
                      );
-    $PerlWord->add_alias('PosixWord');
+    $PerlWord->add_alias('PerlWord');
  
-    my $Blank = $perl->add_match_table('Blank',
+    my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
                                  Description => '\h, Horizontal white space',
  
                                  # 200B is Zero Width Space which is for line
@@ -13599,7 +13737,6 @@ sub compile_perl() {
                                              -   0x200B, # ZWSP
                                  );
      $Blank->add_alias('HorizSpace');        # Another name for it.
-    $Blank->add_alias('XPosixBlank');
      $perl->add_match_table("PosixBlank",
                              Description => "\\t and ' '",
                              Initialize => $Blank & $ASCII,
@@ -13618,34 +13755,22 @@ sub compile_perl() {
                      );
      # No Posix equivalent for vertical space
  
-    my $Space = $perl->add_match_table('Space',
+    my $Space = $perl->add_match_table('XPosixSpace',
                  Description => '\s including beyond ASCII and vertical tab',
                  Initialize => $Blank + $VertSpace,
      );
-    $Space->add_alias('XPosixSpace');
-    my $posix_space = $perl->add_match_table("PosixSpace",
+    $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
+    $Space->add_alias('SpacePerl');
+
+    my $Posix_space = $perl->add_match_table("PosixSpace",
                              Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
                              Initialize => $Space & $ASCII,
                              );
+    $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
  
-    # Perl's traditional space doesn't include Vertical Tab prior to v5.18
-    my $XPerlSpace = $perl->add_match_table('XPerlSpace',
-                                  Description => '\s, including beyond ASCII',
-                                  Initialize => $Space,
-                                  #Initialize => $Space
-                                  # - utf8::unicode_to_native(0x0B]
-                                );
-    $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
-    my $PerlSpace = $perl->add_match_table('PerlSpace',
-                        Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
-                        Initialize => $XPerlSpace & $ASCII,
-                            );
-
-
-    my $Cntrl = $perl->add_match_table('Cntrl',
+    my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
                                          Description => 'Control characters');
      $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
-    $Cntrl->add_alias('XPosixCntrl');
      $perl->add_match_table("PosixCntrl",
                              Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
                              Initialize => $Cntrl & $ASCII,
@@ -13658,22 +13783,20 @@ sub compile_perl() {
      $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
  
      # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
-    my $Graph = $perl->add_match_table('Graph',
+    my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
                          Description => 'Characters that are graphical',
                          Initialize => ~ ($Space + $controls),
                          );
-    $Graph->add_alias('XPosixGraph');
      $perl->add_match_table("PosixGraph",
                              Description =>
                                  '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
                              Initialize => $Graph & $ASCII,
                              );
  
-    $print = $perl->add_match_table('Print',
+    $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
                          Description => 'Characters that are graphical plus space characters (but no controls)',
                          Initialize => $Blank + $Graph - $gc->table('Control'),
                          );
-    $print->add_alias('XPosixPrint');
      $perl->add_match_table("PosixPrint",
                              Description =>
                                '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
@@ -13695,18 +13818,16 @@ sub compile_perl() {
          Initialize => $ASCII & $XPosixPunct,
          );
  
-    my $Digit = $perl->add_match_table('Digit',
+    my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
                              Description => '[0-9] + all other decimal digits');
      $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
-    $Digit->add_alias('XPosixDigit');
      my $PosixDigit = $perl->add_match_table("PosixDigit",
                                              Description => '[0-9]',
                                              Initialize => $Digit & $ASCII,
                                              );
  
      # Hex_Digit was not present in first release
-    my $Xdigit = $perl->add_match_table('XDigit');
-    $Xdigit->add_alias('XPosixXDigit');
+    my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
      my $Hex = property_ref('Hex_Digit');
      if (defined $Hex && ! $Hex->is_empty) {
          $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
@@ -13735,12 +13856,90 @@ sub compile_perl() {
      my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
                      Description => "Code points that particpate in some fold",
                      );
-    #
-    foreach my $range (property_ref('Case_Folding')->ranges) {
-        $any_folds->add_range($range->start, $range->end);
-        foreach my $hex_code_point (split " ", $range->value) {
-            my $code_point = hex $hex_code_point;
-            $any_folds->add_range($code_point, $code_point);
+    my $loc_problem_folds = $perl->add_match_table(
+               "_Perl_Problematic_Locale_Folds",
+               Description =>
+                   "Code points that are in some way problematic under locale",
+    );
+
+    # This allows regexec.c to skip some work when appropriate.  Some of the
+    # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
+    my $loc_problem_folds_start = $perl->add_match_table(
+               "_Perl_Problematic_Locale_Foldeds_Start",
+               Description =>
+                   "The first character of every sequence in _Perl_Problematic_Locale_Folds",
+    );
+
+    my $cf = property_ref('Case_Folding');
+
+    # Every character 0-255 is problematic because what each folds to depends
+    # on the current locale
+    $loc_problem_folds->add_range(0, 255);
+    $loc_problem_folds_start += $loc_problem_folds;
+
+    # Also problematic are anything these fold to outside the range.  Likely
+    # forever the only thing folded to by these outside the 0-255 range is the
+    # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
+    # completely general, which should catch any unexpected changes or errors.
+    # We look at each code point 0-255, and add its fold (including each part
+    # of a multi-char fold) to the list.  See commit message
+    # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
+    # of the MU issue.
+    foreach my $range ($loc_problem_folds->ranges) {
+        foreach my $code_point($range->start .. $range->end) {
+            my $fold_range = $cf->containing_range($code_point);
+            next unless defined $fold_range;
+
+            my @hex_folds = split " ", $fold_range->value;
+            my $start_cp = hex $hex_folds[0];
+            foreach my $i (0 .. @hex_folds - 1) {
+                my $cp = hex $hex_folds[$i];
+                next unless $cp > 255;    # Already have the < 256 ones
+
+                $loc_problem_folds->add_range($cp, $cp);
+                $loc_problem_folds_start->add_range($start_cp, $start_cp);
+            }
+        }
+    }
+
+    my $folds_to_multi_char = $perl->add_match_table(
+         "_Perl_Folds_To_Multi_Char",
+         Description =>
+              "Code points whose fold is a string of more than one character",
+    );
+
+    # Look through all the known folds to populate these tables.
+    foreach my $range ($cf->ranges) {
+        my $start = $range->start;
+        my $end = $range->end;
+        $any_folds->add_range($start, $end);
+
+        my @hex_folds = split " ", $range->value;
+        if (@hex_folds > 1) {   # Is multi-char fold
+            $folds_to_multi_char->add_range($start, $end);
+        }
+
+        my $found_locale_problematic = 0;
+
+        # Look at each of the folded-to characters...
+        foreach my $i (0 .. @hex_folds - 1) {
+            my $cp = hex $hex_folds[$i];
+            $any_folds->add_range($cp, $cp);
+
+            # The fold is problematic if any of the folded-to characters is
+            # already considered problematic.
+            if ($loc_problem_folds->contains($cp)) {
+                $loc_problem_folds->add_range($start, $end);
+                $found_locale_problematic = 1;
+            }
+        }
+
+        # If this is a problematic fold, add to the start chars the
+        # folding-from characters and first folded-to character.
+        if ($found_locale_problematic) {
+            $loc_problem_folds_start->add_range($start, $end);
+            my $cp = hex $hex_folds[0];
+            $loc_problem_folds_start->add_range($cp, $cp);
          }
      }
  
@@ -14284,8 +14483,9 @@ END
              $unassigned_sans_noncharacters &= $nonchars->table('N');
          }
  
-        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
+        for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
              $i = populate_char_info($i);    # Note sets $i so may cause skips
+
          }
      }
  
@@ -14625,8 +14825,9 @@ sub register_file_for_name($$$) {
      # Join all the file path components together, using slashes.
      my $full_filename = join('/', @$directory_ref, $file);
  
-    # All go in the same subdirectory of unicore
-    if ($directory_ref->[0] ne $matches_directory) {
+    # All go in the same subdirectory of unicore, or the special
+    # pseudo-directory '#'
+    if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
          Carp::my_carp("Unexpected directory in "
                  .  join('/', @{$directory_ref}, $file));
      }
@@ -14937,7 +15138,18 @@ sub make_re_pod_entries($) {
      my $full_name = $property->full_name;
  
      my $count = $input_table->count;
-    my $string_count = clarify_number($count);
+    my $unicode_count;
+    my $non_unicode_string;
+    if ($count > $MAX_UNICODE_CODEPOINTS) {
+        $unicode_count = $count - ($MAX_WORKING_CODEPOINT
+                                    - $MAX_UNICODE_CODEPOINT);
+        $non_unicode_string = " plus all above-Unicode code points";
+    }
+    else {
+        $unicode_count = $count;
+        $non_unicode_string = "";
+    }
+    my $string_count = clarify_number($unicode_count) . $non_unicode_string;
      my $status = $input_table->status;
      my $status_info = $input_table->status_info;
      my $caseless_equivalent = $input_table->caseless_equivalent;
@@ -15004,6 +15216,8 @@ sub make_re_pod_entries($) {
              # property always use the single form.
              if ($table_property == $perl) {
                  $entry = "\\p{$name}";
+                $entry .= " \\p$name" if length $name == 1; # Show non-braced
+                                                            # form too
                  $entry_ref = "\\p{$name}";
              }
              else {    # Compound form.
@@ -15189,7 +15403,7 @@ sub make_re_pod_entries($) {
                      }
                  }
  
-                # Ouput both short and single in the same parenthesized
+                # Output both short and single in the same parenthesized
                  # expression, but with only one of 'Single', 'Short' if there
                  # are both items.
                  if ($short_name || $single_form || $table->conflicting) {
@@ -15870,6 +16084,9 @@ the right column will give its longer, more descriptive name; and if the left
  column is the longest name, the right column will show any equivalent shortest
  name, in both single and compound forms if applicable.
  
+If braces are not needed to specify a property (e.g., C<\\pL>), the left
+column contains both forms, with and without braces.
+
  The right column will also caution you if a property means something different
  than what might normally be expected.
  
@@ -15959,9 +16176,14 @@ $zero_matches
  
  =head1 Properties accessible through Unicode::UCD
  
-All the Unicode character properties mentioned above (except for those marked
-as for internal use by Perl) are also accessible by
-L<Unicode::UCD/prop_invlist()>.
+The value of any Unicode (not including Perl extensions) character
+property mentioned above for any single code point is available through
+L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
+values of all the Unicode properties for a given code point.
+
+Besides these, all the Unicode character properties mentioned above
+(except for those marked as for internal use by Perl) are also
+accessible by L<Unicode::UCD/prop_invlist()>.
  
  Due to their nature, not all Unicode character properties are suitable for
  regular expression matches, nor C<prop_invlist()>.  The remaining
@@ -16091,6 +16313,9 @@ sub make_Heavy () {
      my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
      chomp $stricter_to_file_of;
  
+    my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
+    chomp $inline_definitions;
+
      my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
      chomp $loose_to_file_of;
  
@@ -16106,7 +16331,13 @@ sub make_Heavy () {
      # for the alternate table figured out at that time.
      foreach my $cased (keys %caseless_equivalent_to) {
          my @path = $caseless_equivalent_to{$cased}->file_path;
-        my $path = join '/', @path[1, -1];
+        my $path;
+        if ($path[0] eq "#") {  # Pseudo-directory '#'
+            $path = join '/', @path;
+        }
+        else {  # Gets rid of lib/
+            $path = join '/', @path[1, -1];
+        }
          $caseless_equivalent_to{$cased} = $path;
      }
      my $caseless_equivalent_to
@@ -16132,12 +16363,22 @@ $INTERNAL_ONLY_HEADER
  $loose_property_name_of
  );
  
-# Maps property, table to file for those using stricter matching
+# Gives the definitions (in the form of inversion lists) for those properties
+# whose definitions aren't kept in files
+\@utf8::inline_definitions = (
+$inline_definitions
+);
+
+# Maps property, table to file for those using stricter matching.  For paths
+# whose directory is '#', the file is in the form of a numeric index into
+# \@inline_definitions
  \%utf8::stricter_to_file_of = (
  $stricter_to_file_of
  );
  
-# Maps property, table to file for those using loose matching
+# Maps property, table to file for those using loose matching.  For paths
+# whose directory is '#', the file is in the form of a numeric index into
+# \@inline_definitions
  \%utf8::loose_to_file_of = (
  $loose_to_file_of
  );
@@ -16689,9 +16930,9 @@ sub write_all_tables() {
      # don't care if both defined, as they had better be different anyway.)
      # The property named 'Perl' needs to be first (it doesn't have any
      # immutable file name) because empty properties are defined in terms of
-    # it's table named 'All'.)   We also sort by the property's name.  This is
-    # just for repeatability of the outputs between runs of this program, but
-    # does not affect correctness.
+    # its table named 'All' under the -annotate option.)   We also sort by
+    # the property's name.  This is just for repeatability of the outputs
+    # between runs of this program, but does not affect correctness.
      PROPERTY:
      foreach my $property ($perl,
                            sort { return -1 if defined $a->file;
@@ -16807,8 +17048,12 @@ sub write_all_tables() {
                      push @unhandled_properties, "$table";
                  }
  
-                # An empty table is just the complement of everything.
-                $table->set_complement($All) if $table != $property;
+                # The old way of expressing an empty match list was to
+                # complement the list that matches everything.  The new way is
+                # to create an empty inversion list, but this doesn't work for
+                # annotating, so use the old way then.
+                $table->set_complement($All) if $annotate
+                                                && $table != $property;
              }
              elsif ($expected_empty) {
                  my $because = "";
@@ -16840,13 +17085,13 @@ sub write_all_tables() {
  
              my $count = $table->count;
              if ($expected_full) {
-                if ($count != $MAX_UNICODE_CODEPOINTS) {
+                if ($count != $MAX_WORKING_CODEPOINTS) {
                      Carp::my_carp("$table matches only "
                      . clarify_number($count)
                      . " Unicode code points but should match "
-                    . clarify_number($MAX_UNICODE_CODEPOINTS)
+                    . clarify_number($MAX_WORKING_CODEPOINTS)
                      . " (off by "
-                    .  clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
+                    .  clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
                      . ").  Proceeding anyway.");
                  }
  
@@ -17028,8 +17273,6 @@ sub write_all_tables() {
                      # Similarly, we create for Unicode::UCD a list of
                      # property-value aliases.
  
-                    my $property_full_name = $property->full_name;
-
                      # Look at each table in the property...
                      foreach my $table ($property->tables) {
                          my @values_list;
@@ -17068,7 +17311,7 @@ sub write_all_tables() {
                          }
  
                          # To save memory, unlike the similar list for property
-                        # aliases above, only the standard forms hve the list.
+                        # aliases above, only the standard forms have the list.
                          # This forces an extra step of converting from input
                          # name to standard name, but the savings are
                          # considerable.  (There is only marginal savings if we
@@ -17096,8 +17339,56 @@ sub write_all_tables() {
          my $filename;
          my $property = $table->property;
          my $is_property = ($table == $property);
-        if (! $is_property) {
  
+        # For very short tables, instead of writing them out to actual files,
+        # we in-line their inversion list definitions into Heavy.pl.  The
+        # definition replaces the file name, and the special pseudo-directory
+        # '#' is used to signal this.  This significantly cuts down the number
+        # of files written at little extra cost to the hashes in Heavy.pl.
+        # And it means, no run-time files to read to get the definitions.
+        if (! $is_property
+            && ! $annotate  # For annotation, we want to explicitly show
+                            # everything, so keep in files
+            && $table->ranges <= 3)
+        {
+            my @ranges = $table->ranges;
+            my $count = @ranges;
+            if ($count == 0) {  # 0th index reserved for 0-length lists
+                $filename = 0;
+            }
+            elsif ($table->leader != $table) {
+
+                # Here, is a table that is equivalent to another; code
+                # in register_file_for_name() causes its leader's definition
+                # to be used
+
+                next;
+            }
+            else {  # No equivalent table so far.
+
+                # Build up its definition range-by-range.
+                my $definition = "";
+                while (defined (my $range = shift @ranges)) {
+                    my $end = $range->end;
+                    if ($end < $MAX_WORKING_CODEPOINT) {
+                        $count++;
+                        $end = "\n" . ($end + 1);
+                    }
+                    else {  # Extends to infinity, hence no 'end'
+                        $end = "";
+                    }
+                    $definition .= "\n" . $range->start . $end;
+                }
+                $definition = "V$count" . $definition;
+                $filename = @inline_definitions;
+                push @inline_definitions, $definition;
+            }
+            @directory = "#";
+            register_file_for_name($table, \@directory, $filename);
+            next;
+        }
+
+        if (! $is_property) {
              # Match tables for the property go in lib/$subdirectory, which is
              # the property's name.  Don't use the standard file name for this,
              # as may get an unfamiliar alias
@@ -17588,10 +17879,12 @@ sub make_property_test_script() {
  
      &write($t_path,
             0,           # Not utf8;
-           [<DATA>,
+           [$HEADER,
+            <DATA>,
              @output,
              (map {"Test_X('$_');\n"} @backslash_X_tests),
-            "Finished();\n"]);
+            "Finished();\n"
+           ]);
      return;
  }
  
@@ -18305,11 +18598,6 @@ __DATA__
  use strict;
  use warnings;
  
-# If run outside the normal test suite on an ASCII platform, you can
-# just create a latin1_to_native() function that just returns its
-# inputs, because that's the only function used from test.pl
-require "test.pl";
-
  # Test qr/\X/ and the \p{} regular expression constructs.  This file is
  # constructed by mktables from the tables it generates, so if mktables is
  # buggy, this won't necessarily catch those bugs.  Tests are generated for all
@@ -18459,7 +18747,7 @@ sub Test_X($) {
          my $this_string = "";
          my $this_display = "";
          foreach my $code_point (@code_points) {
-            $this_string .= latin1_to_native(chr(hex $code_point));
+            $this_string .= chr utf8::unicode_to_native(hex $code_point);
              $this_display .= "\\x{$code_point}";
          }