perluniprops/mktables: Add Definition concept

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 6b6851f..13732d3 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -36,6 +36,17 @@ my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  
  sub NON_ASCII_PLATFORM { ord("A") != 65 }
  
+# When a new version of Unicode is published, unfortunately the algorithms for
+# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
+# manually.  The changes may or may not be backward compatible with older
+# releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
+# changes, then come back here and set the variable below to what version the
+# code is expecting.  If a newer version of Unicode is being compiled than
+# expected, a warning will be generated.  If an older version is being
+# compiled, any bounds tests that fail in the generated test file (-maketest
+# option) will be marked as TODO.
+my $version_of_mk_invlist_bounds = v10.0.0;
+
  ##########################################################################
  #
  # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
@@ -124,7 +135,7 @@ my $map_directory = 'To';        # Where map files go.
  # each one of the tens of thousands individually.
  #
  # In a match table, the value of a range is irrelevant (and hence the type as
-# well, which will always be 0), and arbitrarily set to the null string.
+# well, which will always be 0), and arbitrarily set to the empty string.
  # Using the example above, there would be two match tables for those two
  # entries, one named Upper would contain the 0x41..0x5A range, and the other
  # named Lower would contain 0x61..0x7A.
@@ -352,6 +363,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  #
  # trace ... if main::DEBUG && $to_trace;
  #
+# main::stack_trace() will display what its name implies
+#
  # If there is just one or a few files that you're debugging, you can easily
  # cause most everything else to be skipped.  Change the line
  #
@@ -409,7 +422,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  #
  # A NOTE ON UNIHAN
  #
-# This program can generate tables from the Unihan database.  But that db
+# This program can generate tables from the Unihan database.  But that DB
  # isn't normally available, so it is marked as optional.  Prior to version
  # 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
  # was split into 8 different files, all beginning with the letters 'Unihan'.
@@ -477,8 +490,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  # handled by Unicode::Normalize, nor will it compile when presented a version
  # that has them.  However, you can trivially get it to compile by simply
  # ignoring those decompositions, by changing the croak to a carp.  At the time
-# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
-# cpan/Unicode-Normalize/mkheader) reads
+# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
+# dist/Unicode-Normalize/mkheader) reads
  #
  #   croak("Weird Canonical Decomposition of U+$h");
  #
@@ -617,12 +630,28 @@ our $to_trace = 0;
      }
  }
  
+sub stack_trace() {
+    local $to_trace = 1 if main::DEBUG;
+    my $line = (caller(0))[2];
+    my $i = 1;
+
+    # Accumulate the stack trace
+    while (1) {
+        my ($pkg, $file, $caller_line, $caller) = caller $i++;
+
+        last unless defined $caller;
+
+        trace "called from $caller() at line $line";
+        $line = $caller_line;
+    }
+}
+
  # This is for a rarely used development feature that allows you to compare two
  # versions of the Unicode standard without having to deal with changes caused
  # by the code points introduced in the later version.  You probably also want
  # to use the -annotate option when using this.  Run this program on a unicore
  # containing the starting release you want to compare.  Save that output
-# structrue.  Then, switching to a unicore with the ending release, change the
+# structure.  Then, switching to a unicore with the ending release, change the
  # 0 in the $string_compare_versions definition just below to a string
  # containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
  # to the starting release.  This program will then compile, but throw away all
@@ -631,7 +660,7 @@ our $to_trace = 0;
  # common to both releases, and you can see the changes caused just by the
  # underlying release semantic changes.  For versions earlier than 3.2, you
  # must copy a version of DAge.txt into the directory.
-my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
+my $string_compare_versions = DEBUG && ""; #  e.g., "2.1";
  my $compare_versions = DEBUG
                         && $string_compare_versions
                         && pack "C*", split /\./, $string_compare_versions;
@@ -702,7 +731,8 @@ while (@ARGV) {
          $verbosity = 0;
      }
      elsif ($arg eq '-w') {
-        $write_unchanged_files = 1; # update the files even if havent changed
+        # update the files even if they haven't changed
+        $write_unchanged_files = 1;
      }
      elsif ($arg eq '-check') {
          my $this = shift @ARGV;
@@ -1110,17 +1140,17 @@ my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
  my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
  my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
  
-# We work with above-Unicode code points, up to UV_MAX.   But when you get
-# that high, above IV_MAX, some operations don't work, and you can easily get
-# overflow.  Therefore for internal use, we use a much smaller number,
-# translating it to UV_MAX only for output.  The exact number is immaterial
-# (all Unicode code points are treated exactly the same), but the algorithm
-# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
+# We work with above-Unicode code points, up to IV_MAX, but we may want to use
+# sentinels above that number.  Therefore for internal use, we use a much
+# smaller number, translating it to IV_MAX only for output.  The exact number
+# is immaterial (all above-Unicode code points are treated exactly the same),
+# but the algorithm requires it to be at least
+# 2 * $MAX_UNICODE_CODEPOINTS + 1
  my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
  my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
  my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
  
-my $MAX_PLATFORM_CODEPOINT = ~0;
+my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
  
  # Matches legal code point.  4-6 hex numbers, If there are 6, the first
  # two must be 10; if there are 5, the first must not be a 0.  Written this way
@@ -1132,7 +1162,7 @@ my $run_on_code_point_re =
              qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
  my $code_point_re = qr/\b$run_on_code_point_re/;
  
-# This matches the beginning of the line in the Unicode db files that give the
+# This matches the beginning of the line in the Unicode DB files that give the
  # defaults for code points not listed (i.e., missing) in the file.  The code
  # depends on this ending with a semi-colon, so it can assume it is a valid
  # field when the line is split() by semi-colons
@@ -1424,6 +1454,7 @@ my $Assigned;   # All assigned characters in this Unicode release
  my $DI;         # Default_Ignorable_Code_Point property
  my $NChar;      # Noncharacter_Code_Point property
  my $script;
+my $scx;        # Script_Extensions property
  
  # Are there conflicting names because of beginning with 'In_', or 'Is_'
  my $has_In_conflicts = 0;
@@ -1499,7 +1530,7 @@ sub populate_char_info ($) {
  
      $viacode[$i] = $perl_charname->value_of($i) || "";
      $age[$i] = (defined $age)
-               ? (($age->value_of($i) =~ / ^ \d \. \d $ /x)
+               ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
                    ? $age->value_of($i)
                    : "")
                 : "";
@@ -2075,7 +2106,7 @@ package Input_file;
  #   1) call before the first line is read, for pre processing
  #   2) call to adjust each line of the input before the main handler gets
  #      them.  This can be automatically generated, if appropriately simple
-#      enough, by specifiying a Properties parameter in the constructor.
+#      enough, by specifying a Properties parameter in the constructor.
  #   3) call upon EOF before the main handler exits its loop
  #   4) call at the end, for post processing
  #
@@ -2083,7 +2114,7 @@ package Input_file;
  # each_line_handler()s.  So, if the format of the line is not in the desired
  # format for the main handler, these are used to do that adjusting.  They can
  # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
-# so the $_ output of one is used as the input to the next.  The eof handler
+# so the $_ output of one is used as the input to the next.  The EOF handler
  # is also stackable, but none of the others are, but could easily be changed
  # to be so.
  #
@@ -2167,7 +2198,7 @@ sub trace { return main::trace(@_); }
      # not otherwise be processed, and to not raise a warning about not being
      # handled.  In the constructor call, any value that evaluates to a numeric
      # 0 or undef means don't skip.  Any other value is a string giving the
-    # reason it is being skippped, and this will appear in generated pod.
+    # reason it is being skipped, and this will appear in generated pod.
      # However, an empty string reason will suppress the pod entry.
      # Internally, calls that evaluate to numeric 0 are changed into undef to
      # distinguish them from an empty string call.
@@ -2180,11 +2211,20 @@ sub trace { return main::trace(@_); }
      # 'handler'
      main::set_access('each_line_handler', \%each_line_handler, 'c');
  
+    my %retain_trailing_comments;
+    # This is used to not discard the comments that end data lines.  This
+    # would be used only for files with non-typical syntax, and most code here
+    # assumes that comments have been stripped, so special handlers would have
+    # to be written.  It is assumed that the code will use these in
+    # single-quoted contexts, and so any "'" marks in the comment will be
+    # prefixed by a backslash.
+    main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
+
      my %properties; # Optional ordered list of the properties that occur in each
      # meaningful line of the input file.  If present, an appropriate
      # each_line_handler() is automatically generated and pushed onto the stack
      # of such handlers.  This is useful when a file contains multiple
-    # proerties per line, but no other special considerations are necessary.
+    # properties per line, but no other special considerations are necessary.
      # The special value "<ignored>" means to discard the corresponding input
      # field.
      # Any @missing lines in the file should also match this syntax; no such
@@ -2337,6 +2377,7 @@ sub trace { return main::trace(@_); }
  
          # Set defaults
          $handler{$addr} = \&main::process_generic_property_file;
+        $retain_trailing_comments{$addr} = 0;
          $non_skip{$addr} = 0;
          $skip{$addr} = undef;
          $has_missings_defaults{$addr} = $NO_DEFAULTS;
@@ -3002,9 +3043,21 @@ END
                  next;
              }
  
-            # Remove comments and trailing space, and skip this line if the
-            # result is empty
-            s/#.*//;
+            # Unless to keep, remove comments.  If to keep, ignore
+            # comment-only lines
+            if ($retain_trailing_comments{$addr}) {
+                next if / ^ \s* \# /x;
+
+                # But escape any single quotes (done in both the comment and
+                # non-comment portion; this could be a bug someday, but not
+                # likely)
+                s/'/\\'/g;
+            }
+            else {
+                s/#.*//;
+            }
+
+            # Remove trailing space, and skip this line if the result is empty
              s/\s+$//;
              next if /^$/;
  
@@ -5272,6 +5325,14 @@ use parent '-norequire', '_Range_List_Base';
          return $self->_add_delete('+', @_);
      }
  
+    sub replace_map {
+        # Replace a range
+
+        my $self = shift;
+
+        return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
+    }
+
      sub add_duplicate {
          # Adds entry to a range list which can duplicate an existing entry
  
@@ -5412,6 +5473,15 @@ sub trace { return main::trace(@_); }
      # used to override calculations.
      main::set_access('format', \%format, 'r', 'p_s');
  
+    my %has_dependency;
+    # A boolean that gives whether some other table in this property is
+    # defined as the complement of this table.  This is a crude, but currently
+    # sufficient, mechanism to make this table not get destroyed before what
+    # is dependent on it is.  Other dependencies could be added, so the name
+    # was chosen to reflect a more general situation than actually is
+    # currently the case.
+    main::set_access('has_dependency', \%has_dependency, 'r', 's');
+
      sub new {
          # All arguments are key => value pairs, which you can see below, most
          # of which match fields documented above.  Otherwise: Re_Pod_Entry,
@@ -5467,6 +5537,7 @@ sub trace { return main::trace(@_); }
          $note{$addr} = [ ];
          $file_path{$addr} = [ ];
          $locked{$addr} = "";
+        $has_dependency{$addr} = 0;
  
          push @{$description{$addr}}, $description if $description;
          push @{$note{$addr}}, $note if $note;
@@ -6290,6 +6361,22 @@ END
                      }
  
                      if ($write_as_invlist) {
+                        if (   $previous_end > 0
+                            && $output_range_counts{$addr})
+                        {
+                            my $complement_count = $start - $previous_end - 1;
+                            if ($complement_count > 1) {
+                                $OUT[-1] = merge_single_annotation_line(
+                                    $OUT[-1],
+                                       "#"
+                                     . (" " x 17)
+                                     . "["
+                                     .  main::clarify_code_point_count(
+                                                            $complement_count)
+                                      . "] in complement\n",
+                                    $comment_indent);
+                            }
+                        }
  
                          # Inversion list format has a single number per line,
                          # the starting code point of a range that matches the
@@ -6983,18 +7070,29 @@ sub trace { return main::trace(@_); }
  
      sub set_default_map {
          # Define what code points that are missing from the input files should
-        # map to
+        # map to.  The optional second parameter 'full_name' indicates to
+        # force using the full name of the map instead of its standard name.
  
          my $self = shift;
          my $map = shift;
+        my $use_full_name = shift // 0;
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
  
+        if ($use_full_name && $use_full_name ne 'full_name') {
+            Carp::my_carp_bug("Second parameter to set_default_map() if"
+                            . " present, must be 'full_name'");
+        }
+
          my $addr = do { no overloading; pack 'J', $self; };
  
          # Convert the input to the standard equivalent, if any (won't have any
          # for $STRING properties)
-        my $standard = $self->_find_table_from_alias->{$map};
-        $map = $standard->name if defined $standard;
+        my $standard = $self->property->table($map);
+        if (defined $standard) {
+            $map = ($use_full_name)
+                   ? $standard->full_name
+                   : $standard->name;
+        }
  
          # Warn if there already is a non-equivalent default map for this
          # property.  Note that a default map can be a ref, which means that
@@ -7644,6 +7742,7 @@ END
      # Accessors for the underlying list that should fail if locked.
      for my $sub (qw(
                      add_duplicate
+                    replace_map
                  ))
      {
          no strict "refs";
@@ -7711,6 +7810,10 @@ use parent '-norequire', '_Base_Table';
  #    version.  But manual intervention to decide what the actual behavior
  #    should be may be required should this happen.  The introductory comments
  #    have more to say about this.
+#
+# 4) Definition.  This is a string for human consumption that specifies the
+#    code points that this table matches.  This is used only for the generated
+#    pod file.
  
  sub standardize { return main::standardize($_[0]); }
  sub trace { return main::trace(@_); }
@@ -7755,6 +7858,10 @@ sub trace { return main::trace(@_); }
      # none.
      main::set_access('complement', \%complement, 'r');
  
+    my %definition;
+    # Human readable string of the code points matched by this table
+    main::set_access('definition', \%definition, 'r', 's');
+
      sub new {
          my $class = shift;
  
@@ -7771,6 +7878,7 @@ sub trace { return main::trace(@_); }
          my $initialize = delete $args{'Initialize'};
          my $matches_all = delete $args{'Matches_All'} || 0;
          my $format = delete $args{'Format'};
+        my $definition = delete $args{'Definition'} // "";
          # Rest of parameters passed on.
  
          my $range_list = Range_List->new(Initialize => $initialize,
@@ -7805,6 +7913,7 @@ sub trace { return main::trace(@_); }
          $leader{$addr} = $self;
          $parent{$addr} = $self;
          $complement{$addr} = 0;
+        $definition{$addr} = $definition;
  
          if (defined $format && $format ne $EMPTY_FORMAT) {
              Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'.  Using '$EMPTY_FORMAT'");
@@ -7915,13 +8024,23 @@ sub trace { return main::trace(@_); }
          # complement's if it has one.
  
          my $self = shift;
-        my $complement;
-        if (($complement = $self->complement) != 0) {
-            return ~ $complement->_range_list;
-        }
-        else {
-            return $self->SUPER::_range_list;
+        my $complement = $self->complement;
+
+        # In order to avoid re-complementing on each access, only do the
+        # complement the first time, and store the result in this table's
+        # range list to use henceforth.  However, this wouldn't work if the
+        # controlling (complement) table changed after we do this, so lock it.
+        # Currently, the value of the complement isn't needed until after it
+        # is fully constructed, so this works.  If this were to change, the
+        # each_range iteration functionality would no longer work on this
+        # complement.
+        if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
+            $self->_set_range_list($self->SUPER::_range_list
+                                + ~ $complement->_range_list);
+            $complement->lock;
          }
+
+        return $self->SUPER::_range_list;
      }
  
      sub add_alias {
@@ -7959,7 +8078,7 @@ sub trace { return main::trace(@_); }
          # disambiguate with).
          if (defined $conflicting_object) {
              foreach my $alias ($self->aliases) {
-                if ($alias->name eq $conflicting_name) {
+                if (standardize($alias->name) eq standardize($conflicting_name)) {
  
                      # Here, there is an exact match.  This results in
                      # ambiguous comments, so disambiguate by changing the
@@ -8052,7 +8171,19 @@ sub trace { return main::trace(@_); }
                                                              # add_alias()
                                                              # instead for same
                                                              # property
-                     && ! $other->perl_extension)
+                     && ! $other->perl_extension
+
+                         # We allow the sc and scx properties to be marked as
+                         # related.  They are in fact related, and this allows
+                         # the pod to show that better.  This test isn't valid
+                         # if this is an early Unicode release without the scx
+                         # property (having that also implies the sc property
+                         # exists, so don't have to test for no 'sc')
+                     && (   ! defined $scx
+                         && ! (   (   $self->property == $script
+                                   || $self->property == $scx)
+                               && (   $self->property == $script
+                                   || $self->property == $scx))))
              {
                  Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties.  Assuming $self is not related to $other");
                  $related = 0;
@@ -8123,6 +8254,15 @@ sub trace { return main::trace(@_); }
          }
          my $addr = do { no overloading; pack 'J', $self; };
          $complement{$addr} = $other;
+
+        # Be sure the other property knows we are depending on them; or the
+        # other table if it is one in the current property.
+        if ($self->property != $other->property) {
+            $other->property->set_has_dependency(1);
+        }
+        else {
+            $other->set_has_dependency(1);
+        }
          $self->lock;
          return;
      }
@@ -8709,6 +8849,15 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
      main::set_access('pre_declared_maps',
                                      \%pre_declared_maps, 'r', 's');
  
+    my %has_dependency;
+    # A boolean that gives whether some table somewhere is defined as the
+    # complement of a table in this property.  This is a crude, but currently
+    # sufficient, mechanism to make this property not get destroyed before
+    # what is dependent on it is.  Other dependencies could be added, so the
+    # name was chosen to reflect a more general situation than actually is
+    # currently the case.
+    main::set_access('has_dependency', \%has_dependency, 'r', 's');
+
      sub new {
          # The only required parameter is the positionally first, name.  All
          # other parameters are key => value pairs.  See the documentation just
@@ -8747,6 +8896,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
          $has_only_code_point_maps{$addr} = 1;
          $table_ref{$addr} = { };
          $unique_maps{$addr} = { };
+        $has_dependency{$addr} = 0;
  
          $map{$addr} = Map_Table->new($name,
                                      Full_Name => $full_name{$addr},
@@ -9241,6 +9391,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
                      containing_range
                      count
                      default_map
+                    definition
                      delete_range
                      description
                      each_range
@@ -9259,6 +9410,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
                      range_count
                      ranges
                      range_size_1
+                    replace_map
                      reset_each_range
                      set_comment
                      set_default_map
@@ -10610,8 +10762,8 @@ ea ; W         ; Wide
  END
      }
  
-    if (-e 'LineBreak.txt') {
-        push @return, split /\n/, <<'END';
+    if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
+        my @lb = split /\n/, <<'END';
  lb ; AI        ; Ambiguous
  lb ; AL        ; Alphabetic
  lb ; B2        ; Break_Both
@@ -10642,6 +10794,12 @@ lb ; SY        ; Break_Symbols
  lb ; XX        ; Unknown
  lb ; ZW        ; ZWSpace
  END
+        # If this Unicode version predates the lb property, we use our
+        # substitute one
+        if (-e 'LBsubst.txt') {
+            $_ = s/^lb/_Perl_LB/r for @lb;
+        }
+        push @return, @lb;
      }
  
      if (-e 'DNormalizationProps.txt') {
@@ -11628,7 +11786,16 @@ END
                                            . $CMD_DELIM
                                            . $fields[$CHARNAME];
              }
-            elsif ($fields[$CHARNAME] =~ /^CJK/) {
+            elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
+
+                # All the CJK ranges like this have the name given as a
+                # special case in the next code line.  And for the others, we
+                # hope that Unicode continues to use the correct name in
+                # future releases, so we don't have to make further special
+                # cases.
+                my $name = ($fields[$CHARNAME] =~ /^CJK/)
+                           ? 'CJK UNIFIED IDEOGRAPH'
+                           : uc $fields[$CHARNAME];
  
                  # The name for these contains the code point itself, and all
                  # are defined to have the same base name, regardless of what
@@ -11640,7 +11807,7 @@ END
                                             . '='
                                             . $CP_IN_NAME
                                             . $CMD_DELIM
-                                           . 'CJK UNIFIED IDEOGRAPH';
+                                           . $name;
  
              }
              elsif ($fields[$CATEGORY] eq 'Co'
@@ -12155,6 +12322,24 @@ sub process_NamedSequences {
      }
  }
  
+sub filter_substitute_lb {
+    # Used on Unicodes that predate the LB property, where there is a
+    # substitute file.  This just does the regular ea_lb handling for such
+    # files, and then substitutes the long property value name for the short
+    # one that comes with the file.  (The other break files have the long
+    # names in them, so this is the odd one out.)  The reason for doing this
+    # kludge is that regen/mk_invlists.pl is expecting the long name.  This
+    # also fixes the typo 'Inseperable' that leads to problems.
+
+    filter_early_ea_lb;
+    return unless $_;
+
+    my @fields = split /\s*;\s*/;
+    $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
+    $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
+    $_ = join '; ', @fields;
+}
+
  sub filter_old_style_arabic_shaping {
      # Early versions used a different term for the later one.
  
@@ -13000,7 +13185,7 @@ sub setup_script_extensions {
      # The Script_Extensions property starts out with a clone of the Script
      # property.
  
-    my $scx = property_ref("Script_Extensions");
+    $scx = property_ref("Script_Extensions");
      $scx = Property->new("scx", Full_Name => "Script_Extensions")
                                                              if ! defined $scx;
      $scx->_set_format($STRING_WHITE_SPACE_LIST);
@@ -13558,6 +13743,18 @@ numerals.
  END
      ));
  
+    # Make sure this assumption in perl core code is valid in this Unicode
+    # release, with known exceptions
+    foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
+        next if $range->end - $range->start == 9;
+        next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
+        next if $range->end == 0x19DA && $v_version eq v5.2.0;
+        next if $range->end - $range->start < 9 && $v_version le 4.0.0;
+        Carp::my_carp("Range $range unexpectedly doesn't contain 10"
+                    . " decimal digits.  Code in regcomp.c assumes it does,"
+                    . " and will have to be fixed.  Proceeding anyway.");
+    }
+
      Property->new('Legacy_Case_Folding',
                      File => "Fold",
                      Directory => $map_directory,
@@ -13583,7 +13780,6 @@ END
      # data is retained in the map table for reference, but the spurious match
      # tables are deleted.
  
-    my $scx = property_ref("Script_Extensions");
      if (defined $scx) {
          foreach my $table ($scx->tables) {
              next unless $table->name =~ /\s/;   # All the new and only the new
@@ -13596,6 +13792,22 @@ END
              }
              $scx->delete_match_table($table);
          }
+
+        # Mark the scx table as the parent of the corresponding sc table for
+        # those which are identical.  This causes the pod for the script table
+        # to refer to the corresponding scx one.
+        #
+        # This has to be in a separate loop from above, so as to wait until
+        # the tables are stabilized before checking for equivalency.
+        if (defined $pod_directory) {
+            my $sc = property_ref("Script");
+            foreach my $table ($scx->tables) {
+                my $plain_sc_equiv = $sc->table($table->name);
+                if ($table->matches_identically_to($plain_sc_equiv)) {
+                    $plain_sc_equiv->set_equivalent_to($table, Related => 1);
+                }
+            }
+        }
      }
  
      return;
@@ -13745,8 +13957,10 @@ sub handle_compare_versions () {
      # since the first compare version.
      my $delta = Range_List->new();
      foreach my $table ($age->tables) {
+        use version;
          next if $table == $age->table('Unassigned');
-        next if $table->name le $string_compare_versions;
+        next if version->parse($table->name)
+             le version->parse($string_compare_versions);
          $delta += $table;
      }
      if ($delta->is_empty) {
@@ -13769,6 +13983,9 @@ sub handle_compare_versions () {
          next if     $this_block == $no_block
                  ||  ! ($this_block & $Assigned)->is_empty;
          $this_block->set_fate($SUPPRESSED, $after_first_version);
+        foreach my $range ($this_block->ranges) {
+            $block->replace_map($range->start, $range->end, 'No_Block')
+        }
          $no_block += $this_block;
      }
  
@@ -14015,8 +14232,8 @@ sub compile_perl() {
      }
  
      my $Any = $perl->add_match_table('Any',
-                                     Description  => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
-                                     );
+                                     Description  => "All Unicode code points",
+                                     Definition => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]");
      $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
      $Any->add_alias('Unicode');
  
@@ -14029,7 +14246,7 @@ sub compile_perl() {
              ->set_equivalent_to(property_ref('ccc')->table('Above'),
                                                                  Related => 1);
  
-    my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
+    my $ASCII = $perl->add_match_table('ASCII');
      if (defined $block) {   # This is equivalent to the block if have it.
          my $Unicode_ASCII = $block->table('Basic_Latin');
          if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
@@ -14089,7 +14306,7 @@ sub compile_perl() {
          $Lower += $temp & $Assigned;
      }
      my $Posix_Lower = $perl->add_match_table("PosixLower",
-                            Description => "[a-z]",
+                            Definition => "[a-z]",
                              Initialize => $Lower & $ASCII,
                              );
  
@@ -14107,7 +14324,7 @@ sub compile_perl() {
          $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
      }
      my $Posix_Upper = $perl->add_match_table("PosixUpper",
-                            Description => "[A-Z]",
+                            Definition => "[A-Z]",
                              Initialize => $Upper & $ASCII,
                              );
  
@@ -14315,7 +14532,7 @@ sub compile_perl() {
          $Alpha->add_alias('Alphabetic');
      }
      my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
-                            Description => "[A-Za-z]",
+                            Definition => "[A-Za-z]",
                              Initialize => $Alpha & $ASCII,
                              );
      $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
@@ -14326,13 +14543,14 @@ sub compile_perl() {
                          Initialize => $Alpha + $gc->table('Decimal_Number'),
                          );
      $perl->add_match_table("PosixAlnum",
-                            Description => "[A-Za-z0-9]",
+                            Definition => "[A-Za-z0-9]",
                              Initialize => $Alnum & $ASCII,
                              );
  
      my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
                                  Description => '\w, including beyond ASCII;'
-                                            . ' = \p{Alnum} + \pM + \p{Pc}',
+                                            . ' = \p{Alnum} + \pM + \p{Pc}'
+                                            . ' + \p{Join_Control}',
                                  Initialize => $Alnum + $gc->table('Mark'),
                                  );
      my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
@@ -14352,7 +14570,8 @@ sub compile_perl() {
  
      # This is a Perl extension, so the name doesn't begin with Posix.
      my $PerlWord = $perl->add_match_table('PosixWord',
-                    Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
+                    Description => '\w, restricted to ASCII',
+                    Definition =>  '[A-Za-z0-9_]',
                      Initialize => $Word & $ASCII,
                      );
      $PerlWord->add_alias('PerlWord');
@@ -14369,7 +14588,7 @@ sub compile_perl() {
                                  );
      $Blank->add_alias('HorizSpace');        # Another name for it.
      $perl->add_match_table("PosixBlank",
-                            Description => "\\t and ' '",
+                            Definition => "\\t and ' '",
                              Initialize => $Blank & $ASCII,
                              );
  
@@ -14395,7 +14614,7 @@ sub compile_perl() {
      $Space->add_alias('Space') if $v_version lt v4.1.0;
  
      my $Posix_space = $perl->add_match_table("PosixSpace",
-                            Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
+                            Definition => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
                              Initialize => $Space & $ASCII,
                              );
      $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
@@ -14404,7 +14623,12 @@ sub compile_perl() {
                                          Description => 'Control characters');
      $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
      $perl->add_match_table("PosixCntrl",
-                            Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
+                            Description => "ASCII control characters",
+                            Definition =>  "ACK, BEL, BS, CAN, CR, DC1, DC2,"
+                                         . " DC3, DC4, DEL, DLE, ENQ, EOM,"
+                                         . " EOT, ESC, ETB, ETX, FF, FS, GS,"
+                                         . " HT, LF, NAK, NUL, RS, SI, SO,"
+                                         . " SOH, STX, SUB, SYN, US, VT",
                              Initialize => $Cntrl & $ASCII,
                              );
  
@@ -14428,7 +14652,7 @@ sub compile_perl() {
                          Initialize => ~ ($Space + $controls),
                          );
      $perl->add_match_table("PosixGraph",
-                            Description =>
+                            Definition =>
                                  '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
                              Initialize => $Graph & $ASCII,
                              );
@@ -14438,7 +14662,7 @@ sub compile_perl() {
                          Initialize => $Blank + $Graph - $gc->table('Control'),
                          );
      $perl->add_match_table("PosixPrint",
-                            Description =>
+                            Definition =>
                                '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
                              Initialize => $print & $ASCII,
                              );
@@ -14454,7 +14678,7 @@ sub compile_perl() {
                                  Perl_Extension => 1
          );
      $perl->add_match_table('PosixPunct', Perl_Extension => 1,
-        Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
+        Definition => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
          Initialize => $ASCII & $XPosixPunct,
          );
  
@@ -14462,7 +14686,7 @@ sub compile_perl() {
                              Description => '[0-9] + all other decimal digits');
      $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
      my $PosixDigit = $perl->add_match_table("PosixDigit",
-                                            Description => '[0-9]',
+                                            Definition => '[0-9]',
                                              Initialize => $Digit & $ASCII,
                                              );
  
@@ -14477,7 +14701,7 @@ sub compile_perl() {
                                ord('A') .. ord('F'),
                                ord('a') .. ord('f'),
                                0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
-        $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
+        $Xdigit->set_definition('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
      }
  
      # AHex was not present in early releases
@@ -14491,7 +14715,7 @@ sub compile_perl() {
          $PosixXDigit->add_alias('AHex');
          $PosixXDigit->add_alias('Ascii_Hex_Digit');
      }
-    $PosixXDigit->add_description('[0-9A-Fa-f]');
+    $PosixXDigit->set_definition('[0-9A-Fa-f]');
  
      my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
                      Description => "Code points that particpate in some fold",
@@ -14767,7 +14991,6 @@ sub compile_perl() {
                                      + ord("(")
                                      + ord(")")
                                      + ord("-")
-                                    + utf8::unicode_to_native(0xA0) # NBSP
                          );
  
      my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
@@ -15117,6 +15340,10 @@ END
      # SA              CM      Only Mn or Mc
      # SA              AL      Any except Mn and Mc
      # CJ              NS      Any
+    #
+    # All property values are also written out in their long form, as
+    # regen/mk_invlist.pl expects that.  This also fixes occurrences of the
+    # typo in early Unicode versions: 'inseperable'.
      my $perl_lb = property_ref('_Perl_LB');
      if (! defined $perl_lb) {
          $perl_lb = Property->new('_Perl_LB',
@@ -15125,36 +15352,40 @@ END
                                   Directory => $map_directory,
                                   Type => $STRING);
          my $lb = property_ref('Line_Break');
-        $perl_lb->initialize($lb);
+
+        # Populate from $lb, but use full name and fix typo.
+        foreach my $range ($lb->ranges) {
+            my $full_name = $lb->table($range->value)->full_name;
+            $full_name = 'Inseparable'
+                                if standardize($full_name) eq 'inseperable';
+            $perl_lb->add_map($range->start, $range->end, $full_name);
+        }
      }
-    $perl_lb->set_default_map('AL');
  
-    # It's a little iffy relying on Unicode to not change which property value
-    # synonym they use, but if they do, tests should start failing and we can
-    # fix this up
+    $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
+
      for my $range ($perl_lb->ranges) {
          my $value = standardize($range->value);
          if (   $value eq standardize('Unknown')
-            || $value eq standardize('XX')
-            || $value eq standardize('AI')
-            || $value eq standardize('SG'))
+            || $value eq standardize('Ambiguous')
+            || $value eq standardize('Surrogate'))
          {
-            $perl_lb->add_map($range->start, $range->end, 'AL',
+            $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
                                Replace => $UNCONDITIONALLY);
          }
-        elsif ($value eq standardize('CJ')) {
-            $perl_lb->add_map($range->start, $range->end, 'NS',
+        elsif ($value eq standardize('Conditional_Japanese_Starter')) {
+            $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
                                Replace => $UNCONDITIONALLY);
          }
-        elsif ($value eq standardize('SA')) {
+        elsif ($value eq standardize('Complex_Context')) {
              for my $i ($range->start .. $range->end) {
                  my $gc_val = $gc->value_of($i);
                  if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
-                    $perl_lb->add_map($i, $i, 'CM',
+                    $perl_lb->add_map($i, $i, 'Combining_Mark',
                                        Replace => $UNCONDITIONALLY);
                  }
                  else {
-                    $perl_lb->add_map($i, $i, 'AL',
+                    $perl_lb->add_map($i, $i, 'Alphabetic',
                                        Replace => $UNCONDITIONALLY);
                  }
              }
@@ -15188,7 +15419,7 @@ sub add_perl_synonyms() {
      # the single-form, \p{name}.  These are:
      #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
      #       \p{Is_Name} as synonyms
-    #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
+    #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
      #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
      #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
      #       conflict, \p{Value} and \p{Is_Value} as well
@@ -15202,8 +15433,14 @@ sub add_perl_synonyms() {
                                                              property_ref('*');
      push @tables, $gc->tables;
  
-    # If the version of Unicode includes the Script property, add its tables
-    push @tables, $script->tables if defined $script;
+    # If the version of Unicode includes the Script Extensions (preferably),
+    # or Script property, add its tables
+    if (defined $scx) {
+        push @tables, $scx->tables;
+    }
+    else {
+        push @tables, $script->tables if defined $script;
+    }
  
      # The Block tables are kept separate because they are treated differently.
      # And the earliest versions of Unicode didn't include them, so add only if
@@ -15266,7 +15503,7 @@ sub add_perl_synonyms() {
  
                  if (! defined $pre_existing) {
  
-                    # No name collision, so ok to add the perl synonym.
+                    # No name collision, so OK to add the perl synonym.
  
                      my $make_re_pod_entry;
                      my $ok_as_filename;
@@ -15361,7 +15598,7 @@ sub add_perl_synonyms() {
                      next;
                  }
  
-                # Here, there is a name collision, but it still could be ok if
+                # Here, there is a name collision, but it still could be OK if
                  # the tables match the identical set of code points, in which
                  # case, we can combine the names.  Compare each table's code
                  # point list to see if they are identical.
@@ -16157,7 +16394,10 @@ sub make_re_pod_entries($) {
              if ($table_property != $perl && $table->perl_extension) {
                  push @info, '(Perl extension)';
              }
-            push @info, "($string_count)";
+            my $definition = $table->definition // "";
+            $definition = "" if $entry_for_first_alias;
+            $definition = ": $definition" if $definition;
+            push @info, "($string_count$definition)";
  
              # Now, we have both the entry and info so add them to the
              # list of all the properties.
@@ -16200,39 +16440,50 @@ sub make_ucd_table_pod_entries {
                     : $table->parent->property;
  
      my $perl_extension = $table->perl_extension;
+    my $is_perl_extension_match_table_but_not_dollar_perl
+                                                        = $property != $perl
+                                                       && $perl_extension
+                                                       && $property != $table;
  
      # Get the more official name for for perl extensions that aren't
      # stand-alone properties
-    if ($perl_extension && $property != $table) {
-        if ($property == $perl ||$property->type == $BINARY) {
-            $meaning = $table->complete_name;
+    if ($is_perl_extension_match_table_but_not_dollar_perl) {
+        if ($property->type == $BINARY) {
+            $meaning = $property->full_name;
          }
          else {
-            $meaning = $property->full_name . "=$full_name";
+            $meaning = $table->parent->complete_name;
          }
      }
  
      # There are three types of info column.  One for the short name, one for
      # the full name, and one for everything else.  They mostly are the same,
      # so initialize in the same loop.
+
      foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
-        if ($perl_extension && $property != $table) {
+        if ($info_ref != \$full_info) {
+
+            # The non-full name columns include the full name
+            $$info_ref .= $full_name;
+        }
+
+
+        if ($is_perl_extension_match_table_but_not_dollar_perl) {
  
              # Add the synonymous name for the non-full name entries; and to
              # the full-name entry if it adds extra information
-            if ($info_ref == \$other_info
-                || ($info_ref == \$short_info
-                    && $standard_short_name ne $standard_full_name)
-                || standardize($meaning) ne $standard_full_name
-            ) {
-                $$info_ref .= "$meaning.";
+            if (   standardize($meaning) ne $standard_full_name
+                || $info_ref == \$other_info
+                || $info_ref == \$short_info)
+            {
+                my $parenthesized =  $info_ref != \$full_info;
+                $$info_ref .= " " if $$info_ref && $parenthesized;
+                $$info_ref .= "(=" if $parenthesized;
+                $$info_ref .= "$meaning";
+                $$info_ref .= ")" if $parenthesized;
+                $$info_ref .= ".";
              }
          }
-        elsif ($info_ref != \$full_info) {
-
-            # Otherwise, the non-full name columns include the full name
-            $$info_ref .= $full_name;
-        }
  
          # And the full-name entry includes the short name, if shorter
          if ($info_ref == \$full_info
@@ -16327,7 +16578,7 @@ sub make_ucd_table_pod_entries {
                      Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
                  }
  
-                # We modifiy the info column of the one being output to
+                # We modify the info column of the one being output to
                  # indicate the ambiguity.  Set $which to point to that one's
                  # info.
                  my $which;
@@ -16389,27 +16640,69 @@ sub pod_alphanumeric_sort {
          return -1
      }
  
-    # Determine if the two operands are numeric property values or not.
-    # A numeric property will look like xyz: 3.  But the number
-    # can begin with an optional minus sign, and may have a
-    # fraction or rational component, like xyz: 3/2.  If either
-    # isn't numeric, use alphabetic sort.
-    my ($a_initial, $a_number) =
-        ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
-    return $a cmp $b unless defined $a_number;
-    my ($b_initial, $b_number) =
-        ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
-    return $a cmp $b unless defined $b_number;
-
-    # Here they are both numeric, but use alphabetic sort if the
-    # initial parts don't match
-    return $a cmp $b if $a_initial ne $b_initial;
+    # Determine if the two operands are compound or not, and if so if are
+    # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
+    # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
+    # all of which this considers numeric, and for sorting, looks just at the
+    # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
+    my $split_re = qr/
+        ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
+                     # property name
+        [:=] \s*     # The syntax for the compound form
+        (?:          # followed by ...
+            (        # $2 gets defined if what follows is a "numeric"
+                     # expression, which is ...
+              ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
+                                        # number, optionally signed
+               | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
+                                         # of these go into $3
+             | ( V \d+ _ \d+ )           # or a Unicode's Age property version
+                                         # number, into $4
+            )
+            | .* $    # If not "numeric", accept anything so that $1 gets
+                      # defined if it is any compound form
+        ) /ix;
+    my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
+    my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
+
+    # Sort alphabeticlly on the whole property name if either operand isn't
+    # compound, or they differ.
+    return $a cmp $b if   ! defined $a_initial
+                       || ! defined $b_initial
+                       || $a_initial ne $b_initial;
+
+    if (! defined $a_numeric) {
+
+        # If neither is numeric, use alpha sort
+        return $a cmp $b if ! defined $b_numeric;
+        return 1;  # Sort numeric ahead of alpha
+    }
+
+    # Here $a is numeric
+    return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
+
+    # Here they are both numeric in the same property.
+    # Convert version numbers into regular numbers
+    if (defined $a_version) {
+        ($a_number = $a_version) =~ s/^V//i;
+        $a_number =~ s/_/./;
+    }
+    else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
+        $a_number =~ s/ ^ [[:alpha:]]+ //x;
+    }
+    if (defined $b_version) {
+        ($b_number = $b_version) =~ s/^V//i;
+        $b_number =~ s/_/./;
+    }
+    else {
+        $b_number =~ s/ ^ [[:alpha:]]+ //x;
+    }
  
      # Convert rationals to floating for the comparison.
      $a_number = eval $a_number if $a_number =~ qr{/};
      $b_number = eval $b_number if $b_number =~ qr{/};
  
-    return $a_number <=> $b_number;
+    return $a_number <=> $b_number || $a cmp $b;
  }
  
  sub make_pod () {
@@ -16582,7 +16875,7 @@ END
          push @bad_re_properties, "\n=back\n";
      }
  
-    # Similiarly, generate a list of files that we don't use, grouped by the
+    # Similarly, generate a list of files that we don't use, grouped by the
      # reasons why (Don't output if the reason is empty).  First, create a hash
      # whose keys are the reasons, and whose values are anonymous arrays of all
      # the files that share that reason.
@@ -16695,14 +16988,16 @@ constructs, both single and compound forms.
  B<Compound forms> consist of two components, separated by an equals sign or a
  colon.  The first component is the property name, and the second component is
  the particular value of the property to match against, for example,
-C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
-whose Script property value is Greek.
+C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
+to match characters whose Script_Extensions property value is Greek.
+(C<Script_Extensions> is an improved version of the C<Script> property.)
  
  B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
  their equivalent compound forms.  The table shows these equivalences.  (In our
-example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
-There are also a few Perl-defined single forms that are not shortcuts for a
-compound form.  One such is C<\\p{Word}>.  These are also listed in the table.
+example, C<\\p{Greek}> is a just a shortcut for
+C<\\p{Script_Extensions=Greek}>).  There are also a few Perl-defined single
+forms that are not shortcuts for a compound form.  One such is C<\\p{Word}>.
+These are also listed in the table.
  
  In parsing these constructs, Perl always ignores Upper/lower case differences
  everywhere within the {braces}.  Thus C<\\p{Greek}> means the same thing as
@@ -16870,34 +17165,20 @@ B<Legend summary:>
  
  =over 4
  
-=item *
+=item Z<>B<*> is a wild-card
  
-B<*> is a wild-card
-
-=item *
-
-B<(\\d+)> in the info column gives the number of Unicode code points matched
+=item B<(\\d+)> in the info column gives the number of Unicode code points matched
  by this property.
  
-=item *
-
-B<$DEPRECATED> means this is deprecated.
-
-=item *
+=item B<$DEPRECATED> means this is deprecated.
  
-B<$OBSOLETE> means this is obsolete.
+=item B<$OBSOLETE> means this is obsolete.
  
-=item *
+=item B<$STABILIZED> means this is stabilized.
  
-B<$STABILIZED> means this is stabilized.
+=item B<$STRICTER> means tighter (stricter) name matching applies.
  
-=item *
-
-B<$STRICTER> means tighter (stricter) name matching applies.
-
-=item *
-
-B<$DISCOURAGED> means use of this form is discouraged, and may not be
+=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
  stable.
  
  =back
@@ -17507,7 +17788,7 @@ sub make_UCD () {
              foreach my $prop_alias ($property->aliases) {
                  my $prop_alias_name = standardize($prop_alias->name);
  
-                # If no =value, there's just one combination possibe for this
+                # If no =value, there's just one combination possible for this
                  if (! $value_name) {
  
                      # The property may be suppressed, but there may be a proxy
@@ -17899,19 +18180,20 @@ sub write_all_tables() {
                      # the children.
                      make_re_pod_entries($table) if defined $pod_directory;
  
-                    # See if the the table matches identical code points with
-                    # something that has already been output.  In that case,
-                    # no need to have two files with the same code points in
-                    # them.  We use the table's hash() method to store these
-                    # in buckets, so that it is quite likely that if two
-                    # tables are in the same bucket they will be identical, so
-                    # don't have to compare tables frequently.  The tables
-                    # have to have the same status to share a file, so add
-                    # this to the bucket hash.  (The reason for this latter is
-                    # that Heavy.pl associates a status with a file.)
-                    # We don't check tables that are inverses of others, as it
-                    # would lead to some coding complications, and checking
-                    # all the regular ones should find everything.
+                    # See if the table matches identical code points with
+                    # something that has already been processed and is ready
+                    # for output.  In that case, no need to have two files
+                    # with the same code points in them.  We use the table's
+                    # hash() method to store these in buckets, so that it is
+                    # quite likely that if two tables are in the same bucket
+                    # they will be identical, so don't have to compare tables
+                    # frequently.  The tables have to have the same status to
+                    # share a file, so add this to the bucket hash.  (The
+                    # reason for this latter is that Heavy.pl associates a
+                    # status with a file.) We don't check tables that are
+                    # inverses of others, as it would lead to some coding
+                    # complications, and checking all the regular ones should
+                    # find everything.
                      if ($table->complement == 0) {
                          my $hash = $table->hash . ';' . $table->status;
  
@@ -17920,7 +18202,11 @@ sub write_all_tables() {
                          foreach my $comparison
                                              (@{$match_tables_to_write{$hash}})
                          {
-                            if ($table->matches_identically_to($comparison)) {
+                            # If the table doesn't point back to this one, we
+                            # see if it matches identically
+                            if (   $comparison->leader != $table
+                                && $table->matches_identically_to($comparison))
+                            {
                                  $table->set_equivalent_to($comparison,
                                                                  Related => 0);
                                  next TABLE;
@@ -18062,7 +18348,7 @@ sub write_all_tables() {
                          # the table.  That is, all the property-values given
                          # by this table.  By agreement with Unicode::UCD,
                          # if the name and full name are identical, and there
-                        # are no other names, drop the duplcate entry to save
+                        # are no other names, drop the duplicate entry to save
                          # memory.
                          if (@values_list == 2
                              && $values_list[0] eq $values_list[1])
@@ -18226,14 +18512,14 @@ sub generate_separator($) {
  
  sub generate_tests($$$$$) {
      # This used only for making the test script.  It generates test cases that
-    # are expected to compile successfully in perl.  Note that the lhs and
-    # rhs are assumed to already be as randomized as the caller wants.
+    # are expected to compile successfully in perl.  Note that the LHS and
+    # RHS are assumed to already be as randomized as the caller wants.
  
      my $lhs = shift;           # The property: what's to the left of the colon
                                 #  or equals separator
      my $rhs = shift;           # The property value; what's to the right
      my $valid_code = shift;    # A code point that's known to be in the
-                               # table given by lhs=rhs; undef if table is
+                               # table given by LHS=RHS; undef if table is
                                 # empty
      my $invalid_code = shift;  # A code point known to not be in the table;
                                 # undef if the table is all code points
@@ -18274,13 +18560,13 @@ sub generate_error($$$) {
                                      # colon or equals separator
      my $rhs = shift;                # The property value; what's to the right
      my $already_in_error = shift;   # Boolean; if true it's known that the
-                                # unmodified lhs and rhs will cause an error.
+                                # unmodified LHS and RHS will cause an error.
                                  # This routine should not force another one
      # Get the colon or equal
      my $separator = generate_separator($lhs);
  
      # Since this is an error only, don't bother to randomly decide whether to
-    # put the error on the left or right side; and assume that the rhs is
+    # put the error on the left or right side; and assume that the RHS is
      # loosely matched, again for convenience rather than rigor.
      $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
  
@@ -18479,10 +18765,33 @@ sub make_property_test_script() {
      # or multiple lines. main::write doesn't count the lines.
      my @output;
  
+    push @output, <<'EOF_CODE';
+Error('\p{Script=InGreek}');    # Bug #69018
+Test_GCB("1100 $nobreak 1161");  # Bug #70940
+Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
+Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
+Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
+
+# Make sure this gets tested; it was not part of the official test suite at
+# the time this was added.  Note that this is as it would appear in the
+# official suite, and gets modified to check for the perl tailoring by
+# Test_WB()
+Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
+Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
+EOF_CODE
+
      # Sort these so get results in same order on different runs of this
      # program
-    foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
-        foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
+    foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
+                                    or
+                                 lc $a->name cmp lc $b->name
+                               } property_ref('*'))
+    {
+        foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
+                                    or
+                                  lc $a->name cmp lc $b->name
+                                } $property->tables)
+        {
  
              # Find code points that match, and don't match this table.
              my $valid = $table->get_valid_code_point;
@@ -18653,16 +18962,69 @@ sub make_property_test_script() {
          $property->DESTROY();
      }
  
+    # Make any test of the boundary (break) properties TODO if the code
+    # doesn't match the version being compiled
+    my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
+                             ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
+                             : "\nsub TODO_FAILING_BREAKS { 0 }\n";
+
+    @output= map {
+        map s/^/    /mgr,
+        map "$_;\n",
+        split /;\n/, $_
+    } @output;
+
+    # Cause there to be 'if' statements to only execute a portion of this
+    # long-running test each time, so that we can have a bunch of .t's running
+    # in parallel
+    my $chunks = 10     # Number of test files
+               - 1      # For GCB & SB
+               - 1      # For WB
+               - 4;     # LB split into this many files
+    my @output_chunked;
+    my $chunk_count=0;
+    my $chunk_size= int(@output / $chunks) + 1;
+    while (@output) {
+        $chunk_count++;
+        my @chunk= splice @output, 0, $chunk_size;
+        push @output_chunked,
+            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+                @chunk,
+            "}\n";
+    }
+
+    $chunk_count++;
+    push @output_chunked,
+        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+            (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
+            (map {"    Test_SB('$_');\n"} @SB_tests),
+        "}\n";
+
+
+    $chunk_size= int(@LB_tests / 4) + 1;
+    @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
+    while (@LB_tests) {
+        $chunk_count++;
+        my @chunk= splice @LB_tests, 0, $chunk_size;
+        push @output_chunked,
+            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+                @chunk,
+            "}\n";
+    }
+
+    $chunk_count++;
+    push @output_chunked,
+        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+            (map {"    Test_WB('$_');\n"} @WB_tests),
+        "}\n";
+
      &write($t_path,
             0,           # Not utf8;
             [$HEADER,
+            $TODO_FAILING_BREAKS,
              <DATA>,
-            @output,
-            (map {"Test_GCB('$_');\n"} @backslash_X_tests),
-            (map {"Test_LB('$_');\n"} @LB_tests),
-            (map {"Test_SB('$_');\n"} @SB_tests),
-            (map {"Test_WB('$_');\n"} @WB_tests),
-            "Finished();\n"
+            @output_chunked,
+            "Finished();\n",
             ]);
  
      return;
@@ -18823,6 +19185,14 @@ my @input_file_objects = (
                      Property => 'Joining_Type',
                      Has_Missings_Defaults => $NOT_IGNORED,
                     ),
+    Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
+                    Skip => 'This file adds no new information not already'
+                          . ' present in other files',
+                    # And it's unnecessary programmer work to handle this new
+                    # format.  Previous Derived files actually had bug fixes
+                    # in them that were useful, but that should not be the
+                    # case here.
+                   ),
      Input_file->new('Jamo.txt', v2.0.0,
                      Property => 'Jamo_Short_Name',
                      Each_Line_Handler => \&filter_jamo_line,
@@ -18921,11 +19291,16 @@ my @input_file_objects = (
                      Has_Missings_Defaults => $NOT_IGNORED,
                      Property => 'Line_Break',
                      # Early versions had problematic syntax
-                    Each_Line_Handler => (($v_version lt v3.1.0)
-                                        ? \&filter_early_ea_lb
-                                        : undef),
-                    Early => [ "LBsubst.txt", '_Perl_LB', 'AL',
-                               'AL', # default
+                    Each_Line_Handler => ($v_version ge v3.1.0)
+                                          ? undef
+                                          : ($v_version lt v3.0.0)
+                                            ? \&filter_substitute_lb
+                                            : \&filter_early_ea_lb,
+                    # Must use long names for property values see comments at
+                    # sub filter_substitute_lb
+                    Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
+                               'Alphabetic', # default to this because XX ->
+                                             # AL
  
                                 # Don't use _Perl_LB as a synonym for
                                 # Line_Break in later perls, as it is tailored
@@ -19029,13 +19404,14 @@ my @input_file_objects = (
                       # for the release it is.  To get it to actually mean
                       # something useful, someone would have to be using an
                       # earlier Unicode release, and copy it into the directory
-                     # for that release and recomplile.  So far there has been
+                     # for that release and recompile.  So far there has been
                       # no demand to do that, so this hasn't been implemented.
                      Skip => 'Documentation of corrections already '
                            . 'incorporated into the Unicode data base',
                     ),
      Input_file->new('StandardizedVariants.html', v3.2.0,
-                    Skip => 'Provides a visual display of the standard '
+                    Skip => 'Obsoleted as of Unicode 9.0, but previously '
+                          . 'provided a visual display of the standard '
                            . 'variant sequences derived from '
                            . 'F<StandardizedVariants.txt>.',
                          # I don't know why the html came earlier than the
@@ -19070,18 +19446,21 @@ my @input_file_objects = (
                     ),
      Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
                      Handler => \&process_GCB_test,
+                    retain_trailing_comments => 1,
                     ),
      Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
                      Skip => $Validation_Documentation,
                     ),
      Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
                      Handler => \&process_SB_test,
+                    retain_trailing_comments => 1,
                     ),
      Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
                      Skip => $Validation_Documentation,
                     ),
      Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
                      Handler => \&process_WB_test,
+                    retain_trailing_comments => 1,
                     ),
      Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
                      Skip => $Validation_Documentation,
@@ -19132,6 +19511,7 @@ my @input_file_objects = (
                     ),
      Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
                      Handler => \&process_LB_test,
+                    retain_trailing_comments => 1,
                     ),
      Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
                      Skip => $Validation_Documentation,
@@ -19245,6 +19625,19 @@ my @input_file_objects = (
                      Property => 'Indic_Positional_Category',
                      Has_Missings_Defaults => $NOT_IGNORED,
                     ),
+    Input_file->new('TangutSources.txt', v9.0.0,
+                    Skip => 'Specifies source mappings for Tangut ideographs'
+                          . ' and components. This data file also includes'
+                          . ' informative radical-stroke values that are used'
+                          . ' internally by Unicode',
+                   ),
+    Input_file->new('VerticalOrientation.txt', v10.0.0,
+                    Property => 'Vertical_Orientation',
+                    Has_Missings_Defaults => $NOT_IGNORED,
+                   ),
+    Input_file->new('NushuSources.txt', v10.0.0,
+                    Skip => 'Specifies source material for Nushu characters',
+                   ),
  );
  
  # End of all the preliminaries.
@@ -19577,6 +19970,13 @@ if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
      }
      print "\nAll done\n" if $verbosity >= $VERBOSE;
  }
+
+if ($version_of_mk_invlist_bounds lt $v_version) {
+    Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
+                . " to be checked and possibly updated to Unicode"
+                . " $string_version");
+}
+
  exit(0);
  
  # TRAILING CODE IS USED BY make_property_test_script()
@@ -19634,7 +20034,7 @@ sub Expect($$$$) {
          $Tests++;
  
          # A string eval is needed because of the 'no warnings'.
-        # Assumes no parens in the regular expression
+        # Assumes no parentheses in the regular expression
          my $result = eval "$no_warnings
                              my \$RegObj = qr($regex);
                              $string =~ \$RegObj ? 1 : 0";
@@ -19702,7 +20102,10 @@ if (defined &locales_enabled) {
  }
  
  # Eval'd so can run on versions earlier than the property is available in
-my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
+my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
+if (! defined $WB_Extend_or_Format_re) {
+    $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
+}
  
  sub _test_break($$) {
      # Test various break property matches.  The 2nd parameter gives the
@@ -19724,6 +20127,15 @@ sub _test_break($$) {
      my $break_type = shift;
  
      my $line   = (caller 1)[2];   # Line number
+    my $comment = "";
+
+    if ($template =~ / ( .*? ) \s* \# (.*) /x) {
+        $template = $1;
+        $comment = $2;
+
+        # Replace leading spaces with a single one.
+        $comment =~ s/ ^ \s* / # /x;
+    }
  
      # The line contains characters above the ASCII range, but in Latin1.  It
      # may or may not be in utf8, and if it is, it may or may not know it.  So,
@@ -19862,12 +20274,24 @@ sub _test_break($$) {
              my $pattern = "(?$modifier:$break_pattern)";
  
              # Actually do the test
+            my $matched_text;
              my $matched = $string =~ qr/$pattern/;
-            print "not " unless $matched;
+            if ($matched) {
+                $matched_text = "matched";
+            }
+            else {
+                $matched_text = "failed to match";
+                print "not ";
+
+                if (TODO_FAILING_BREAKS) {
+                    $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
+                    $comment =~ s/#/# TODO/;
+                }
+            }
+            print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
  
-            # Fancy display of test results
-            $matched = ($matched) ? "matched" : "failed to match";
-            print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+            # Only print the comment on the first use of this line
+            $comment = "";
  
              # Repeat with the first \B{} in the pattern.  This makes sure the
              # code in regexec.c:find_byclass() for \B gets executed
@@ -19875,8 +20299,10 @@ sub _test_break($$) {
                  my $B_pattern = "$1$2";
                  $matched = $string =~ qr/$B_pattern/;
                  print "not " unless $matched;
-                $matched = ($matched) ? "matched" : "failed to match";
-                print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+                $matched_text = ($matched) ? "matched" : "failed to match";
+                print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
+                print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
+                print "\n";
              }
          }
  
@@ -19901,7 +20327,9 @@ sub _test_break($$) {
              } else {
                  $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
                                                      split "", $matches[$i]);
-                print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
+                print "not ok $Tests -";
+                print " # TODO" if TODO_FAILING_BREAKS;
+                print " In \"$display_string\" =~ /(\\X)/g, \\X #",
                      $i + 1,
                      " should have matched $should_display[$i]",
                      " but instead matched $matches[$i]",
@@ -19915,7 +20343,9 @@ sub _test_break($$) {
          if (@matches == @should_match) {
              print "ok $Tests - Nothing was left over; line $line\n";
          } else {
-            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
+            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
+            print " # TODO" if TODO_FAILING_BREAKS;
+            print "\n";
          }
      }
  
@@ -19943,15 +20373,3 @@ sub Finished() {
      exit($Fails ? -1 : 0);
  }
  
-Error('\p{Script=InGreek}');    # Bug #69018
-Test_GCB("1100 $nobreak 1161");  # Bug #70940
-Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
-Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
-Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
-
-# Make sure this gets tested; it was not part of the official test suite at
-# the time this was addded.  Note that this is as it would appear in the
-# official suite, and gets modified to check for the perl tailoring by
-# Test_WB()
-Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
-Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");