perluniprops: Improve sorting

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 33306e4..8a7be25 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -8,6 +8,11 @@
  # compatible, but that has now been abandoned, and newer constructs are used
  # as convenient.
  
+# NOTE: this script can run quite slowly in older/slower systems.
+# It can also consume a lot of memory (128 MB or more), you may need
+# to raise your process resource limits (e.g. in bash, "ulimit -a"
+# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
+
  my $start_time;
  BEGIN { # Get the time the script started running; do it at compilation to
          # get it as close as possible
@@ -31,6 +36,17 @@ my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  
  sub NON_ASCII_PLATFORM { ord("A") != 65 }
  
+# When a new version of Unicode is published, unfortunately the algorithms for
+# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
+# manually.  The changes may or may not be backward compatible with older
+# releases.  The code is in regen/mk_invlist.pl and regexec.c.  Make the
+# changes, then come back here and set the variable below to what version the
+# code is expecting.  If a newer version of Unicode is being compiled than
+# expected, a warning will be generated.  If an older version is being
+# compiled, any bounds tests that fail in the generated test file (-maketest
+# option) will be marked as TODO.
+my $version_of_mk_invlist_bounds = v10.0.0;
+
  ##########################################################################
  #
  # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
@@ -347,6 +363,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  #
  # trace ... if main::DEBUG && $to_trace;
  #
+# main::stack_trace() will display what its name implies
+#
  # If there is just one or a few files that you're debugging, you can easily
  # cause most everything else to be skipped.  Change the line
  #
@@ -404,7 +422,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  #
  # A NOTE ON UNIHAN
  #
-# This program can generate tables from the Unihan database.  But that db
+# This program can generate tables from the Unihan database.  But that DB
  # isn't normally available, so it is marked as optional.  Prior to version
  # 5.2, this database was in a single file, Unihan.txt.  In 5.2 the database
  # was split into 8 different files, all beginning with the letters 'Unihan'.
@@ -472,8 +490,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  # handled by Unicode::Normalize, nor will it compile when presented a version
  # that has them.  However, you can trivially get it to compile by simply
  # ignoring those decompositions, by changing the croak to a carp.  At the time
-# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
-# cpan/Unicode-Normalize/mkheader) reads
+# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
+# dist/Unicode-Normalize/mkheader) reads
  #
  #   croak("Weird Canonical Decomposition of U+$h");
  #
@@ -612,12 +630,28 @@ our $to_trace = 0;
      }
  }
  
+sub stack_trace() {
+    local $to_trace = 1 if main::DEBUG;
+    my $line = (caller(0))[2];
+    my $i = 1;
+
+    # Accumulate the stack trace
+    while (1) {
+        my ($pkg, $file, $caller_line, $caller) = caller $i++;
+
+        last unless defined $caller;
+
+        trace "called from $caller() at line $line";
+        $line = $caller_line;
+    }
+}
+
  # This is for a rarely used development feature that allows you to compare two
  # versions of the Unicode standard without having to deal with changes caused
  # by the code points introduced in the later version.  You probably also want
  # to use the -annotate option when using this.  Run this program on a unicore
  # containing the starting release you want to compare.  Save that output
-# structrue.  Then, switching to a unicore with the ending release, change the
+# structure.  Then, switching to a unicore with the ending release, change the
  # 0 in the $string_compare_versions definition just below to a string
  # containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
  # to the starting release.  This program will then compile, but throw away all
@@ -626,7 +660,7 @@ our $to_trace = 0;
  # common to both releases, and you can see the changes caused just by the
  # underlying release semantic changes.  For versions earlier than 3.2, you
  # must copy a version of DAge.txt into the directory.
-my $string_compare_versions = DEBUG && 0; #  e.g., "2.1";
+my $string_compare_versions = DEBUG && ""; #  e.g., "2.1";
  my $compare_versions = DEBUG
                         && $string_compare_versions
                         && pack "C*", split /\./, $string_compare_versions;
@@ -697,7 +731,8 @@ while (@ARGV) {
          $verbosity = 0;
      }
      elsif ($arg eq '-w') {
-        $write_unchanged_files = 1; # update the files even if havent changed
+        # update the files even if they haven't changed
+        $write_unchanged_files = 1;
      }
      elsif ($arg eq '-check') {
          my $this = shift @ARGV;
@@ -1105,17 +1140,16 @@ my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
  my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
  my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
  
-# We work with above-Unicode code points, up to UV_MAX.   But when you get
-# that high, above IV_MAX, some operations don't work, and you can easily get
-# overflow.  Therefore for internal use, we use a much smaller number,
-# translating it to UV_MAX only for output.  The exact number is immaterial
-# (all Unicode code points are treated exactly the same), but the algorithm
-# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
+# We work with above-Unicode code points, up to IV_MAX, but we may want to use
+# sentinels above that number.  Therefore for internal use, we use a much
+# smaller number, translating it to IV_MAX only for output.  The exact number
+# is immaterial (all Unicode code points are treated exactly the same), but
+# the algorithm requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
  my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
  my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
  my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
  
-my $MAX_PLATFORM_CODEPOINT = ~0;
+my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
  
  # Matches legal code point.  4-6 hex numbers, If there are 6, the first
  # two must be 10; if there are 5, the first must not be a 0.  Written this way
@@ -1127,7 +1161,7 @@ my $run_on_code_point_re =
              qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
  my $code_point_re = qr/\b$run_on_code_point_re/;
  
-# This matches the beginning of the line in the Unicode db files that give the
+# This matches the beginning of the line in the Unicode DB files that give the
  # defaults for code points not listed (i.e., missing) in the file.  The code
  # depends on this ending with a semi-colon, so it can assume it is a valid
  # field when the line is split() by semi-colons
@@ -1384,6 +1418,7 @@ my $has_hangul_syllables = 0;
  my $needing_code_points_ending_in_code_point = 0;
  
  my @backslash_X_tests;     # List of tests read in for testing \X
+my @LB_tests;              # List of tests read in for testing \b{lb}
  my @SB_tests;              # List of tests read in for testing \b{sb}
  my @WB_tests;              # List of tests read in for testing \b{wb}
  my @unhandled_properties;  # Will contain a list of properties found in
@@ -1493,7 +1528,7 @@ sub populate_char_info ($) {
  
      $viacode[$i] = $perl_charname->value_of($i) || "";
      $age[$i] = (defined $age)
-               ? (($age->value_of($i) =~ / ^ \d \. \d $ /x)
+               ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
                    ? $age->value_of($i)
                    : "")
                 : "";
@@ -2069,7 +2104,7 @@ package Input_file;
  #   1) call before the first line is read, for pre processing
  #   2) call to adjust each line of the input before the main handler gets
  #      them.  This can be automatically generated, if appropriately simple
-#      enough, by specifiying a Properties parameter in the constructor.
+#      enough, by specifying a Properties parameter in the constructor.
  #   3) call upon EOF before the main handler exits its loop
  #   4) call at the end, for post processing
  #
@@ -2077,7 +2112,7 @@ package Input_file;
  # each_line_handler()s.  So, if the format of the line is not in the desired
  # format for the main handler, these are used to do that adjusting.  They can
  # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
-# so the $_ output of one is used as the input to the next.  The eof handler
+# so the $_ output of one is used as the input to the next.  The EOF handler
  # is also stackable, but none of the others are, but could easily be changed
  # to be so.
  #
@@ -2161,7 +2196,7 @@ sub trace { return main::trace(@_); }
      # not otherwise be processed, and to not raise a warning about not being
      # handled.  In the constructor call, any value that evaluates to a numeric
      # 0 or undef means don't skip.  Any other value is a string giving the
-    # reason it is being skippped, and this will appear in generated pod.
+    # reason it is being skipped, and this will appear in generated pod.
      # However, an empty string reason will suppress the pod entry.
      # Internally, calls that evaluate to numeric 0 are changed into undef to
      # distinguish them from an empty string call.
@@ -2174,11 +2209,20 @@ sub trace { return main::trace(@_); }
      # 'handler'
      main::set_access('each_line_handler', \%each_line_handler, 'c');
  
+    my %retain_trailing_comments;
+    # This is used to not discard the comments that end data lines.  This
+    # would be used only for files with non-typical syntax, and most code here
+    # assumes that comments have been stripped, so special handlers would have
+    # to be written.  It is assumed that the code will use these in
+    # single-quoted contexts, and so any "'" marks in the comment will be
+    # prefixed by a backslash.
+    main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
+
      my %properties; # Optional ordered list of the properties that occur in each
      # meaningful line of the input file.  If present, an appropriate
      # each_line_handler() is automatically generated and pushed onto the stack
      # of such handlers.  This is useful when a file contains multiple
-    # proerties per line, but no other special considerations are necessary.
+    # properties per line, but no other special considerations are necessary.
      # The special value "<ignored>" means to discard the corresponding input
      # field.
      # Any @missing lines in the file should also match this syntax; no such
@@ -2279,6 +2323,15 @@ sub trace { return main::trace(@_); }
      # may not be the actual reality, but probably nobody cares anyway for
      # these obsolete characters.)
      #
+    # [3] if present is the default value for the property to assign for code
+    # points not given in the input.  If not present, the default from the
+    # normal property is used
+    #
+    # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
+    # it means to not add the name in [1] as an alias to the property name
+    # used for these.  Normally, when compiling Unicode versions that don't
+    # invoke the early handling, the name is added as a synonym.
+    #
      # Not all files can be handled in the above way, and so the code ref
      # alternative is available.  It can do whatever it needs to.  The other
      # array elements are optional in this case, and the code is free to use or
@@ -2289,6 +2342,9 @@ sub trace { return main::trace(@_); }
      # makes for easier testing later on.
      main::set_access('early', \%early, 'c');
  
+    my %only_early;
+    main::set_access('only_early', \%only_early, 'c');
+
      my %required_even_in_debug_skip;
      # debug_skip is used to speed up compilation during debugging by skipping
      # processing files that are not needed for the task at hand.  However,
@@ -2319,6 +2375,7 @@ sub trace { return main::trace(@_); }
  
          # Set defaults
          $handler{$addr} = \&main::process_generic_property_file;
+        $retain_trailing_comments{$addr} = 0;
          $non_skip{$addr} = 0;
          $skip{$addr} = undef;
          $has_missings_defaults{$addr} = $NO_DEFAULTS;
@@ -2399,6 +2456,11 @@ sub trace { return main::trace(@_); }
          my $progress;
          my $function_instead_of_file = 0;
  
+        if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
+            $only_early{$addr} = 1;
+            pop $early{$addr}->@*;
+        }
+
          # If we are compiling a Unicode release earlier than the file became
          # available, the constructor may have supplied a substitute
          if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
@@ -2407,6 +2469,8 @@ sub trace { return main::trace(@_); }
              unshift $early{$addr}->@*, 1;
  
              # See the definition of %early for what the array elements mean.
+            # Note that we have just unshifted onto the array, so the numbers
+            # below are +1 of those in the %early description.
              # If we have a property this defines, create a table and default
              # map for it now (at essentially compile time), so that it will be
              # available for the whole of run time.  (We will want to add this
@@ -2423,10 +2487,13 @@ sub trace { return main::trace(@_); }
                                                  Perl_Extension => 1,
                                                  );
  
-                # Use the default mapping for the regular property for this
-                # substitute one.
-                if (    defined $property{$addr}
-                    &&  defined $default_mapping{$property{$addr}})
+                # If not specified by the constructor, use the default mapping
+                # for the regular property for this substitute one.
+                if ($early{$addr}[4]) {
+                    $prop_object->set_default_map($early{$addr}[4]);
+                }
+                elsif (    defined $property{$addr}
+                       &&  defined $default_mapping{$property{$addr}})
                  {
                      $prop_object
                          ->set_default_map($default_mapping{$property{$addr}});
@@ -2632,7 +2699,7 @@ END
          # once per file, as it destroy's the EOF handlers
  
          # flag to make sure extracted files are processed early
-        state $seen_non_extracted_non_age = 0;
+        state $seen_non_extracted = 0;
  
          my $self = shift;
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
@@ -2645,7 +2712,7 @@ END
              $handle{$addr} = 'pretend_is_open';
          }
          else {
-            if ($seen_non_extracted_non_age) {
+            if ($seen_non_extracted) {
                  if ($file =~ /$EXTRACTED/i) # Some platforms may change the
                                              # case of the file's name
                  {
@@ -2662,13 +2729,12 @@ END
                      # We only do this check for generic property files
                      && $handler{$addr} == \&main::process_generic_property_file
  
-                    && $file !~ /$EXTRACTED/i
-                    && lc($file) ne 'dage.txt')
+                    && $file !~ /$EXTRACTED/i)
              {
                  # We don't set this (by the 'if' above) if we have no
                  # extracted directory, so if running on an early version,
                  # this test won't work.  Not worth worrying about.
-                $seen_non_extracted_non_age = 1;
+                $seen_non_extracted = 1;
              }
  
              # Mark the file as having being processed, and warn if it
@@ -2695,7 +2761,10 @@ END
                  # official property, we still have to allow the publicly
                  # inaccessible early name so that the core code which uses it
                  # will work regardless.
-                if (! $early{$addr}[0] && $early{$addr}->@* > 2) {
+                if (   ! $only_early{$addr}
+                    && ! $early{$addr}[0]
+                    && $early{$addr}->@* > 2)
+                {
                      my $early_property_name = $early{$addr}[2];
                      if ($property{$addr} ne $early_property_name) {
                          main::property_ref($property{$addr})
@@ -2972,9 +3041,21 @@ END
                  next;
              }
  
-            # Remove comments and trailing space, and skip this line if the
-            # result is empty
-            s/#.*//;
+            # Unless to keep, remove comments.  If to keep, ignore
+            # comment-only lines
+            if ($retain_trailing_comments{$addr}) {
+                next if / ^ \s* \# /x;
+
+                # But escape any single quotes (done in both the comment and
+                # non-comment portion; this could be a bug someday, but not
+                # likely)
+                s/'/\\'/g;
+            }
+            else {
+                s/#.*//;
+            }
+
+            # Remove trailing space, and skip this line if the result is empty
              s/\s+$//;
              next if /^$/;
  
@@ -3096,7 +3177,7 @@ END
  
  #   Not currently used, not fully tested.
  #    sub peek {
-#        # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
+#        # Non-destructive lookahead one non-adjusted, non-comment, non-blank
  #        # record.  Not callable from an each_line_handler(), nor does it call
  #        # an each_line_handler() on the line.
  #
@@ -5242,6 +5323,14 @@ use parent '-norequire', '_Range_List_Base';
          return $self->_add_delete('+', @_);
      }
  
+    sub replace_map {
+        # Replace a range
+
+        my $self = shift;
+
+        return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
+    }
+
      sub add_duplicate {
          # Adds entry to a range list which can duplicate an existing entry
  
@@ -5382,6 +5471,15 @@ sub trace { return main::trace(@_); }
      # used to override calculations.
      main::set_access('format', \%format, 'r', 'p_s');
  
+    my %has_dependency;
+    # A boolean that gives whether some other table in this property is
+    # defined as the complement of this table.  This is a crude, but currently
+    # sufficient, mechanism to make this table not get destroyed before what
+    # is dependent on it is.  Other dependencies could be added, so the name
+    # was chosen to reflect a more general situation than actually is
+    # currently the case.
+    main::set_access('has_dependency', \%has_dependency, 'r', 's');
+
      sub new {
          # All arguments are key => value pairs, which you can see below, most
          # of which match fields documented above.  Otherwise: Re_Pod_Entry,
@@ -5437,6 +5535,7 @@ sub trace { return main::trace(@_); }
          $note{$addr} = [ ];
          $file_path{$addr} = [ ];
          $locked{$addr} = "";
+        $has_dependency{$addr} = 0;
  
          push @{$description{$addr}}, $description if $description;
          push @{$note{$addr}}, $note if $note;
@@ -5717,6 +5816,9 @@ END
          }
  
          # Look at each alias
+        my $is_last_resort = 0;
+        my $deprecated_or_discouraged
+                                = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
          foreach my $alias ($self->aliases()) {
  
              # Don't use an alias that isn't ok to use for an external name.
@@ -5725,10 +5827,13 @@ END
              my $name = main::Standardize($alias->name);
              trace $self, $name if main::DEBUG && $to_trace;
  
-            # Take the first one, or a shorter one that isn't numeric.  This
+            # Take the first one, or any non-deprecated non-discouraged one
+            # over one that is, or a shorter one that isn't numeric.  This
              # relies on numeric aliases always being last in the array
              # returned by aliases().  Any alpha one will have precedence.
-            if (! defined $short_name{$addr}
+            if (   ! defined $short_name{$addr}
+                || (   $is_last_resort
+                    && $alias->status !~ $deprecated_or_discouraged)
                  || ($name =~ /\D/
                      && length($name) < length($short_name{$addr})))
              {
@@ -5736,14 +5841,16 @@ END
                  ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
  
                  $nominal_short_name_length{$addr} = length $name;
+                $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
              }
          }
  
          # If the short name isn't a nice one, perhaps an equivalent table has
          # a better one.
-        if (! defined $short_name{$addr}
-            || $short_name{$addr} eq ""
-            || $short_name{$addr} eq "_")
+        if (   $self->can('children')
+            && (   ! defined $short_name{$addr}
+                || $short_name{$addr} eq ""
+                || $short_name{$addr} eq "_"))
          {
              my $return;
              foreach my $follower ($self->children) {    # All equivalents
@@ -6252,6 +6359,22 @@ END
                      }
  
                      if ($write_as_invlist) {
+                        if (   $previous_end > 0
+                            && $output_range_counts{$addr})
+                        {
+                            my $complement_count = $start - $previous_end - 1;
+                            if ($complement_count > 1) {
+                                $OUT[-1] = merge_single_annotation_line(
+                                    $OUT[-1],
+                                       "#"
+                                     . (" " x 17)
+                                     . "["
+                                     .  main::clarify_code_point_count(
+                                                            $complement_count)
+                                      . "] in complement\n",
+                                    $comment_indent);
+                            }
+                        }
  
                          # Inversion list format has a single number per line,
                          # the starting code point of a range that matches the
@@ -6380,7 +6503,15 @@ END
                                      $range_name = "Hangul Syllable";
                                  }
  
-                                if ($i != $start || $range_end < $end) {
+                                # If the annotation would just repeat what's
+                                # already being output as the range, skip it.
+                                # (When an inversion list is being written, it
+                                # isn't a repeat, as that always is in
+                                # decimal)
+                                if (   $write_as_invlist
+                                    || $i != $start
+                                    || $range_end < $end)
+                                {
                                      if ($range_end < $MAX_WORKING_CODEPOINT)
                                      {
                                          $annotation = sprintf "%04X..%04X",
@@ -6394,6 +6525,7 @@ END
                                  else { # Indent if not displaying code points
                                      $annotation = " " x 4;
                                  }
+
                                  if ($range_name) {
                                      $annotation .= " $age[$i]" if $age[$i];
                                      $annotation .= " $range_name";
@@ -6936,18 +7068,29 @@ sub trace { return main::trace(@_); }
  
      sub set_default_map {
          # Define what code points that are missing from the input files should
-        # map to
+        # map to.  The optional second parameter 'full_name' indicates to
+        # force using the full name of the map instead of its standard name.
  
          my $self = shift;
          my $map = shift;
+        my $use_full_name = shift // 0;
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
  
+        if ($use_full_name && $use_full_name ne 'full_name') {
+            Carp::my_carp_bug("Second parameter to set_default_map() if"
+                            . " present, must be 'full_name'");
+        }
+
          my $addr = do { no overloading; pack 'J', $self; };
  
          # Convert the input to the standard equivalent, if any (won't have any
          # for $STRING properties)
-        my $standard = $self->_find_table_from_alias->{$map};
-        $map = $standard->name if defined $standard;
+        my $standard = $self->property->table($map);
+        if (defined $standard) {
+            $map = ($use_full_name)
+                   ? $standard->full_name
+                   : $standard->name;
+        }
  
          # Warn if there already is a non-equivalent default map for this
          # property.  Note that a default map can be a ref, which means that
@@ -7597,6 +7740,7 @@ END
      # Accessors for the underlying list that should fail if locked.
      for my $sub (qw(
                      add_duplicate
+                    replace_map
                  ))
      {
          no strict "refs";
@@ -7868,13 +8012,23 @@ sub trace { return main::trace(@_); }
          # complement's if it has one.
  
          my $self = shift;
-        my $complement;
-        if (($complement = $self->complement) != 0) {
-            return ~ $complement->_range_list;
-        }
-        else {
-            return $self->SUPER::_range_list;
+        my $complement = $self->complement;
+
+        # In order to avoid re-complementing on each access, only do the
+        # complement the first time, and store the result in this table's
+        # range list to use henceforth.  However, this wouldn't work if the
+        # controlling (complement) table changed after we do this, so lock it.
+        # Currently, the value of the complement isn't needed until after it
+        # is fully constructed, so this works.  If this were to change, the
+        # each_range iteration functionality would no longer work on this
+        # complement.
+        if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
+            $self->_set_range_list($self->SUPER::_range_list
+                                + ~ $complement->_range_list);
+            $complement->lock;
          }
+
+        return $self->SUPER::_range_list;
      }
  
      sub add_alias {
@@ -8076,6 +8230,15 @@ sub trace { return main::trace(@_); }
          }
          my $addr = do { no overloading; pack 'J', $self; };
          $complement{$addr} = $other;
+
+        # Be sure the other property knows we are depending on them; or the
+        # other table if it is one in the current property.
+        if ($self->property != $other->property) {
+            $other->property->set_has_dependency(1);
+        }
+        else {
+            $other->set_has_dependency(1);
+        }
          $self->lock;
          return;
      }
@@ -8662,6 +8825,15 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
      main::set_access('pre_declared_maps',
                                      \%pre_declared_maps, 'r', 's');
  
+    my %has_dependency;
+    # A boolean that gives whether some table somewhere is defined as the
+    # complement of a table in this property.  This is a crude, but currently
+    # sufficient, mechanism to make this property not get destroyed before
+    # what is dependent on it is.  Other dependencies could be added, so the
+    # name was chosen to reflect a more general situation than actually is
+    # currently the case.
+    main::set_access('has_dependency', \%has_dependency, 'r', 's');
+
      sub new {
          # The only required parameter is the positionally first, name.  All
          # other parameters are key => value pairs.  See the documentation just
@@ -8700,6 +8872,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
          $has_only_code_point_maps{$addr} = 1;
          $table_ref{$addr} = { };
          $unique_maps{$addr} = { };
+        $has_dependency{$addr} = 0;
  
          $map{$addr} = Map_Table->new($name,
                                      Full_Name => $full_name{$addr},
@@ -9212,6 +9385,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
                      range_count
                      ranges
                      range_size_1
+                    replace_map
                      reset_each_range
                      set_comment
                      set_default_map
@@ -10563,8 +10737,8 @@ ea ; W         ; Wide
  END
      }
  
-    if (-e 'LineBreak.txt') {
-        push @return, split /\n/, <<'END';
+    if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
+        my @lb = split /\n/, <<'END';
  lb ; AI        ; Ambiguous
  lb ; AL        ; Alphabetic
  lb ; B2        ; Break_Both
@@ -10595,6 +10769,12 @@ lb ; SY        ; Break_Symbols
  lb ; XX        ; Unknown
  lb ; ZW        ; ZWSpace
  END
+        # If this Unicode version predates the lb property, we use our
+        # substitute one
+        if (-e 'LBsubst.txt') {
+            $_ = s/^lb/_Perl_LB/r for @lb;
+        }
+        push @return, @lb;
      }
  
      if (-e 'DNormalizationProps.txt') {
@@ -11581,7 +11761,16 @@ END
                                            . $CMD_DELIM
                                            . $fields[$CHARNAME];
              }
-            elsif ($fields[$CHARNAME] =~ /^CJK/) {
+            elsif ($fields[$CATEGORY] eq 'Lo') {    # Is a letter
+
+                # All the CJK ranges like this have the name given as a
+                # special case in the next code line.  And for the others, we
+                # hope that Unicode continues to use the correct name in
+                # future releases, so we don't have to make further special
+                # cases.
+                my $name = ($fields[$CHARNAME] =~ /^CJK/)
+                           ? 'CJK UNIFIED IDEOGRAPH'
+                           : uc $fields[$CHARNAME];
  
                  # The name for these contains the code point itself, and all
                  # are defined to have the same base name, regardless of what
@@ -11593,7 +11782,7 @@ END
                                             . '='
                                             . $CP_IN_NAME
                                             . $CMD_DELIM
-                                           . 'CJK UNIFIED IDEOGRAPH';
+                                           . $name;
  
              }
              elsif ($fields[$CATEGORY] eq 'Co'
@@ -12011,6 +12200,18 @@ sub process_GCB_test {
      return;
  }
  
+sub process_LB_test {
+
+    my $file = shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    while ($file->next_line) {
+        push @LB_tests, $_;
+    }
+
+    return;
+}
+
  sub process_SB_test {
  
      my $file = shift;
@@ -12096,6 +12297,24 @@ sub process_NamedSequences {
      }
  }
  
+sub filter_substitute_lb {
+    # Used on Unicodes that predate the LB property, where there is a
+    # substitute file.  This just does the regular ea_lb handling for such
+    # files, and then substitutes the long property value name for the short
+    # one that comes with the file.  (The other break files have the long
+    # names in them, so this is the odd one out.)  The reason for doing this
+    # kludge is that regen/mk_invlists.pl is expecting the long name.  This
+    # also fixes the typo 'Inseperable' that leads to problems.
+
+    filter_early_ea_lb;
+    return unless $_;
+
+    my @fields = split /\s*;\s*/;
+    $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
+    $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
+    $_ = join '; ', @fields;
+}
+
  sub filter_old_style_arabic_shaping {
      # Early versions used a different term for the later one.
  
@@ -13253,9 +13472,9 @@ END
  
      # For each property, fill in any missing mappings, and calculate the re
      # match tables.  If a property has more than one missing mapping, the
-    # default is a reference to a data structure, and requires data from other
-    # properties to resolve.  The sort is used to cause these to be processed
-    # last, after all the other properties have been calculated.
+    # default is a reference to a data structure, and may require data from
+    # other properties to resolve.  The sort is used to cause these to be
+    # processed last, after all the other properties have been calculated.
      # (Fortunately, the missing properties so far don't depend on each other.)
      foreach my $property
          (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
@@ -13499,6 +13718,18 @@ numerals.
  END
      ));
  
+    # Make sure this assumption in perl core code is valid in this Unicode
+    # release, with known exceptions
+    foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
+        next if $range->end - $range->start == 9;
+        next if $range->start == 0x1D7CE;   # This whole range was added in 3.1
+        next if $range->end == 0x19DA && $v_version eq v5.2.0;
+        next if $range->end - $range->start < 9 && $v_version le 4.0.0;
+        Carp::my_carp("Range $range unexpectedly doesn't contain 10"
+                    . " decimal digits.  Code in regcomp.c assumes it does,"
+                    . " and will have to be fixed.  Proceeding anyway.");
+    }
+
      Property->new('Legacy_Case_Folding',
                      File => "Fold",
                      Directory => $map_directory,
@@ -13565,7 +13796,7 @@ sub pre_3_dot_1_Nl () {
      return $Nl;
  }
  
-sub calculate_Assigned() {  # Calculate the gc != Cn code points; may be
+sub calculate_Assigned() {  # Set $Assigned to the gc != Cn code points; may be
                              # called before the Cn's are completely filled.
                              # Works on Unicodes earlier than ones that
                              # explicitly specify Cn.
@@ -13686,8 +13917,10 @@ sub handle_compare_versions () {
      # since the first compare version.
      my $delta = Range_List->new();
      foreach my $table ($age->tables) {
+        use version;
          next if $table == $age->table('Unassigned');
-        next if $table->name le $string_compare_versions;
+        next if version->parse($table->name)
+             le version->parse($string_compare_versions);
          $delta += $table;
      }
      if ($delta->is_empty) {
@@ -13710,6 +13943,9 @@ sub handle_compare_versions () {
          next if     $this_block == $no_block
                  ||  ! ($this_block & $Assigned)->is_empty;
          $this_block->set_fate($SUPPRESSED, $after_first_version);
+        foreach my $range ($this_block->ranges) {
+            $block->replace_map($range->start, $range->end, 'No_Block')
+        }
          $no_block += $this_block;
      }
  
@@ -14273,7 +14509,8 @@ sub compile_perl() {
  
      my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
                                  Description => '\w, including beyond ASCII;'
-                                            . ' = \p{Alnum} + \pM + \p{Pc}',
+                                            . ' = \p{Alnum} + \pM + \p{Pc}'
+                                            . ' + \p{Join_Control}',
                                  Initialize => $Alnum + $gc->table('Mark'),
                                  );
      my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
@@ -14708,7 +14945,6 @@ sub compile_perl() {
                                      + ord("(")
                                      + ord(")")
                                      + ord("-")
-                                    + utf8::unicode_to_native(0xA0) # NBSP
                          );
  
      my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
@@ -14998,8 +15234,8 @@ END
                                                     0x2060 .. 0x206F,
                                                     0xFE00 .. 0xFE0F,
                                                     0xFFF0 .. 0xFFFB,
-                                                   0xE0000 .. 0xE0FFF,
                                                    ]);
+        $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
          $quotemeta += $temp;
      }
      calculate_DI();
@@ -15021,6 +15257,95 @@ END
          }
      }
  
+    # Perl tailors the WordBreak property so that \b{wb} doesn't split
+    # adjacent spaces into separate words.  First create a copy of the regular
+    # WB property as '_Perl_WB'.  (On Unicode releases earlier than when WB
+    # was defined for, this will already have been done by the substitute file
+    # portion for 'Input_file' code for WB.)
+    my $perl_wb = property_ref('_Perl_WB');
+    if (! defined $perl_wb) {
+        $perl_wb = Property->new('_Perl_WB',
+                                 Fate => $INTERNAL_ONLY,
+                                 Perl_Extension => 1,
+                                 Directory => $map_directory,
+                                 Type => $STRING);
+        my $wb = property_ref('Word_Break');
+        $perl_wb->initialize($wb);
+        $perl_wb->set_default_map($wb->default_map);
+    }
+
+    # And simply replace the mappings of horizontal space characters that
+    # otherwise would map to the default to instead map to our tailoring.
+    my $default = $perl_wb->default_map;
+    for my $range ($Blank->ranges) {
+        for my $i ($range->start .. $range->end) {
+            next unless $perl_wb->value_of($i) eq $default;
+            $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
+                              Replace => $UNCONDITIONALLY);
+        }
+    }
+
+    # Create a version of the LineBreak property with the mappings that are
+    # omitted in the default algorithm remapped to what
+    # http://www.unicode.org/reports/tr14 says they should be.
+    #
+    # Original            Resolved  General_Category
+    # AI, SG, XX      AL      Any
+    # SA              CM      Only Mn or Mc
+    # SA              AL      Any except Mn and Mc
+    # CJ              NS      Any
+    #
+    # All property values are also written out in their long form, as
+    # regen/mk_invlist.pl expects that.  This also fixes occurrences of the
+    # typo in early Unicode versions: 'inseperable'.
+    my $perl_lb = property_ref('_Perl_LB');
+    if (! defined $perl_lb) {
+        $perl_lb = Property->new('_Perl_LB',
+                                 Fate => $INTERNAL_ONLY,
+                                 Perl_Extension => 1,
+                                 Directory => $map_directory,
+                                 Type => $STRING);
+        my $lb = property_ref('Line_Break');
+
+        # Populate from $lb, but use full name and fix typo.
+        foreach my $range ($lb->ranges) {
+            my $full_name = $lb->table($range->value)->full_name;
+            $full_name = 'Inseparable'
+                                if standardize($full_name) eq 'inseperable';
+            $perl_lb->add_map($range->start, $range->end, $full_name);
+        }
+    }
+
+    $perl_lb->set_default_map('Alphabetic', 'full_name');    # XX -> AL
+
+    for my $range ($perl_lb->ranges) {
+        my $value = standardize($range->value);
+        if (   $value eq standardize('Unknown')
+            || $value eq standardize('Ambiguous')
+            || $value eq standardize('Surrogate'))
+        {
+            $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
+                              Replace => $UNCONDITIONALLY);
+        }
+        elsif ($value eq standardize('Conditional_Japanese_Starter')) {
+            $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
+                              Replace => $UNCONDITIONALLY);
+        }
+        elsif ($value eq standardize('Complex_Context')) {
+            for my $i ($range->start .. $range->end) {
+                my $gc_val = $gc->value_of($i);
+                if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
+                    $perl_lb->add_map($i, $i, 'Combining_Mark',
+                                      Replace => $UNCONDITIONALLY);
+                }
+                else {
+                    $perl_lb->add_map($i, $i, 'Alphabetic',
+                                      Replace => $UNCONDITIONALLY);
+                }
+            }
+        }
+    }
+
      # Here done with all the basic stuff.  Ready to populate the information
      # about each character if annotating them.
      if ($annotate) {
@@ -15048,7 +15373,7 @@ sub add_perl_synonyms() {
      # the single-form, \p{name}.  These are:
      #   All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
      #       \p{Is_Name} as synonyms
-    #   \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
+    #   \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
      #   \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
      #   \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
      #       conflict, \p{Value} and \p{Is_Value} as well
@@ -15062,8 +15387,15 @@ sub add_perl_synonyms() {
                                                              property_ref('*');
      push @tables, $gc->tables;
  
-    # If the version of Unicode includes the Script property, add its tables
-    push @tables, $script->tables if defined $script;
+    # If the version of Unicode includes the Script Extensions (preferably),
+    # or Script property, add its tables
+    my $scx = property_ref("Script_Extensions");
+    if (defined $scx) {
+        push @tables, $scx->tables;
+    }
+    else {
+        push @tables, $script->tables if defined $script;
+    }
  
      # The Block tables are kept separate because they are treated differently.
      # And the earliest versions of Unicode didn't include them, so add only if
@@ -15126,18 +15458,19 @@ sub add_perl_synonyms() {
  
                  if (! defined $pre_existing) {
  
-                    # No name collision, so ok to add the perl synonym.
+                    # No name collision, so OK to add the perl synonym.
  
                      my $make_re_pod_entry;
                      my $ok_as_filename;
                      my $status = $alias->status;
                      if ($nominal_property == $block) {
  
-                        # For block properties, the 'In' form is preferred for
-                        # external use; the pod file contains wild cards for
-                        # this and the 'Is' form so no entries for those; and
-                        # we don't want people using the name without the
-                        # 'In', so discourage that.
+                        # For block properties, only the compound form is
+                        # preferred for external use; the others are
+                        # discouraged.  The pod file contains wild cards for
+                        # the 'In' and 'Is' forms so no entries for those; and
+                        # we don't want people using the name without any
+                        # prefix, so discourage that.
                          if ($prefix eq "") {
                              $make_re_pod_entry = 1;
                              $status = $status || $DISCOURAGED;
@@ -15145,7 +15478,7 @@ sub add_perl_synonyms() {
                          }
                          elsif ($prefix eq 'In_') {
                              $make_re_pod_entry = 0;
-                            $status = $status || $NORMAL;
+                            $status = $status || $DISCOURAGED;
                              $ok_as_filename = 1;
                          }
                          else {
@@ -15220,7 +15553,7 @@ sub add_perl_synonyms() {
                      next;
                  }
  
-                # Here, there is a name collision, but it still could be ok if
+                # Here, there is a name collision, but it still could be OK if
                  # the tables match the identical set of code points, in which
                  # case, we can combine the names.  Compare each table's code
                  # point list to see if they are identical.
@@ -15924,7 +16257,7 @@ sub make_re_pod_entries($) {
                  # And if this is a compound form name, see if there is a
                  # single form equivalent
                  my $single_form;
-                if ($table_property != $perl) {
+                if ($table_property != $perl && $table_property != $block) {
  
                      # Special case the binary N tables, so that will print
                      # \P{single}, but use the Y table values to populate
@@ -16186,7 +16519,7 @@ sub make_ucd_table_pod_entries {
                      Carp::my_carp("Bad news.  $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations.  Proceeding anyway.");
                  }
  
-                # We modifiy the info column of the one being output to
+                # We modify the info column of the one being output to
                  # indicate the ambiguity.  Set $which to point to that one's
                  # info.
                  my $which;
@@ -16248,27 +16581,69 @@ sub pod_alphanumeric_sort {
          return -1
      }
  
-    # Determine if the two operands are numeric property values or not.
-    # A numeric property will look like xyz: 3.  But the number
-    # can begin with an optional minus sign, and may have a
-    # fraction or rational component, like xyz: 3/2.  If either
-    # isn't numeric, use alphabetic sort.
-    my ($a_initial, $a_number) =
-        ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
-    return $a cmp $b unless defined $a_number;
-    my ($b_initial, $b_number) =
-        ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
-    return $a cmp $b unless defined $b_number;
-
-    # Here they are both numeric, but use alphabetic sort if the
-    # initial parts don't match
-    return $a cmp $b if $a_initial ne $b_initial;
+    # Determine if the two operands are compound or not, and if so if are
+    # "numeric" property values or not, like \p{Age: 3.0}.  But there are also
+    # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
+    # all of which this considers numeric, and for sorting, looks just at the
+    # numeric parts.  It can also be a rational like \p{Numeric Value=-1/2}.
+    my $split_re = qr/
+        ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
+                     # property name
+        [:=] \s*     # The syntax for the compound form
+        (?:          # followed by ...
+            (        # $2 gets defined if what follows is a "numeric"
+                     # expression, which is ...
+              ( -? \d+ (?: [.\/] \d+)?  # An integer, float, or rational
+                                        # number, optionally signed
+               | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131.  Either
+                                         # of these go into $3
+             | ( V \d+ _ \d+ )           # or a Unicode's Age property version
+                                         # number, into $4
+            )
+            | .* $    # If not "numeric", accept anything so that $1 gets
+                      # defined if it is any compound form
+        ) /ix;
+    my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
+    my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
+
+    # Sort alphabeticlly on the whole property name if either operand isn't
+    # compound, or they differ.
+    return $a cmp $b if   ! defined $a_initial
+                       || ! defined $b_initial
+                       || $a_initial ne $b_initial;
+
+    if (! defined $a_numeric) {
+
+        # If neither is numeric, use alpha sort
+        return $a cmp $b if ! defined $b_numeric;
+        return 1;  # Sort numeric ahead of alpha
+    }
+
+    # Here $a is numeric
+    return -1 if ! defined $b_numeric;  # Numeric sorts before alpha
+
+    # Here they are both numeric in the same property.
+    # Convert version numbers into regular numbers
+    if (defined $a_version) {
+        ($a_number = $a_version) =~ s/^V//i;
+        $a_number =~ s/_/./;
+    }
+    else {  # Otherwise get rid of the, e.g., CCC in CCC9 */
+        $a_number =~ s/ ^ [[:alpha:]]+ //x;
+    }
+    if (defined $b_version) {
+        ($b_number = $b_version) =~ s/^V//i;
+        $b_number =~ s/_/./;
+    }
+    else {
+        $b_number =~ s/ ^ [[:alpha:]]+ //x;
+    }
  
      # Convert rationals to floating for the comparison.
      $a_number = eval $a_number if $a_number =~ qr{/};
      $b_number = eval $b_number if $b_number =~ qr{/};
  
-    return $a_number <=> $b_number;
+    return $a_number <=> $b_number || $a cmp $b;
  }
  
  sub make_pod () {
@@ -16292,20 +16667,22 @@ sub make_pod () {
                                                  '\p{Block: *}'
                                                      . (($has_In_conflicts)
                                                        ? " $exception_message"
-                                                      : ""));
+                                                      : ""),
+                                                 $DISCOURAGED);
          @block_warning = << "END";
  
-Matches in the Block property have shortcuts that begin with "In_".  For
-example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>.  For
-backward compatibility, if there is no conflict with another shortcut, these
-may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>.  But, N.B., there
-are numerous such conflicting shortcuts.  Use of these forms for Block is
-discouraged, and are flagged as such, not only because of the potential
-confusion as to what is meant, but also because a later release of Unicode may
-preempt the shortcut, and your program would no longer be correct.  Use the
-"In_" form instead to avoid this, or even more clearly, use the compound form,
-e.g., C<\\p{blk:latin1}>.  See L<perlunicode/"Blocks"> for more information
-about this.
+In particular, matches in the Block property have single forms
+defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
+all,  Like all B<DISCOURAGED> forms, these are not stable.  For example,
+C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
+C<\\p{Is_Deseret}>, or C<\\p{Deseret}>.  But, a new Unicode version may
+come along that would force Perl to change the meaning of one or more of
+these, and your program would no longer be correct.  Currently there are no
+such conflicts with the form that begins C<"In_">, but there are many with the
+other two shortcuts, and Unicode continues to define new properties that begin
+with C<"In">, so it's quite possible that a conflict will occur in the future.
+The compound form is guaranteed to not become obsolete, and its meaning is
+clearer anyway.  See L<perlunicode/"Blocks"> for more information about this.
  END
      }
      my $text = $Is_flags_text;
@@ -16439,7 +16816,7 @@ END
          push @bad_re_properties, "\n=back\n";
      }
  
-    # Similiarly, generate a list of files that we don't use, grouped by the
+    # Similarly, generate a list of files that we don't use, grouped by the
      # reasons why (Don't output if the reason is empty).  First, create a hash
      # whose keys are the reasons, and whose values are anonymous arrays of all
      # the files that share that reason.
@@ -16648,18 +17025,21 @@ Properties marked with $a_bold_obsolete in the table are considered (plain)
  obsolete.  Generally this designation is given to properties that Unicode once
  used for internal purposes (but not any longer).
  
-=back
+=item Discouraged
+
+This is not actually a Unicode-specified obsolescence, but applies to certain
+Perl extensions that are present for backwards compatibility, but are
+discouraged from being used.  These are not obsolete, but their meanings are
+not stable.  Future Unicode versions could force any of these extensions to be
+removed without warning, replaced by another property with the same name that
+means something different.  $A_bold_discouraged flags each such entry in the
+table.  Use the equivalent shown instead.
  
-Some Perl extensions are present for backwards compatibility and are
-discouraged from being used, but are not obsolete.  $A_bold_discouraged
-flags each such entry in the table.  Future Unicode versions may force
-some of these extensions to be removed without warning, replaced by another
-property with the same name that means something different.  Use the
-equivalent shown instead.
+@block_warning
  
  =back
  
-@block_warning
+=back
  
  The table below has two columns.  The left column contains the C<\\p{}>
  constructs to look up, possibly preceded by the flags mentioned above; and
@@ -16724,34 +17104,20 @@ B<Legend summary:>
  
  =over 4
  
-=item *
+=item Z<>B<*> is a wild-card
  
-B<*> is a wild-card
-
-=item *
-
-B<(\\d+)> in the info column gives the number of Unicode code points matched
+=item B<(\\d+)> in the info column gives the number of Unicode code points matched
  by this property.
  
-=item *
-
-B<$DEPRECATED> means this is deprecated.
-
-=item *
-
-B<$OBSOLETE> means this is obsolete.
-
-=item *
+=item B<$DEPRECATED> means this is deprecated.
  
-B<$STABILIZED> means this is stabilized.
+=item B<$OBSOLETE> means this is obsolete.
  
-=item *
+=item B<$STABILIZED> means this is stabilized.
  
-B<$STRICTER> means tighter (stricter) name matching applies.
+=item B<$STRICTER> means tighter (stricter) name matching applies.
  
-=item *
-
-B<$DISCOURAGED> means use of this form is discouraged, and may not be
+=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
  stable.
  
  =back
@@ -17361,7 +17727,7 @@ sub make_UCD () {
              foreach my $prop_alias ($property->aliases) {
                  my $prop_alias_name = standardize($prop_alias->name);
  
-                # If no =value, there's just one combination possibe for this
+                # If no =value, there's just one combination possible for this
                  if (! $value_name) {
  
                      # The property may be suppressed, but there may be a proxy
@@ -17753,7 +18119,7 @@ sub write_all_tables() {
                      # the children.
                      make_re_pod_entries($table) if defined $pod_directory;
  
-                    # See if the the table matches identical code points with
+                    # See if the table matches identical code points with
                      # something that has already been output.  In that case,
                      # no need to have two files with the same code points in
                      # them.  We use the table's hash() method to store these
@@ -17916,7 +18282,7 @@ sub write_all_tables() {
                          # the table.  That is, all the property-values given
                          # by this table.  By agreement with Unicode::UCD,
                          # if the name and full name are identical, and there
-                        # are no other names, drop the duplcate entry to save
+                        # are no other names, drop the duplicate entry to save
                          # memory.
                          if (@values_list == 2
                              && $values_list[0] eq $values_list[1])
@@ -18080,14 +18446,14 @@ sub generate_separator($) {
  
  sub generate_tests($$$$$) {
      # This used only for making the test script.  It generates test cases that
-    # are expected to compile successfully in perl.  Note that the lhs and
-    # rhs are assumed to already be as randomized as the caller wants.
+    # are expected to compile successfully in perl.  Note that the LHS and
+    # RHS are assumed to already be as randomized as the caller wants.
  
      my $lhs = shift;           # The property: what's to the left of the colon
                                 #  or equals separator
      my $rhs = shift;           # The property value; what's to the right
      my $valid_code = shift;    # A code point that's known to be in the
-                               # table given by lhs=rhs; undef if table is
+                               # table given by LHS=RHS; undef if table is
                                 # empty
      my $invalid_code = shift;  # A code point known to not be in the table;
                                 # undef if the table is all code points
@@ -18128,13 +18494,13 @@ sub generate_error($$$) {
                                      # colon or equals separator
      my $rhs = shift;                # The property value; what's to the right
      my $already_in_error = shift;   # Boolean; if true it's known that the
-                                # unmodified lhs and rhs will cause an error.
+                                # unmodified LHS and RHS will cause an error.
                                  # This routine should not force another one
      # Get the colon or equal
      my $separator = generate_separator($lhs);
  
      # Since this is an error only, don't bother to randomly decide whether to
-    # put the error on the left or right side; and assume that the rhs is
+    # put the error on the left or right side; and assume that the RHS is
      # loosely matched, again for convenience rather than rigor.
      $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
  
@@ -18333,10 +18699,33 @@ sub make_property_test_script() {
      # or multiple lines. main::write doesn't count the lines.
      my @output;
  
+    push @output, <<'EOF_CODE';
+Error('\p{Script=InGreek}');    # Bug #69018
+Test_GCB("1100 $nobreak 1161");  # Bug #70940
+Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
+Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
+Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
+
+# Make sure this gets tested; it was not part of the official test suite at
+# the time this was added.  Note that this is as it would appear in the
+# official suite, and gets modified to check for the perl tailoring by
+# Test_WB()
+Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
+Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
+EOF_CODE
+
      # Sort these so get results in same order on different runs of this
      # program
-    foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
-        foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
+    foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
+                                    or
+                                 lc $a->name cmp lc $b->name
+                               } property_ref('*'))
+    {
+        foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
+                                    or
+                                  lc $a->name cmp lc $b->name
+                                } $property->tables)
+        {
  
              # Find code points that match, and don't match this table.
              my $valid = $table->get_valid_code_point;
@@ -18462,6 +18851,18 @@ sub make_property_test_script() {
                                  # conflating is possible.  In our example, we
                                  # don't want 2/3 matching 7/10, if there is
                                  # a 7/10 code point.
+
+                                # First, integers are not in the rationals
+                                # table.  Don't generate an error if this
+                                # rounds to an integer using the given
+                                # precision.
+                                my $round = sprintf "%.0f", $table_name;
+                                next PLACE if abs($table_name - $round)
+                                                        < $MAX_FLOATING_SLOP;
+
+                                # Here, isn't close enough to an integer to be
+                                # confusable with one.  Now, see it it's
+                                # "close" to a known rational
                                  for my $existing
                                          (keys %nv_floating_to_rational)
                                  {
@@ -18490,18 +18891,74 @@ sub make_property_test_script() {
                      }
                  }
              }
-        }
-    }
+            $table->DESTROY();
+        }
+        $property->DESTROY();
+    }
+
+    # Make any test of the boundary (break) properties TODO if the code
+    # doesn't match the version being compiled
+    my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
+                             ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
+                             : "\nsub TODO_FAILING_BREAKS { 0 }\n";
+
+    @output= map {
+        map s/^/    /mgr,
+        map "$_;\n",
+        split /;\n/, $_
+    } @output;
+
+    # Cause there to be 'if' statements to only execute a portion of this
+    # long-running test each time, so that we can have a bunch of .t's running
+    # in parallel
+    my $chunks = 10     # Number of test files
+               - 1      # For GCB & SB
+               - 1      # For WB
+               - 4;     # LB split into this many files
+    my @output_chunked;
+    my $chunk_count=0;
+    my $chunk_size= int(@output / $chunks) + 1;
+    while (@output) {
+        $chunk_count++;
+        my @chunk= splice @output, 0, $chunk_size;
+        push @output_chunked,
+            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+                @chunk,
+            "}\n";
+    }
+
+    $chunk_count++;
+    push @output_chunked,
+        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+            (map {"    Test_GCB('$_');\n"} @backslash_X_tests),
+            (map {"    Test_SB('$_');\n"} @SB_tests),
+        "}\n";
+
+
+    $chunk_size= int(@LB_tests / 4) + 1;
+    @LB_tests = map {"    Test_LB('$_');\n"} @LB_tests;
+    while (@LB_tests) {
+        $chunk_count++;
+        my @chunk= splice @LB_tests, 0, $chunk_size;
+        push @output_chunked,
+            "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+                @chunk,
+            "}\n";
+    }
+
+    $chunk_count++;
+    push @output_chunked,
+        "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
+            (map {"    Test_WB('$_');\n"} @WB_tests),
+        "}\n";
  
      &write($t_path,
             0,           # Not utf8;
             [$HEADER,
+            $TODO_FAILING_BREAKS,
              <DATA>,
-            @output,
-            (map {"Test_GCB('$_');\n"} @backslash_X_tests),
-            (map {"Test_SB('$_');\n"} @SB_tests),
-            (map {"Test_WB('$_');\n"} @WB_tests),
-            "Finished();\n"
+            @output_chunked,
+            "Finished();\n",
             ]);
  
      return;
@@ -18594,15 +19051,11 @@ my $Validation_Documentation = "Documentation of validation Tests";
  # This is a list of the input files and how to handle them.  The files are
  # processed in their order in this list.  Some reordering is possible if
  # desired, but the PropertyAliases and PropValueAliases files should be first,
-# and the extracted before the others except DAge.txt (as data in an extracted
-# file can be over-ridden by the non-extracted.  Some other files depend on
-# data derived from an earlier file, like UnicodeData requires data from Jamo,
-# and the case changing and folding requires data from Unicode.  Mostly, it is
-# safest to order by first version releases in (except the Jamo).  DAge.txt is
-# read before the extracted ones because of the rarely used feature
-# $compare_versions.  In the unlikely event that there were ever an extracted
-# file that contained the Age property information, it would have to go in
-# front of DAge.
+# and the extracted before the others (as data in an extracted file can be
+# over-ridden by the non-extracted.  Some other files depend on data derived
+# from an earlier file, like UnicodeData requires data from Jamo, and the case
+# changing and folding requires data from Unicode.  Mostly, it is safest to
+# order by first version releases in (except the Jamo).
  #
  # The version strings allow the program to know whether to expect a file or
  # not, but if a file exists in the directory, it will be processed, even if it
@@ -18624,10 +19077,6 @@ my @input_file_objects = (
                       Has_Missings_Defaults => $NOT_IGNORED,
                       Required_Even_in_Debug_Skip => 1,
                      ),
-    Input_file->new('DAge.txt', v3.2.0,
-                    Has_Missings_Defaults => $NOT_IGNORED,
-                    Property => 'Age'
-                   ),
      Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
                      Property => 'General_Category',
                     ),
@@ -18670,6 +19119,14 @@ my @input_file_objects = (
                      Property => 'Joining_Type',
                      Has_Missings_Defaults => $NOT_IGNORED,
                     ),
+    Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
+                    Skip => 'This file adds no new information not already'
+                          . ' present in other files',
+                    # And it's unnecessary programmer work to handle this new
+                    # format.  Previous Derived files actually had bug fixes
+                    # in them that were useful, but that should not be the
+                    # case here.
+                   ),
      Input_file->new('Jamo.txt', v2.0.0,
                      Property => 'Jamo_Short_Name',
                      Each_Line_Handler => \&filter_jamo_line,
@@ -18768,9 +19225,21 @@ my @input_file_objects = (
                      Has_Missings_Defaults => $NOT_IGNORED,
                      Property => 'Line_Break',
                      # Early versions had problematic syntax
-                    Each_Line_Handler => (($v_version lt v3.1.0)
-                                        ? \&filter_early_ea_lb
-                                        : undef),
+                    Each_Line_Handler => ($v_version ge v3.1.0)
+                                          ? undef
+                                          : ($v_version lt v3.0.0)
+                                            ? \&filter_substitute_lb
+                                            : \&filter_early_ea_lb,
+                    # Must use long names for property values see comments at
+                    # sub filter_substitute_lb
+                    Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
+                               'Alphabetic', # default to this because XX ->
+                                             # AL
+
+                               # Don't use _Perl_LB as a synonym for
+                               # Line_Break in later perls, as it is tailored
+                               # and isn't the same as Line_Break
+                               'ONLY_EARLY' ],
                     ),
      Input_file->new('EastAsianWidth.txt', v3.0.0,
                      Property => 'East_Asian_Width',
@@ -18849,6 +19318,10 @@ my @input_file_objects = (
                      Withdrawn => v5.1,
                      Skip => $Documentation,
                     ),
+    Input_file->new('DAge.txt', v3.2.0,
+                    Has_Missings_Defaults => $NOT_IGNORED,
+                    Property => 'Age'
+                   ),
      Input_file->new('HangulSyllableType.txt', v4.0,
                      Has_Missings_Defaults => $NOT_IGNORED,
                      Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
@@ -18865,13 +19338,14 @@ my @input_file_objects = (
                       # for the release it is.  To get it to actually mean
                       # something useful, someone would have to be using an
                       # earlier Unicode release, and copy it into the directory
-                     # for that release and recomplile.  So far there has been
+                     # for that release and recompile.  So far there has been
                       # no demand to do that, so this hasn't been implemented.
                      Skip => 'Documentation of corrections already '
                            . 'incorporated into the Unicode data base',
                     ),
      Input_file->new('StandardizedVariants.html', v3.2.0,
-                    Skip => 'Provides a visual display of the standard '
+                    Skip => 'Obsoleted as of Unicode 9.0, but previously '
+                          . 'provided a visual display of the standard '
                            . 'variant sequences derived from '
                            . 'F<StandardizedVariants.txt>.',
                          # I don't know why the html came earlier than the
@@ -18890,7 +19364,12 @@ my @input_file_objects = (
                      Skip => $Documentation,
                     ),
      Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
-                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
+                    Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
+
+                               # Don't use _Perl_WB as a synonym for
+                               # Word_Break in later perls, as it is tailored
+                               # and isn't the same as Word_Break
+                               'ONLY_EARLY' ],
                      Property => 'Word_Break',
                      Has_Missings_Defaults => $NOT_IGNORED,
                     ),
@@ -18901,18 +19380,21 @@ my @input_file_objects = (
                     ),
      Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
                      Handler => \&process_GCB_test,
+                    retain_trailing_comments => 1,
                     ),
      Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
                      Skip => $Validation_Documentation,
                     ),
      Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
                      Handler => \&process_SB_test,
+                    retain_trailing_comments => 1,
                     ),
      Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
                      Skip => $Validation_Documentation,
                     ),
      Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
                      Handler => \&process_WB_test,
+                    retain_trailing_comments => 1,
                     ),
      Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
                      Skip => $Validation_Documentation,
@@ -18962,7 +19444,8 @@ my @input_file_objects = (
                            . 'F<NamedSequences.txt> and recompile perl',
                     ),
      Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
-                    Skip => $Validation,
+                    Handler => \&process_LB_test,
+                    retain_trailing_comments => 1,
                     ),
      Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
                      Skip => $Validation_Documentation,
@@ -19076,6 +19559,19 @@ my @input_file_objects = (
                      Property => 'Indic_Positional_Category',
                      Has_Missings_Defaults => $NOT_IGNORED,
                     ),
+    Input_file->new('TangutSources.txt', v9.0.0,
+                    Skip => 'Specifies source mappings for Tangut ideographs'
+                          . ' and components. This data file also includes'
+                          . ' informative radical-stroke values that are used'
+                          . ' internally by Unicode',
+                   ),
+    Input_file->new('VerticalOrientation.txt', v10.0.0,
+                    Property => 'Vertical_Orientation',
+                    Has_Missings_Defaults => $NOT_IGNORED,
+                   ),
+    Input_file->new('NushuSources.txt', v10.0.0,
+                    Skip => 'Specifies source material for Nushu characters',
+                   ),
  );
  
  # End of all the preliminaries.
@@ -19309,10 +19805,10 @@ if (! $rebuild) {
      }
  }
  if (! $rebuild) {
-    print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
+    print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
      exit(0);
  }
-print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
+print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
  
  # Ready to do the major processing.  First create the perl pseudo-property.
  $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
@@ -19408,6 +19904,13 @@ if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
      }
      print "\nAll done\n" if $verbosity >= $VERBOSE;
  }
+
+if ($version_of_mk_invlist_bounds lt $v_version) {
+    Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
+                . " to be checked and possibly updated to Unicode"
+                . " $string_version");
+}
+
  exit(0);
  
  # TRAILING CODE IS USED BY make_property_test_script()
@@ -19465,7 +19968,7 @@ sub Expect($$$$) {
          $Tests++;
  
          # A string eval is needed because of the 'no warnings'.
-        # Assumes no parens in the regular expression
+        # Assumes no parentheses in the regular expression
          my $result = eval "$no_warnings
                              my \$RegObj = qr($regex);
                              $string =~ \$RegObj ? 1 : 0";
@@ -19521,28 +20024,52 @@ utf8::upgrade($breakable_utf8);
  my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
  utf8::upgrade($nobreak_utf8);
  
-use Config;
+my $are_ctype_locales_available;
  my $utf8_locale;
  chdir 't' if -d 't';
  eval { require "./loc_tools.pl" };
-$utf8_locale = &find_utf8_ctype_locale if defined &find_utf8_ctype_locale;
+if (defined &locales_enabled) {
+    $are_ctype_locales_available = locales_enabled('LC_CTYPE');
+    if ($are_ctype_locales_available) {
+        $utf8_locale = &find_utf8_ctype_locale;
+    }
+}
+
+# Eval'd so can run on versions earlier than the property is available in
+my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
+if (! defined $WB_Extend_or_Format_re) {
+    $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
+}
  
  sub _test_break($$) {
-    # Test qr/\X/ matches.  The input is a line from auxiliary/GCBTest.txt
-    # Each such line is a sequence of code points given by their hex numbers,
-    # separated by the two characters defined just before this subroutine that
-    # indicate that either there can or cannot be a break between the adjacent
-    # code points.  If there isn't a break, that means the sequence forms an
-    # extended grapheme cluster, which means that \X should match the whole
-    # thing.  If there is a break, \X should stop there.  This is all
-    # converted by this routine into a match:
-    #   $string =~ /(\X)/,
-    # Each \X should match the next cluster; and that is what is checked.
+    # Test various break property matches.  The 2nd parameter gives the
+    # property name.  The input is a line from auxiliary/*Test.txt for the
+    # given property.  Each such line is a sequence of Unicode (not native)
+    # code points given by their hex numbers, separated by the two characters
+    # defined just before this subroutine that indicate that either there can
+    # or cannot be a break between the adjacent code points.  All these are
+    # tested.
+    #
+    # For the gcb property extra tests are made.  if there isn't a break, that
+    # means the sequence forms an extended grapheme cluster, which means that
+    # \X should match the whole thing.  If there is a break, \X should stop
+    # there.  This is all converted by this routine into a match: $string =~
+    # /(\X)/, Each \X should match the next cluster; and that is what is
+    # checked.
  
      my $template = shift;
      my $break_type = shift;
  
      my $line   = (caller 1)[2];   # Line number
+    my $comment = "";
+
+    if ($template =~ / ( .*? ) \s* \# (.*) /x) {
+        $template = $1;
+        $comment = $2;
+
+        # Replace leading spaces with a single one.
+        $comment =~ s/ ^ \s* / # /x;
+    }
  
      # The line contains characters above the ASCII range, but in Latin1.  It
      # may or may not be in utf8, and if it is, it may or may not know it.  So,
@@ -19560,6 +20087,61 @@ sub _test_break($$) {
          $template =~ s/$breakable_utf8/$breakable/g;
      }
  
+    # Perl customizes wb.  So change the official tests accordingly
+    if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
+
+        # Split into elements that alternate between code point and
+        # break/no-break
+        my @line = split / +/, $template;
+
+        # Look at each code point and its following one
+        for (my $i = 1; $i <  @line - 1 - 1; $i+=2) {
+
+            # The customization only involves changing some breaks to
+            # non-breaks.
+            next if $line[$i+1] =~ /$nobreak/;
+
+            my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
+            my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
+
+            # And it only affects adjacent space characters.
+            next if $lhs !~ /\s/u;
+
+            # But, we want to make sure to test spaces followed by a Extend
+            # or Format.
+            next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
+
+            # To test the customization, add some white-space before this to
+            # create a span.  The $lhs white space may or may not be bound to
+            # that span, and also with the $rhs.  If the $rhs is a binding
+            # character, the $lhs is bound to it and not to the span, unless
+            # $lhs is vertical space.  In all other cases, the $lhs is bound
+            # to the span.  If the $rhs is white space, it is bound to the
+            # $lhs
+            my $bound;
+            my $span;
+            if ($rhs =~ /$WB_Extend_or_Format_re/) {
+                if ($lhs =~ /\v/) {
+                    $bound = $breakable;
+                    $span = $nobreak;
+                }
+                else {
+                    $bound = $nobreak;
+                    $span = $breakable;
+                }
+            }
+            else {
+                $span = $nobreak;
+                $bound = $nobreak;
+            }
+
+            splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
+            $i += 4;
+            $line[$i+1] = $bound;
+        }
+        $template = join " ", @line;
+    }
+
      # The input is just the break/no-break symbols and sequences of Unicode
      # code points as hex digits separated by spaces for legibility. e.g.:
      # ÷ 0020 × 0308 ÷ 0020 ÷
@@ -19602,9 +20184,13 @@ sub _test_break($$) {
              $display_upgrade = " (utf8-upgraded)";
          }
  
-        # The /l modifier has C after it to indicate the locale to try
-        my @modifiers = qw(a aa d lC u i);
-        push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+        my @modifiers = qw(a aa d u i);
+        if ($are_ctype_locales_available) {
+            push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+            # The /l modifier has C after it to indicate the locale to try
+            push @modifiers, "lC";
+        }
  
          # Test for each of the regex modifiers.
          for my $modifier (@modifiers) {
@@ -19614,13 +20200,7 @@ sub _test_break($$) {
              if ($modifier =~ / ^ l (.*) /x) {
                  my $locale = $1;
                  $display_locale = "(locale = $locale)";
-                use Config;
-                if (defined $Config{d_setlocale}) {
-                    eval { require POSIX; import POSIX 'locale_h'; };
-                    if (defined &POSIX::LC_CTYPE) {
-                        POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
-                    }
-                }
+                POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
                  $modifier = 'l';
              }
  
@@ -19628,12 +20208,24 @@ sub _test_break($$) {
              my $pattern = "(?$modifier:$break_pattern)";
  
              # Actually do the test
+            my $matched_text;
              my $matched = $string =~ qr/$pattern/;
-            print "not " unless $matched;
+            if ($matched) {
+                $matched_text = "matched";
+            }
+            else {
+                $matched_text = "failed to match";
+                print "not ";
+
+                if (TODO_FAILING_BREAKS) {
+                    $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
+                    $comment =~ s/#/# TODO/;
+                }
+            }
+            print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
  
-            # Fancy display of test results
-            $matched = ($matched) ? "matched" : "failed to match";
-            print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+            # Only print the comment on the first use of this line
+            $comment = "";
  
              # Repeat with the first \B{} in the pattern.  This makes sure the
              # code in regexec.c:find_byclass() for \B gets executed
@@ -19641,7 +20233,10 @@ sub _test_break($$) {
                  my $B_pattern = "$1$2";
                  $matched = $string =~ qr/$B_pattern/;
                  print "not " unless $matched;
-                print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+                $matched_text = ($matched) ? "matched" : "failed to match";
+                print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
+                print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
+                print "\n";
              }
          }
  
@@ -19664,9 +20259,11 @@ sub _test_break($$) {
                  }
                  print " correctly matched $should_display[$i]; line $line\n";
              } else {
-                $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
+                $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
                                                      split "", $matches[$i]);
-                print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
+                print "not ok $Tests -";
+                print " # TODO" if TODO_FAILING_BREAKS;
+                print " In \"$display_string\" =~ /(\\X)/g, \\X #",
                      $i + 1,
                      " should have matched $should_display[$i]",
                      " but instead matched $matches[$i]",
@@ -19680,7 +20277,9 @@ sub _test_break($$) {
          if (@matches == @should_match) {
              print "ok $Tests - Nothing was left over; line $line\n";
          } else {
-            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
+            print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
+            print " # TODO" if TODO_FAILING_BREAKS;
+            print "\n";
          }
      }
  
@@ -19691,6 +20290,10 @@ sub Test_GCB($) {
      _test_break(shift, 'gcb');
  }
  
+sub Test_LB($) {
+    _test_break(shift, 'lb');
+}
+
  sub Test_SB($) {
      _test_break(shift, 'sb');
  }
@@ -19704,8 +20307,3 @@ sub Finished() {
      exit($Fails ? -1 : 0);
  }
  
-Error('\p{Script=InGreek}');    # Bug #69018
-Test_GCB("1100 $nobreak 1161");  # Bug #70940
-Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
-Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
-Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726