# Any files created or read by this program should be listed in 'mktables.lst'
# Use -makelist to regenerate it.
-# Needs 'no overloading' to run faster on miniperl. Code commented out at the
-# subroutine objaddr can be used instead to work as far back (untested) as
-# 5.8: needs pack "U". But almost all occurrences of objaddr have been
-# removed in favor of using 'no overloading'. You also would have to go
-# through and replace occurrences like:
-# my $addr = do { no overloading; pack 'J', $self; }
-# with
-# my $addr = main::objaddr $self;
-# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
-# that instituted the change to main::objaddr, and subsequent commits that
-# changed 0+$self to pack 'J', $self.)
+# There was an attempt when this was first rewritten to make it 5.8
+# compatible, but that has now been abandoned, and newer constructs are used
+# as convenient.
my $start_time;
BEGIN { # Get the time the script started running; do it at compilation to
# As mentioned earlier, some properties are given in more than one file. In
# particular, the files in the extracted directory are supposedly just
# reformattings of the others. But they contain information not easily
-# derivable from the other files, including results for Unihan, which this
-# program doesn't ordinarily look at, and for unassigned code points. They
+# derivable from the other files, including results for Unihan (which isn't
+# usually available to this program) and for unassigned code points. They
# also have historically had errors or been incomplete. In an attempt to
# create the best possible data, this program thus processes them first to
# glean information missing from the other files; then processes those other
#
# There is a bug in the 3.2 data file in which some values for the
# kPrimaryNumeric property have commas and an unexpected comment. A filter
-# could be added for these; or for a particular installation, the Unihan.txt
-# file could be edited to fix them.
+# could be added to correct these; or for a particular installation, the
+# Unihan.txt file could be edited to fix them.
#
# HOW TO ADD A FILE TO BE PROCESSED
#
# handled by Unicode::Normalize, nor will it compile when presented a version
# that has them. However, you can trivially get it to compile by simply
# ignoring those decompositions, by changing the croak to a carp. At the time
-# of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
+# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
+# cpan/Unicode-Normalize/mkheader) reads
#
# croak("Weird Canonical Decomposition of U+$h");
#
-# Simply change to a carp. It will compile, but will not know about any three
-# character decomposition.
+# Simply comment it out. It will compile, but will not know about any three
+# character decompositions.
# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
# that the reason is that the CJK block starting at 4E00 was removed from
# name for the class, it would not have been affected, but if it used the
# mnemonic, it would have been.
#
-# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
+# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code
# points which eventually came to have this script property value, instead
# mapped to "Unknown". But in the next release all these code points were
# moved to \p{sc=common} instead.
+
+# The tests furnished by Unicode for testing WordBreak and SentenceBreak
+# generate errors in 5.0 and earlier.
#
# The default for missing code points for BidiClass is complicated. Starting
# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
|| $caller_name eq 'trace');
my $output = "";
+ #print STDERR __LINE__, ": ", join ", ", @input, "\n";
foreach my $string (@input) {
- #print STDERR __LINE__, ": ", join ", ", @input, "\n";
if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
$output .= simple_dumper($string);
}
# This is for a rarely used development feature that allows you to compare two
# versions of the Unicode standard without having to deal with changes caused
-# by the code points introduced in the later version. Change the 0 to a
-# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
-# code points introduced in that release and earlier will be used; later ones
-# are thrown away. You use the version number of the earliest one you want to
+# by the code points introduced in the later version. You probably also want
+# to use the -annotate option when using this. Change the 0 to a string
+# containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only code
+# points introduced in that release and earlier will be used; later ones are
+# thrown away. You use the version number of the earliest one you want to
# compare; then run this program on directory structures containing each
# release, and compare the outputs. These outputs will therefore include only
# the code points common to both releases, and you can see the changes caused
}
}
-# Enum values for to_output_map() method in the Map_Table package.
+# Enum values for to_output_map() method in the Map_Table package. (0 is don't
+# output)
my $EXTERNAL_MAP = 1;
my $INTERNAL_MAP = 2;
my $OUTPUT_ADJUSTED = 3;
# With this release, it is automatically handled if the Unihan db is
# downloaded
-push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
+push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version lt v5.2.0;
# There are several types of obsolete properties defined by Unicode. These
# must be hand-edited for every new Unicode release.
# existence is not noted in the comment.
'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
- 'Indic_Matra_Category' => "Provisional",
- 'Indic_Syllabic_Category' => "Provisional",
+ 'Indic_Matra_Category' => "Withdrawn by Unicode while still provisional",
# Don't suppress ISO_Comment, as otherwise special handling is needed
# to differentiate between it and gc=c, which can be written as 'isc',
EOF
-my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
+my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
+ ? "10FFFF"
+ : "FFFF";
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
# defaults for code points not listed (i.e., missing) in the file. The code
# depends on this ending with a semi-colon, so it can assume it is a valid
# field when the line is split() by semi-colons
-my $missing_defaults_prefix =
- qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
+my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
# Property types. Unicode has more types, but these are sufficient for our
# purposes.
# files
my %stricter_to_file_of; # same; but for stricter mapping.
my %loose_property_to_file_of; # Maps a loose property name to its map file
+my %strict_property_to_file_of; # Same, but strict
my @inline_definitions = "V0"; # Each element gives a definition of a unique
# inversion list. When a definition is inlined,
# its value in the hash it's in (one of the two
# defined just above) will include an index into
# this array. The 0th element is initialized to
- # the definition for a zero length invwersion list
+ # the definition for a zero length inversion list
my %file_to_swash_name; # Maps the file name to its corresponding key name
# in the hash %utf8::SwashInfo
my %nv_floating_to_rational; # maps numeric values floating point numbers to
# their rational equivalent
my %loose_property_name_of; # Loosely maps (non_string) property names to
# standard form
+my %strict_property_name_of; # Strictly maps (non_string) property names to
+ # standard form
my %string_property_loose_to_name; # Same, for string properties.
my %loose_defaults; # keys are of form "prop=value", where 'prop' is
# the property name in standard loose form, and
my $needing_code_points_ending_in_code_point = 0;
my @backslash_X_tests; # List of tests read in for testing \X
+my @SB_tests; # List of tests read in for testing \b{sb}
+my @WB_tests; # List of tests read in for testing \b{wb}
my @unhandled_properties; # Will contain a list of properties found in
# the input that we didn't process.
my @match_properties; # Properties that have match tables, to be
my $perl_charname;
my $print;
my $All;
+my $Assigned; # All assigned characters in this Unicode release
my $script;
# Are there conflicting names because of beginning with 'In_', or 'Is_'
containing_range($i)->end);
}
}
- elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases
- $viacode[$i] = $gc->value_of($i);
- $annotate_char_type[$i] = $UNKNOWN_TYPE;
- $printable[$i] = 0;
- }
- elsif ($gc-> table('Surrogate')->contains($i)) {
+ elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
$viacode[$i] = 'Surrogate';
$annotate_char_type[$i] = $SURROGATE_TYPE;
$printable[$i] = 0;
# file line is read. This allows the EOF handler to flush buffers, for
# example. The difference between the two routines is that the lines inserted
# by insert_lines() are subjected to the each_line_handler()s. (So if you
-# called it from such a handler, you would get infinite recursion.) Lines
-# inserted by insert_adjusted_lines() go directly to the main handler without
-# any adjustments. If the post-processing handler calls any of these, there
-# will be no effect. Some error checking for these conditions could be added,
-# but it hasn't been done.
+# called it from such a handler, you would get infinite recursion without some
+# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go
+# directly to the main handler without any adjustments. If the
+# post-processing handler calls any of these, there will be no effect. Some
+# error checking for these conditions could be added, but it hasn't been done.
#
# carp_bad_line() should be called to warn of bad input lines, which clears $_
# to prevent further processing of the line. This routine will output the
my %post_handler;
# Subroutine to call after all the lines of the file are read in and
- # processed. If undef, no such handler is called.
+ # processed. If undef, no such handler is called. Note that this cannot
+ # add lines to be processed; instead use eof_handler
main::set_access('post_handler', \%post_handler, qw{ c });
my %progress_message;
$file{$addr} = main::internal_file_to_platform(shift);
$first_released{$addr} = shift;
+ undef $file{$addr} if $first_released{$addr} gt $v_version;
+
# The rest of the arguments are key => value pairs
# %constructor_fields has been set up earlier to list all possible
# ones. Either set or push, depending on how the default has been set
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $skipped_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr};
}
elsif ($properties{$addr}) {
- # Add a handler for each line in the input so that it creates a
- # separate input line for each property in those input lines, thus
- # making them suitable for process_generic_property_file().
+ # Similarly, there may be more than one property represented on
+ # each line, with no clue but the constructor input what those
+ # might be. Add a handler for each line in the input so that it
+ # creates a separate input line for each property in those input
+ # lines, thus making them suitable to handle generically.
push @{$each_line_handler{$addr}},
sub {
};
}
- { # On non-ascii platforms, we use a special handler
+ { # On non-ascii platforms, we use a special pre-handler
no strict;
no warnings 'once';
*next_line = (main::NON_ASCII_PLATFORM)
# than this Unicode version), and isn't there. This means if someone
# copies it into an earlier version's directory, we will go ahead and
# process it.
- return if $first_released{$addr} gt $v_version && ! -e $file;
+ return if $first_released{$addr} gt $v_version
+ && (! defined $file || ! -e $file);
# If in debugging mode and this file doesn't have the non-skip
# flag set, and isn't one of the critical files, skip it.
if (! $optional{$addr} # File could be optional
&& $v_version ge $first_released{$addr})
{
- print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
+ print STDERR "Skipping processing input file '$file' because not found\n";
}
return;
}
}
else {
-
- # Here, the file exists. Some platforms may change the case of
- # its name
if ($seen_non_extracted_non_age) {
- if ($file =~ /$EXTRACTED/i) {
+ if ($file =~ /$EXTRACTED/i) # Some platforms may change the
+ # case of the file's name
+ {
Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
$seen_non_extracted_non_age = 1;
}
- # And mark the file as having being processed, and warn if it
+            # Mark the file as having been processed, and warn if it
# isn't a file we are expecting. As we process the files,
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
return;
}
- # Open the file, converting the slashes used in this program
- # into the proper form for the OS
+ # Here, we are going to process the file. Open it, converting the
+ # slashes used in this program into the proper form for the OS
my $file_handle;
if (not open $file_handle, "<", $file) {
Carp::my_carp("Can't open $file. Skipping: $!");
$handle{$addr} = $file_handle; # Cache the open file handle
if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
+
+ # UnicodeData.txt has no version marker; the others started
+ # getting it in 3.2. Unihan files have the version somewhere
+ # in the first comment block; the other files have it as the
+ # very first line
if ($file !~ /^Unihan/i) {
$_ = <$file_handle>;
if ($_ !~ / - $string_version \. /x) {
# => $MULTIPLE_BEFORE means that if this range duplicates an
# existing one, but has a different value,
# don't replace the existing one, but insert
- # this, one so that the same range can occur
+ # this one so that the same range can occur
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
# existing range, this one is discarded
# (leaving the existing one in its original,
# higher priority position
+ # => $CROAK Die with an error if is already there
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
# Here, the new range starts just after the current highest in
# the range list, and they have the same type and value.
- # Extend the current range to incorporate the new one.
+ # Extend the existing range to incorporate the new one.
@{$r}[-1]->set_end($end);
}
# In other words,
# r[$i-1]->end < $start <= r[$i]->end
# And:
- # r[$i-1]->end < $start <= $end <= r[$j]->end
+ # r[$i-1]->end < $start <= $end < r[$j+1]->start
#
# Also:
# $clean_insert is a boolean which is set true if and only if
# We now have enough information to decide if this call is a no-op
# or not. It is a no-op if this is an insertion of already
- # existing data.
+ # existing data. To be so, it must be contained entirely in one
+ # range.
if (main::DEBUG && $to_trace && $clean_insert
- && $i == $j
- && $start >= $r->[$i]->start)
+ && $start >= $r->[$i]->start
+ && $end <= $r->[$i]->end)
{
trace "no-op";
}
return if $clean_insert
- && $i == $j # more than one affected range => not no-op
-
- # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
- # Further, $start and/or $end is >= r[$i]->start
- # The test below hence guarantees that
- # r[$i]->start < $start <= $end <= r[$i]->end
- # This means the input range is contained entirely in
- # the one at $i, so is a no-op
- && $start >= $r->[$i]->start;
+ && $start >= $r->[$i]->start
+ && $end <= $r->[$i]->end;
}
# Here, we know that some action will have to be taken. We have
$status = $NORMAL unless defined $status;
# An internal name does not get documented, unless overridden by the
- # input.
+ # input; same for making tests for it.
my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
$insert_position,
0,
Alias->new($name, $loose_match, $make_re_pod_entry,
- $ok_as_filename, $status, $ucd);
+ $ok_as_filename, $status, $ucd);
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
# There are tables which end up only having one element per
# range, but it is not worth keeping track of for making just
# this comment a little better.
- $comment.= <<END;
+ $comment .= <<END;
non-comment portions of the main body of lines of this file is:
START\\tSTOP\\tMAPPING where START is the starting code point of the
range, in hex; STOP is the ending point, or if omitted, the range has just one
main::uniques($leader, @{$equivalents{$addr}});
my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
# tables
-
for my $parent (@parents) {
my $property = $parent->property;
? main::max(scalar @table_aliases,
scalar @property_aliases)
: 0;
- trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
-
+ trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
my $property_had_compound_name = 0;
package main;
- sub display_chr {
- # Converts an ordinal character value to a displayable string, using a
- # NBSP to hold combining characters.
+sub display_chr {
+ # Converts an ordinal printable character value to a displayable string,
+ # using a dotted circle to hold combining characters.
- my $ord = shift;
- my $chr = chr $ord;
- return $chr if $ccc->table(0)->contains($ord);
- return chr(utf8::unicode_to_native(0xA0)) . $chr;
- }
+ my $ord = shift;
+ my $chr = chr $ord;
+ return $chr if $ccc->table(0)->contains($ord);
+ return "\x{25CC}$chr";
+}
sub join_lines($) {
# Returns lines of the input joined together, so that they can be folded
my $object = shift;
my $fields_ref = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $object; };
'AL');
$lb->set_default_map($default);
}
+ }
- # If has the URS property, make sure that the standard aliases are in
- # it, since not in the input tables in some versions.
- my $urs = property_ref('Unicode_Radical_Stroke');
- if (defined $urs) {
- $urs->add_alias('cjkRSUnicode');
- $urs->add_alias('kRSUnicode');
- }
+ # If has the URS property, make sure that the standard aliases are in
+ # it, since not in the input tables in some versions.
+ my $urs = property_ref('Unicode_Radical_Stroke');
+ if (defined $urs) {
+ $urs->add_alias('cjkRSUnicode');
+ $urs->add_alias('kRSUnicode');
}
# For backwards compatibility with applications that may read the mapping
# program generates for this block property value
#$file->insert_lines('blk; n/a; Herited');
-
# Process each line of the file ...
while ($file->next_line) {
# thus shifting the former field 0 to after them.)
splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
+ if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
+ my $new_style = $data[1] =~ s/-/_/gr;
+ splice @data, 1, 0, $new_style;
+ }
+
# Field 0 is a short name unless "n/a"; field 1 is the full name. If
# there is no short name, use the full one in element 1
if ($data[0] eq "n/a") {
$line));
}
- # And process the first range, like any other.
+ # And set things up so that the below will process this first
+ # range, like any other.
$low = $this_range->start;
$high = $this_range->end;
}
$fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
# Rationals require extra effort.
- register_fraction($fields[$NUMERIC])
- if $fields[$NUMERIC] =~ qr{/};
+ if ($fields[$NUMERIC] =~ qr{/}) {
+ reduce_fraction(\$fields[$NUMERIC]);
+ register_fraction($fields[$NUMERIC])
+ }
}
}
return;
}
+sub process_SB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @SB_tests, $_;
+ }
+
+ return;
+}
+
+sub process_WB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @WB_tests, $_;
+ }
+
+ return;
+}
+
sub process_NamedSequences {
# NamedSequences.txt entries are just added to an array. Because these
# don't look like the other tables, they have their own handler.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my @fields = split /\s*;\s*/;
- if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
- $fields[1] = 'I';
- }
- elsif ($fields[1] eq 'L') {
+
+ if ($fields[1] eq 'L') {
$fields[1] = 'C'; # L => C always
}
elsif ($fields[1] eq 'E') {
return;
}
+sub gcd($$) { # Greatest-common-divisor; from
+ # http://en.wikipedia.org/wiki/Euclidean_algorithm
+ my ($a, $b) = @_;
+
+ use integer;
+
+ while ($b != 0) {
+ my $temp = $b;
+ $b = $a % $b;
+ $a = $temp;
+ }
+ return $a;
+}
+
+sub reduce_fraction($) {
+ my $fraction_ref = shift;
+
+ # Reduce a fraction to lowest terms. The Unicode data may be reducible,
+ # hence this is needed. The argument is a reference to the
+    # string denoting the fraction, which must be of the form: [-]digits/digits
+ if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
+ Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged");
+ return;
+ }
+
+ my $sign = $1;
+ my $numerator = $2;
+ my $denominator = $3;
+
+ use integer;
+
+ # Find greatest common divisor
+ my $gcd = gcd($numerator, $denominator);
+
+ # And reduce using the gcd.
+ if ($gcd != 1) {
+ $numerator /= $gcd;
+ $denominator /= $gcd;
+ $$fraction_ref = "$sign$numerator/$denominator";
+ }
+
+ return;
+}
+
sub filter_numeric_value_line {
# DNumValues contains lines of a different syntax than the typical
# property file:
$_ = "";
return;
}
+ reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
$rational = $fields[3];
+
$_ = join '; ', @fields[ 0, 3 ];
}
else {
# Change hyphens and blanks in the block name field only
$fields[1] =~ s/[ -]/_/g;
- $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
+ $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
$_ = join("; ", @fields);
return;
return;
}
+sub filter_all_caps_script_names {
+
+ # Some early Unicode releases had the script names in all CAPS. This
+ # converts them to just the first letter of each word being capital.
+
+ my ($range, $script, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+ my @words = split "_", $script;
+ for my $word (@words) {
+ $word =
+ ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $script = join "_", @words;
+ $_ = join ";", $range, $script, @remainder;
+}
+
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
# Find the non-default table.
for my $table ($property->tables) {
- next if $table == $default_table;
+ if ($table == $default_table) {
+ if ($v_version le v5.0.0) {
+ $table->add_alias($_) for qw(N No F False);
+ }
+ next;
+ } elsif ($v_version le v5.0.0) {
+ $table->add_alias($_) for qw(Y Yes T True);
+ }
$non_default_table = $table;
}
$default_table->set_complement($non_default_table);
$gc->table('Ll')->set_caseless_equivalent($LC);
$gc->table('Lu')->set_caseless_equivalent($LC);
- my $Cs = $gc->table('Cs');
-
# Create digit and case fold tables with the original file names for
# backwards compatibility with applications that read them directly.
my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
return $Nl;
}
+sub calculate_Assigned() { # Calculate the gc != Cn code points; may be
+ # called before the Cn's are completely filled.
+ # Works on Unicodes earlier than ones that
+ # explicitly specify Cn.
+ return if defined $Assigned;
+
+ if (! defined $gc || $gc->is_empty()) {
+ Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
+ }
+
+ $Assigned = $perl->add_match_table('Assigned',
+ Description => "All assigned code points",
+ );
+ while (defined (my $range = $gc->each_range())) {
+ my $standard_value = standardize($range->value);
+ next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
+ $Assigned->add_range($range->start, $range->end);
+ }
+}
+
sub compile_perl() {
# Create perl-defined tables. Almost all are part of the pseudo-property
# named 'perl' internally to this program. Many of these are recommended
}
my $Any = $perl->add_match_table('Any',
- Description => "All Unicode code points: [\\x{0000}-\\x{10FFFF}]",
+ Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
);
- $Any->add_range(0, 0x10FFFF);
+ $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
$Any->add_alias('Unicode');
- # Assigned is the opposite of gc=unassigned
- my $Assigned = $perl->add_match_table('Assigned',
- Description => "All assigned code points",
- Initialize => ~ $gc->table('Unassigned'),
- );
+ calculate_Assigned();
# Our internal-only property should be treated as more than just a
# synonym; grandfather it in to the pod.
# There are quite a few code points in Lower, that aren't in gc=lc,
# and not all are in all releases.
- foreach my $code_point ( utf8::unicode_to_native(0xAA),
- utf8::unicode_to_native(0xBA),
- 0x02B0 .. 0x02B8,
- 0x02C0 .. 0x02C1,
- 0x02E0 .. 0x02E4,
- 0x0345,
- 0x037A,
- 0x1D2C .. 0x1D6A,
- 0x1D78,
- 0x1D9B .. 0x1DBF,
- 0x2071,
- 0x207F,
- 0x2090 .. 0x209C,
- 0x2170 .. 0x217F,
- 0x24D0 .. 0x24E9,
- 0x2C7C .. 0x2C7D,
- 0xA770,
- 0xA7F8 .. 0xA7F9,
- ) {
- # Don't include the code point unless it is assigned in this
- # release
- my $category = $gc->value_of(hex $code_point);
- next if ! defined $category || $category eq 'Cn';
-
- $Lower += $code_point;
- }
+ my $temp = Range_List->new(Initialize => [
+ utf8::unicode_to_native(0xAA),
+ utf8::unicode_to_native(0xBA),
+ 0x02B0 .. 0x02B8,
+ 0x02C0 .. 0x02C1,
+ 0x02E0 .. 0x02E4,
+ 0x0345,
+ 0x037A,
+ 0x1D2C .. 0x1D6A,
+ 0x1D78,
+ 0x1D9B .. 0x1DBF,
+ 0x2071,
+ 0x207F,
+ 0x2090 .. 0x209C,
+ 0x2170 .. 0x217F,
+ 0x24D0 .. 0x24E9,
+ 0x2C7C .. 0x2C7D,
+ 0xA770,
+ 0xA7F8 .. 0xA7F9,
+ ]);
+ $Lower += $temp & $Assigned;
}
my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Cntrl & $ASCII,
);
+ my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
+ if (defined (my $Cs = $gc->table('Cs'))) {
+ $perl_surrogate += $Cs;
+ }
+ else {
+ push @tables_that_may_be_empty, '_Perl_Surrogate';
+ }
+
# $controls is a temporary used to construct Graph.
my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
- + $gc->table('Control'));
- # Cs not in release 1
- $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
+ + $gc->table('Control')
+ + $perl_surrogate);
# Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
# 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
# of the MU issue.
foreach my $range ($loc_problem_folds->ranges) {
- foreach my $code_point($range->start .. $range->end) {
+ foreach my $code_point ($range->start .. $range->end) {
my $fold_range = $cf->containing_range($code_point);
next unless defined $fold_range;
+ # Skip if folds to itself
+ next if $fold_range->value eq $CODE_POINT;
+
my @hex_folds = split " ", $fold_range->value;
- my $start_cp = hex $hex_folds[0];
+ my $start_cp = $hex_folds[0];
+ next if $start_cp eq $CODE_POINT;
+ $start_cp = hex $start_cp;
foreach my $i (0 .. @hex_folds - 1) {
- my $cp = hex $hex_folds[$i];
+ my $cp = $hex_folds[$i];
+ next if $cp eq $CODE_POINT;
+ $cp = hex $cp;
next unless $cp > 255; # Already have the < 256 ones
$loc_problem_folds->add_range($cp, $cp);
Description =>
"Code points whose fold is a string of more than one character",
);
+ if ($v_version lt v3.0.1) {
+ push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
+ }
# Look through all the known folds to populate these tables.
foreach my $range ($cf->ranges) {
+ next if $range->value eq $CODE_POINT;
my $start = $range->start;
my $end = $range->end;
$any_folds->add_range($start, $end);
+ utf8::unicode_to_native(0xA0) # NBSP
);
- # These two tables are for matching \X, which is based on the 'extended'
- # grapheme cluster, which came in 5.1; create empty ones if not already
- # present. The straight 'grapheme cluster' (non-extended) is used prior
- # to 5.1, and differs from the extended (see
- # http://www.unicode.org/reports/tr29/) only by these two tables, so we
- # get the older definition automatically when they are empty.
- my $gcb = property_ref('Grapheme_Cluster_Break');
- my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
- $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
- }
- else {
- push @tables_that_may_be_empty, $perl_prepend->complete_name;
- }
-
- # All the tables with _X_ in their names are used in defining \X handling,
- # and are based on the Unicode GCB property. Basically, \X matches:
- # CR LF
- # | Prepend* Begin Extend*
- # | .
- # Begin is: ( Special_Begin | ! Control )
- # Begin is also: ( Regular_Begin | Special_Begin )
- # where Regular_Begin is defined as ( ! Control - Special_Begin )
- # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- # Extend is: ( Grapheme_Extend | Spacing_Mark )
- # Control is: [ GCB_Control | CR | LF ]
- # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- foreach my $gcb_name (qw{ L V T LV LVT }) {
-
- # The perl internal extension's name is the gcb table name prepended
- # with an '_X_'
- my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table($gcb_name),
- );
- # Version 1 had mostly different Hangul syllables that were removed
- # from later versions, so some of the tables may not apply.
- if ($v_version lt v2.0) {
- push @tables_that_may_be_empty, $perl_table->complete_name;
- }
- }
-
- # More GCB. Populate a combined hangul syllables table
- my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
-
- my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if ($v_version ge v6.2) {
- $ri += $gcb->table('RI');
- }
- else {
- push @tables_that_may_be_empty, $ri->full_name;
- }
-
- my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $lv_lvt_v
- + $gcb->table('L')
- + $gcb->table('T')
- + $ri
- );
- $specials_begin->add_comment(join_lines( <<END
-For use in \\X; matches first (perhaps only) character of potential
-multi-character sequences that can begin an extended grapheme cluster. They
-need special handling because of their complicated nature.
-END
- ));
- my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => ~ $gcb->table('Control')
- - $specials_begin
- - $gcb->table('CR')
- - $gcb->table('LF')
- );
- $regular_begin->add_comment(join_lines( <<END
-For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster, except those that require special handling.
-END
- ));
-
- my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table('Extend')
- );
- if (defined (my $sm = $gcb->table('SpacingMark'))) {
- $extend += $sm;
- }
- $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
-
- # End of GCB \X processing
-
my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
if (@named_sequences) {
END
# Now add the Unicode_1 names for the controls. The Unicode_1 names had
- # precedence before 6.1, so should be first in the file; the other names
- # have precedence starting in 6.1,
+ # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
+ # so should be first in the file; the other names have precedence starting
+# in 6.1.
my $before_or_after = ($v_version lt v6.1.0)
? $MULTIPLE_BEFORE
: $MULTIPLE_AFTER;
$unassigned->set_equivalent_to($age_default, Related => 1);
}
+ my $patws = $perl->add_match_table('_Perl_PatWS',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
+ $patws->initialize($off_patws->table('Y'));
+ }
+ else {
+ $patws->initialize([ ord("\t"),
+ ord("\n"),
+ utf8::unicode_to_native(0x0B), # VT
+ ord("\f"),
+ ord("\r"),
+ ord(" "),
+ utf8::unicode_to_native(0x85), # NEL
+                             0x200E..0x200F,    # LTR, RTL marks
+ 0x2028..0x2029 # Line, Paragraph seps
+ ] );
+ }
+
# See L<perlfunc/quotemeta>
my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
Perl_Extension => 1,
# Initialize to what's common in
# all Unicode releases.
Initialize =>
- $Space
- + $gc->table('Control')
+ $gc->table('Control')
+ + $Space
+ + $patws
+ + ((~ $Word) & $ASCII)
);
- # In early releases without the proper Unicode properties, just set to \W.
- if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
- || ! defined (my $patws = property_ref('Pattern_White_Space'))
- || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
- {
- $quotemeta += ~ $Word;
+ if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
+ $quotemeta += $patsyn->table('Y');
+ }
+ else {
+ $quotemeta += ((~ $Word) & Range->new(0, 255))
+ - utf8::unicode_to_native(0xA8)
+ - utf8::unicode_to_native(0xAF)
+ - utf8::unicode_to_native(0xB2)
+ - utf8::unicode_to_native(0xB3)
+ - utf8::unicode_to_native(0xB4)
+ - utf8::unicode_to_native(0xB7)
+ - utf8::unicode_to_native(0xB8)
+ - utf8::unicode_to_native(0xB9)
+ - utf8::unicode_to_native(0xBC)
+ - utf8::unicode_to_native(0xBD)
+ - utf8::unicode_to_native(0xBE);
+ $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
+ # same in all releases
+ 0x2010 .. 0x2027,
+ 0x2030 .. 0x203E,
+ 0x2041 .. 0x2053,
+ 0x2055 .. 0x205E,
+ 0x2190 .. 0x245F,
+ 0x2500 .. 0x2775,
+ 0x2794 .. 0x2BFF,
+ 0x2E00 .. 0x2E7F,
+ 0x3001 .. 0x3003,
+ 0x3008 .. 0x3020,
+ 0x3030 .. 0x3030,
+ 0xFD3E .. 0xFD3F,
+ 0xFE45 .. 0xFE46
+ ];
+ }
+
+ if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
+ $quotemeta += $di->table('Y')
}
else {
- $quotemeta += $patsyn->table('Y')
- + $patws->table('Y')
- + $di->table('Y')
- + ((~ $Word) & $ASCII);
+ if ($v_version ge v2.0) {
+ $quotemeta += $gc->table('Cf')
+ + $gc->table('Cs');
+ }
+ $quotemeta += $gc->table('Cc')
+ - $Space;
+ my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
+ 0x2060 .. 0x206F,
+ 0xFE00 .. 0xFE0F,
+ 0xFFF0 .. 0xFFFB,
+ 0xE0000 .. 0xE0FFF,
+ ]);
+ $quotemeta += $temp & $Assigned;
+ }
+
+ my $nchar = $perl->add_match_table('_Perl_Nchar',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if (defined (my $off_nchar = property_ref('Nchar'))) {
+ $nchar->initialize($off_nchar->table('Y'));
+ }
+ else {
+ $nchar->initialize([ 0xFFFE .. 0xFFFF ]);
+ if ($v_version ge v2.0) { # First release with these nchars
+ for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
+ $nchar += [ $i .. $i+1 ];
+ }
+ }
}
# Finished creating all the perl properties. All non-internal non-string
my $file = shift; # The file name in the final directory.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
+ trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
if ($table->isa('Property')) {
$table->set_file_path(@$directory_ref, $file);
# property's map table
foreach my $alias ($table->aliases) {
my $name = $alias->name;
- $loose_property_to_file_of{standardize($name)} = $file;
+ if ($name =~ /^_/) {
+ $strict_property_to_file_of{lc $name} = $file;
+ }
+ else {
+ $loose_property_to_file_of{standardize($name)} = $file;
+ }
}
# And a way for utf8_heavy to find the proper key in the SwashInfo
# The first few character columns are filler, plus the '\p{'; and get rid
# of all the trailing stuff, starting with the trailing '}', so as to sort
# on just 'Name=Value'
- (my $a = lc $a) =~ s/^ .*? { //x;
+ (my $a = lc $a) =~ s/^ .*? \{ //x;
$a =~ s/}.*//;
- (my $b = lc $b) =~ s/^ .*? { //x;
+ (my $b = lc $b) =~ s/^ .*? \{ //x;
$b =~ s/}.*//;
# Determine if the two operands are both internal only or both not.
C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
(C<\%Config> is available from the Config module).
+Also, perl can be recompiled to operate on an earlier version of the Unicode
+standard. Further information is at
+C<\$Config{privlib}>/F<unicore/README.perl>.
+
=head1 Other information in the Unicode data base
The Unicode data base is delivered in two different formats. The XML version
= simple_dumper(\%loose_property_name_of, ' ' x 4);
chomp $loose_property_name_of;
+ my $strict_property_name_of
+ = simple_dumper(\%strict_property_name_of, ' ' x 4);
+ chomp $strict_property_name_of;
+
my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
chomp $stricter_to_file_of;
= simple_dumper(\%loose_property_to_file_of, ' ' x 4);
chomp $loose_property_to_file_of;
+ my $strict_property_to_file_of
+ = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
+ chomp $strict_property_to_file_of;
+
my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
chomp $file_to_swash_name;
$loose_property_name_of
);
+# Same, but strict names
+\%utf8::strict_property_name_of = (
+$strict_property_name_of
+);
+
# Gives the definitions (in the form of inversion lists) for those properties
# whose definitions aren't kept in files
\@utf8::inline_definitions = (
$loose_property_to_file_of
);
+# Same, but strict property names to mapping files
+\%utf8::strict_property_to_file_of = (
+$strict_property_to_file_of
+);
+
# Files to the swash names within them.
\%utf8::file_to_swash_name = (
$file_to_swash_name
# an element for the Hangul syllables in the appropriate place, and
# otherwise changes the name to include the "-<code point>" suffix.
my @algorithm_names;
- my $done_hangul = 0;
-
+ my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came
+ # along in this version
# Copy it linearly.
for my $i (0 .. @code_points_ending_in_code_point - 1) {
}
}
else {
- if (exists ($loose_property_name_of{$alias_standard}))
- {
- Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
+ my $hash_ref = ($alias_standard =~ /^_/)
+ ? \%strict_property_name_of
+ : \%loose_property_name_of;
+ if (exists $hash_ref->{$alias_standard}) {
+ Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained");
}
else {
- $loose_property_name_of{$alias_standard}
+ $hash_ref->{$alias_standard}
= $standard_property_name;
}
my @output;
# Create a complete set of tests, with complements.
if (defined $valid_code) {
- push @output, <<"EOC"
+ push @output, <<"EOC"
Expect(1, $valid_code, '\\p{$name}', $warning);
Expect(0, $valid_code, '\\p{^$name}', $warning);
Expect(0, $valid_code, '\\P{$name}', $warning);
EOC
}
if (defined $invalid_code) {
- push @output, <<"EOC"
+ push @output, <<"EOC"
Expect(0, $invalid_code, '\\p{$name}', $warning);
Expect(1, $invalid_code, '\\p{^$name}', $warning);
Expect(1, $invalid_code, '\\P{$name}', $warning);
<DATA>,
@output,
(map {"Test_GCB('$_');\n"} @backslash_X_tests),
+ (map {"Test_SB('$_');\n"} @SB_tests),
+ (map {"Test_WB('$_');\n"} @WB_tests),
"Finished();\n"
]);
my @input_file_objects = (
Input_file->new('PropertyAliases.txt', v0,
Handler => \&process_PropertyAliases,
- ),
+ ),
Input_file->new(undef, v0, # No file associated with this
Progress_Message => 'Finishing property setup',
Handler => \&finish_property_setup,
- ),
+ ),
Input_file->new('PropValueAliases.txt', v0,
Handler => \&process_PropValueAliases,
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('DAge.txt', v3.2.0,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Age'
- ),
+ ),
Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
Property => 'General_Category',
- ),
+ ),
Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
Property => 'Canonical_Combining_Class',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
Property => 'Numeric_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
Property => 'East_Asian_Width',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
Property => 'Line_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
Property => 'Bidi_Class',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
Property => 'Decomposition_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
Property => 'Numeric_Value',
Each_Line_Handler => \&filter_numeric_value_line,
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
Property => 'Joining_Group',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
Property => 'Joining_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('Jamo.txt', v2.0.0,
Property => 'Jamo_Short_Name',
Each_Line_Handler => \&filter_jamo_line,
- ),
+ ),
Input_file->new('UnicodeData.txt', v1.1.5,
Pre_Handler => \&setup_UnicodeData,
\&filter_UnicodeData_line,
],
EOF_Handler => \&EOF_UnicodeData,
- ),
+ ),
Input_file->new('ArabicShaping.txt', v2.0.0,
Each_Line_Handler =>
($v_version lt 4.1.0)
# not used by Perl
Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('Blocks.txt', v2.0.0,
Property => 'Block',
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => \&filter_blocks_lines
- ),
+ ),
Input_file->new('PropList.txt', v2.0.0,
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_old_style_proplist
: undef),
- ),
+ ),
Input_file->new('Unihan.txt', v2.0.0,
Pre_Handler => \&setup_unihan,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('SpecialCasing.txt', v2.1.8,
Each_Line_Handler => ($v_version eq 2.1.8)
? \&filter_2_1_8_special_casing_line
: \&filter_special_casing_line,
Pre_Handler => \&setup_special_casing,
Has_Missings_Defaults => $IGNORED,
- ),
+ ),
Input_file->new(
'LineBreak.txt', v3.0.0,
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_early_ea_lb
: undef),
- ),
+ ),
Input_file->new('EastAsianWidth.txt', v3.0.0,
Property => 'East_Asian_Width',
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_early_ea_lb
: undef),
- ),
+ ),
Input_file->new('CompositionExclusions.txt', v3.0.0,
Property => 'Composition_Exclusion',
- ),
+ ),
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
Has_Missings_Defaults => ($v_version lt v6.2.0)
# anything to us, we will use the
# null string
: $IGNORED,
-
- ),
+ ),
Input_file->new("NormTest.txt", v3.0.0,
Handler => \&process_NormalizationsTest,
Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
- ),
+ ),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Each_Line_Handler =>
\&filter_case_folding_line
],
Has_Missings_Defaults => $IGNORED,
- ),
+ ),
Input_file->new('DCoreProperties.txt', v3.1.0,
# 5.2 changed this file
Has_Missings_Defaults => (($v_version ge v5.2.0)
? $NOT_IGNORED
: $NO_DEFAULTS),
- ),
+ ),
Input_file->new('Scripts.txt', v3.1.0,
Property => 'Script',
+ Each_Line_Handler => (($v_version le v4.0.0)
+ ? \&filter_all_caps_script_names
+ : undef),
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('DNormalizationProps.txt', v3.1.0,
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => (($v_version lt v4.0.1)
? \&filter_old_style_normalization_lines
: undef),
- ),
+ ),
Input_file->new('HangulSyllableType.txt', v0,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Hangul_Syllable_Type',
Pre_Handler => ($v_version lt v4.0.0)
? \&generate_hst
: undef,
- ),
+ ),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
Property => 'Grapheme_Cluster_Break',
Has_Missings_Defaults => $NOT_IGNORED,
Pre_Handler => ($v_version lt v4.1.0)
? \&generate_GCB
: undef,
- ),
+ ),
Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
Handler => \&process_GCB_test,
- ),
+ ),
Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
Skip => 'Validation Tests',
- ),
+ ),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
- ),
+ Handler => \&process_SB_test,
+ ),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
- ),
+ Handler => \&process_WB_test,
+ ),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('NamedSequences.txt', v4.1.0,
Handler => \&process_NamedSequences
- ),
+ ),
Input_file->new('NameAliases.txt', v0,
Property => 'Name_Alias',
Pre_Handler => ($v_version le v6.0.0)
Each_Line_Handler => ($v_version le v6.0.0)
? \&filter_early_version_name_alias_line
: \&filter_later_version_name_alias_line,
- ),
+ ),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => 'Validation Tests',
- ),
+ ),
Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanIRGSources.txt', v5.2.0,
Optional => 1,
Pre_Handler => \&setup_unihan,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanNumericValues.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanOtherMappings.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanReadings.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanVariants.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('ScriptExtensions.txt', v6.0.0,
Property => 'Script_Extensions',
Pre_Handler => \&setup_script_extensions,
Has_Missings_Defaults => (($v_version le v6.0.0)
? $NO_DEFAULTS
: $IGNORED),
- ),
- # The two Indic files are actually available starting in v6.0.0, but their
- # property values are missing from PropValueAliases.txt in that release,
- # so that further work would have to be done to get them to work properly
- # for that release.
- Input_file->new('IndicMatraCategory.txt', v6.1.0,
+ ),
+ # These two Indic files are actually not usable as-is until 6.1.0,
+ # because their property values are missing from PropValueAliases.txt
+ # until that release, so that further work would have to be done to get
+    # them to work properly, which isn't worth it because they are
+    # provisional.
+ Input_file->new('IndicMatraCategory.txt', v6.0.0,
Property => 'Indic_Matra_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Provisional; for the analysis and processing of Indic scripts",
- ),
- Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
+ Skip => "Withdrawn by Unicode while still provisional",
+ ),
+ Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
Property => 'Indic_Syllabic_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Provisional; for the analysis and processing of Indic scripts",
- ),
+ Skip => (($v_version lt v8.0.0)
+ ? "Provisional; for the analysis and processing of Indic scripts"
+ : 0),
+ ),
Input_file->new('BidiBrackets.txt', v6.3.0,
- Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
+ Properties => [ 'Bidi_Paired_Bracket',
+ 'Bidi_Paired_Bracket_Type'
+ ],
Has_Missings_Defaults => $NO_DEFAULTS,
- ),
+ ),
Input_file->new("BidiCharacterTest.txt", v6.3.0,
Skip => 'Validation Tests',
- ),
+ ),
+ Input_file->new('IndicPositionalCategory.txt', v8.0.0,
+ Property => 'Indic_Positional_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
);
# End of all the preliminaries.
my $Tests = 0;
my $Fails = 0;
+# loc_tools.pl requires this function to be defined
+sub ok($@) {
+ my ($pass, @msg) = @_;
+ print "not " unless $pass;
+ print "ok ";
+ print ++$Tests;
+ print " - ", join "", @msg if @msg;
+ print "\n";
+}
+
sub Expect($$$$) {
my $expected = shift;
my $ord = shift;
_test_break(shift, 'gcb');
}
+sub Test_SB($) {
+ _test_break(shift, 'sb');
+}
+
+sub Test_WB($) {
+ _test_break(shift, 'wb');
+}
+
sub Finished() {
print "1..$Tests\n";
exit($Fails ? -1 : 0);