mktables: factor out sub that duplicates utf8_heavy

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 8a5c89a..9b2d25a 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -17,7 +17,7 @@
  # changed 0+$self to pack 'J', $self.)
  
  my $start_time;
-BEGIN { # Get the time the script started running; do it at compiliation to
+BEGIN { # Get the time the script started running; do it at compilation to
          # get it as close as possible
      $start_time= time;
  }
@@ -27,12 +27,14 @@ require 5.010_001;
  use strict;
  use warnings;
  use Carp;
+use Config;
  use File::Find;
  use File::Path;
  use File::Spec;
  use Text::Tabs;
  
  sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
+my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
  
  ##########################################################################
  #
@@ -50,7 +52,7 @@ sub DEBUG () { 0 }  # Set to 0 for production; 1 for development
  #   the small actual loop to process the input files and finish up; then
  #   a __DATA__ section, for the .t tests
  #
-# This program works on all releases of Unicode through at least 5.2.  The
+# This program works on all releases of Unicode through at least 6.0.  The
  # outputs have been scrutinized most intently for release 5.1.  The others
  # have been checked for somewhat more than just sanity.  It can handle all
  # existing Unicode character properties in those releases.
@@ -162,7 +164,10 @@ my $map_directory = 'To';        # Where map files go.
  # out.  But all the ones which can be used in regular expression \p{} and \P{}
  # constructs will.  Generally a property will have either its map table or its
  # match tables written but not both.  Again, what gets written is controlled
-# by lists which can easily be changed.
+# by lists which can easily be changed.  Properties have a 'Type', like
+# binary, or string, or enum depending on how many match tables there are and
+# the content of the maps.  This 'Type' is different than a range 'Type', so
+# don't get confused by the two concepts having the same name.
  #
  # For information about the Unicode properties, see Unicode's UAX44 document:
  
@@ -183,11 +188,11 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  # More information on Unicode version glitches is further down in these
  # introductory comments.
  #
-# This program works on all properties as of 5.2, though the files for some
-# are suppressed from apparent lack of demand for them.  You can change which
-# are output by changing lists in this program.
+# This program works on all non-provisional properties as of 6.0, though the
+# files for some are suppressed from apparent lack of demand for them.  You
+# can change which are output by changing lists in this program.
  #
-# The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
+# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
  # loose matchings rules (from Unicode TR18):
  #
  #    The recommended names for UCD properties and property values are in
@@ -418,7 +423,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
  # is present in the directory, a table will be generated for that property.
  # In 5.2, several more properties were added.  For your convenience, the two
-# arrays are initialized with all the 5.2 listed properties that are also in
+# arrays are initialized with all the 6.0 listed properties that are also in
  # earlier releases.  But these are commented out.  You can just uncomment the
  # ones you want, or use them as a template for adding entries for other
  # properties.
@@ -473,7 +478,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  #
  # Here are some observations about some of the issues in early versions:
  #
-# The number of code points in \p{alpha} halve in 2.1.9.  It turns out that
+# The number of code points in \p{alpha} halved in 2.1.9.  It turns out that
  # the reason is that the CJK block starting at 4E00 was removed from PropList,
  # and was not put back in until 3.1.0
  #
@@ -633,8 +638,9 @@ my $make_list = 1;             # ? Should we write $file_list.  Set to always
                                 # special things
  my $glob_list = 0;             # ? Should we try to include unknown .txt files
                                 # in the input.
-my $output_range_counts = 1;   # ? Should we include the number of code points
-                               # in ranges in the output
+my $output_range_counts = $debugging_build;   # ? Should we include the number
+                                              # of code points in ranges in
+                                              # the output
  my $annotate = 0;              # ? Should character names be in the output
  
  # Verbosity levels; 0 is quiet
@@ -694,6 +700,8 @@ while (@ARGV) {
      }
      elsif ($arg eq '-annotate') {
          $annotate = 1;
+        $debugging_build = 1;
+        $output_range_counts = 1;
      }
      else {
          my $with_c = 'with';
@@ -805,7 +813,7 @@ if ($v_version gt v3.2.0) {
                                  'Canonical_Combining_Class=Attached_Below_Left'
  }
  
-# These are listed in the Property aliases file in 5.2, but Unihan is ignored
+# These are listed in the Property aliases file in 6.0, but Unihan is ignored
  # unless explicitly added.
  if ($v_version ge v5.2.0) {
      my $unihan = 'Unihan; remove from list if using Unihan';
@@ -848,10 +856,10 @@ my %why_obsolete;    # Documentation only
  
      my $other_properties = 'other properties';
      my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
-    my $why_no_expand  = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
+    my $why_no_expand  = "Deprecated by Unicode: less useful than UTF-specific calculations";  # Shared reason for the Expands_On_* entries
  
      %why_deprecated = (
-        'Grapheme_Link' => 'Deprecated by Unicode.  Use ccc=vr (Canonical_Combining_Class=Virama) instead',
+        'Grapheme_Link' => 'Deprecated by Unicode:  Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
          'Jamo_Short_Name' => $contributory,
          'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
          'Other_Alphabetic' => $contributory,
@@ -865,7 +873,7 @@ my %why_obsolete;    # Documentation only
      );
  
      %why_suppressed = (
-        # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
+        # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
          # contains the same information, but without the algorithmically
          # determinable Hangul syllables'.  This file is not published, so it's
          # existence is not noted in the comment.
@@ -882,10 +890,7 @@ my %why_obsolete;    # Documentation only
          'Name' => "Accessible via 'use charnames;'",
          'Name_Alias' => "Accessible via 'use charnames;'",
  
-        # These are sort of jumping the gun; deprecation is proposed for
-        # Unicode version 6.0, but they have never been exposed by Perl, and
-        # likely are soon to be deprecated, so best not to expose them.
-        FC_NFKC_Closure => 'Use NFKC_Casefold instead',
+        FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
          Expands_On_NFC => $why_no_expand,
          Expands_On_NFD => $why_no_expand,
          Expands_On_NFKC => $why_no_expand,
@@ -907,9 +912,15 @@ my %why_obsolete;    # Documentation only
  
  if ($v_version ge 4.0.0) {
      $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
+    if ($v_version ge 6.0.0) {
+        $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
+    }
  }
-if ($v_version ge 5.2.0) {
+if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
      $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
+}
+elsif ($v_version ge 6.0.0) {  # Must be a sibling branch; nested above it could never be true
+    $why_deprecated{'ISO_Comment'} = 'No longer needed for chart generation; otherwise not useful, and code points for it have been removed';
  }
  
  # Probably obsolete forever
@@ -928,7 +939,7 @@ END
  
  # If you are using the Unihan database, you need to add the properties that
  # you want to extract from it to this table.  For your convenience, the
-# properties in the 5.2 PropertyAliases.txt file are listed, commented out
+# properties in the 6.0 PropertyAliases.txt file are listed, commented out
  my @cjk_properties = split "\n", <<'END';
  #cjkAccountingNumeric; kAccountingNumeric
  #cjkOtherNumeric; kOtherNumeric
@@ -947,7 +958,7 @@ my @cjk_properties = split "\n", <<'END';
  END
  
  # Similarly for the property values.  For your convenience, the lines in the
-# 5.2 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
+# 6.0 PropertyAliases.txt file are listed.  Just remove the first BUT NOT both
  # '#' marks
  my @cjk_property_values = split "\n", <<'END';
  ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
@@ -1030,6 +1041,10 @@ my %ignored_files = (
      'ReadMe.txt' => 'Just comments',
      'README.TXT' => 'Just comments',
      'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property.  Does not fit into current scheme where one code point is mapped',
+    'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications',
+    'IndicMatraCategory.txt' => 'Provisional',
+    'IndicSyllabicCategory.txt' => 'Provisional',
+    'ScriptExtensions.txt' => 'Provisional',
  );
  
  ### End of externally interesting definitions, except for @input_file_objects
@@ -1121,6 +1136,7 @@ my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
  my $UNCONDITIONALLY = 2;   # Replace without conditions.
  my $MULTIPLE = 4;          # Don't replace, but add a duplicate record if
                             # already there
+my $CROAK = 5;             # Die with an error if it is already there
  
  # Flags to give property statuses.  The phrases are to remind maintainers that
  # if the flag is changed, the indefinite article referring to it in the
@@ -1280,7 +1296,7 @@ my @printable;          # boolean: And are those characters printable?
  my @annotate_char_type; # Contains a type of those characters, specifically
                          # for the purposes of annotation.
  my $annotate_ranges;    # A map of ranges of code points that have the same
-                        # name for the purposes of annoation.  They map to the
+                        # name for the purposes of annotation.  They map to the
                          # upper edge of the range, so that the end point can
                          # be immediately found.  This is used to skip ahead to
                          # the end of a range, and avoid processing each
@@ -1367,10 +1383,11 @@ sub populate_char_info ($) {
              $end = min($block->containing_range($i)->end,
                         $unassigned_sans_noncharacters-> containing_range($i)->
                                                                           end);
-        } else {
-            my_carp_bug("Can't figure out how to annotate"
-                        . sprintf("U+%04X", $i)
-                        . "Proceeding anyway.");
+        }
+        else {
+            Carp::my_carp_bug("Can't figure out how to annotate "
+                              . sprintf("U+%04X", $i)
+                              . ".  Proceeding anyway.");
              $viacode[$i] = 'UNKNOWN';
              $annotate_char_type[$i] = $UNKNOWN_TYPE;
              $printable[$i] = 0;
@@ -3507,6 +3524,13 @@ sub trace { return main::trace(@_); }
              if ($clean_insert) {
                  if ($r->[$j]->standard_form ne $standard_form) {
                      $clean_insert = 0;
+                    if ($replace == $CROAK) {
+                        main::croak("The range to add "
+                        . sprintf("%04X", $start)
+                        . '-'
+                        . sprintf("%04X", $end)
+                        . " with value '$value' overlaps an existing range $r->[$j]");
+                    }
                  }
                  else {
  
@@ -3677,7 +3701,7 @@ sub trace { return main::trace(@_); }
              $extends_above = ($j+1 < $range_list_size
                              && $r->[$j+1]->start == $end +1
                              && $r->[$j+1]->standard_form eq $standard_form
-                            && $r->[$j-1]->type == $type);
+                            && $r->[$j+1]->type == $type);
          }
          if ($extends_below && $extends_above) { # Adds to both
              $splice_start--;     # start replace at element below
@@ -3799,7 +3823,7 @@ sub trace { return main::trace(@_); }
              trace "i  =[", $i, "]", $r->[$i];
              trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
              trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
-            trace "removed @return";
+            trace "removed ", @return if @return;
          }
  
          # An actual deletion could have changed the maximum in the list.
@@ -4198,8 +4222,6 @@ sub trace { return main::trace(@_); }
          # the character very frequently used.
          return $try_hard if $code == 0x0000;
  
-        return 0 if $try_hard;  # XXX Temporary until fix utf8.c
-
          # shun non-character code points.
          return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
          return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
@@ -4475,7 +4497,7 @@ sub trace { return main::trace(@_); }
              # not, is normal.  The lists are prioritized so the most serious
              # ones are checked first
              if (exists $why_suppressed{$complete_name}
-                # Don't suppress if overriden
+                # Don't suppress if overridden
                  && ! grep { $_ eq $complete_name{$addr} }
                                                      @output_mapped_properties)
              {
@@ -4793,6 +4815,8 @@ sub trace { return main::trace(@_); }
  
      sub add_comment { # Adds the parameter as a comment.
  
+        return unless $debugging_build;
+
          my $self = shift;
          my $comment = shift;
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
@@ -4952,7 +4976,8 @@ sub trace { return main::trace(@_); }
                  # If there is a range and doesn't need a single point range
                  # output
                  if ($start != $end && ! $range_size_1) {
-                    push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
+                    push @OUT, sprintf "%04X\t%04X", $start, $end;
+                    $OUT[-1] .= "\t$value" if $value ne "";
  
                      # Add a comment with the size of the range, if requested.
                      # Expand Tabs to make sure they all start in the same
@@ -5560,6 +5585,8 @@ sub trace { return main::trace(@_); }
          # Just before output, create the comment that heads the file
          # containing this table.
  
+        return unless $debugging_build;
+
          my $self = shift;
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
  
@@ -5794,7 +5821,7 @@ END
  
                  # The pack() below can't cope with surrogates.
                  if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
-                    Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self.  No map created");
+                    Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self.  No map created");
                      next;
                  }
  
@@ -6534,7 +6561,7 @@ sub trace { return main::trace(@_); }
          # not quite so many.
          # If they are related, one must be a perl extension.  This is because
          # we can't guarantee that Unicode won't change one or the other in a
-        # later release even if they are idential now.
+        # later release even if they are identical now.
  
          my $self = shift;
          my $other = shift;
@@ -6643,6 +6670,8 @@ sub trace { return main::trace(@_); }
          # ones that share the same file.  It lists all such tables, ordered so
          # that related ones are together.
  
+        return unless $debugging_build;
+
          my $leader = shift;   # Should only be called on the leader table of
                                # an equivalent group
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
@@ -7039,7 +7068,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
          # each of them is stored in %alias_to_property_of as they are defined.
          # But it's possible that this subroutine will be called with some
          # variant, so if the initial lookup fails, it is repeated with the
-        # standarized form of the input name.  If found, besides returning the
+        # standardized form of the input name.  If found, besides returning the
          # result, the input name is added to the list so future calls won't
          # have to do the conversion again.
  
@@ -7193,7 +7222,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
                          . " argument to '-='.  Subtraction ignored.");
              return $self;
          }
-        elsif ($reversed) {   # Shouldnt happen in a -=, but just in case
+        elsif ($reversed) {   # Shouldn't happen in a -=, but just in case
              Carp::my_carp_bug("Can't cope with a "
              .  __PACKAGE__
              . " being the first parameter in a '-='.  Subtraction ignored.");
@@ -7616,7 +7645,7 @@ sub join_lines($) {
      # A blank separates the joined lines except if there is a break; an extra
      # blank is inserted after a period ending a line.
  
-    # Intialize the return with the first line.
+    # Initialize the return with the first line.
      my ($return, @lines) = split "\n", shift;
  
      # If the first line is null, it was an empty line, add the \n back in
@@ -7918,7 +7947,7 @@ sub Standardize($) {
      $name =~ s/^\s+//g;
      $name =~ s/\s+$//g;
  
-    # Convert interior white space and hypens into underscores.
+    # Convert interior white space and hyphens into underscores.
      $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
  
      # Capitalize the letter following an underscore, and convert a sequence of
@@ -7946,6 +7975,30 @@ sub standardize ($) {
      return lc $name;
  }
  
+sub utf8_heavy_name ($$) {
+    # Returns the name that utf8_heavy.pl will use to find a table, given
+    # the table and one of its aliases.  XXX perhaps this function should be
+    # placed somewhere shared, like Heavy.pl, so that utf8_heavy can use it
+    # directly instead of duplicating code that can get out of sync.
+
+    my $table = shift;
+    my $alias = shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    my $property = $table->property;
+    $property = ($property == $perl)
+                ? ""                # 'perl' is never explicitly stated
+                : standardize($property->name) . '=';
+    if ($alias->loose_match) {  # Loosely matched names are fully standardized
+        return $property . standardize($alias->name);
+    }
+    else {                      # Strictly matched names are only lowercased
+        return lc ($property . $alias->name);
+    }
+
+    return;     # Not reached: both branches above return
+}
+
  {   # Closure
  
      my $indent_increment = " " x 2;
@@ -8229,7 +8282,7 @@ sub finish_property_setup {
          }
      }
  
-    # This entry is still missing as of 5.2, perhaps because no short name for
+    # This entry is still missing as of 6.0, perhaps because no short name for
      # it.
      if (-e 'NameAliases.txt') {
          my $aliases = property_ref('Name_Alias');
@@ -8261,9 +8314,9 @@ sub finish_property_setup {
      my $fold = property_ref('Case_Folding');
      $fold->set_file('Fold') if defined $fold;
  
-    # utf8.c can't currently cope with non range-size-1 for these, and even if
-    # it were changed to do so, someone else may be using them, expecting the
-    # old style
+    # utf8.c has a different meaning for non range-size-1 for map properties
+    # that this program doesn't currently handle; and even if it were changed
+    # to do so, some other code may be using them expecting range size 1.
      foreach my $property (qw {
                                  Case_Folding
                                  Lowercase_Mapping
@@ -8985,7 +9038,7 @@ sub output_perl_charnames_line ($$) {
          #
          # meaning the codepoints in the range all have the value 'map' under
          # 'property'.
-        # Beginning and trailing white space in each field are not signficant.
+        # Beginning and trailing white space in each field are not significant.
          # Note there is not a trailing semi-colon in the above.  A trailing
          # semi-colon means the map is a null-string.  An omitted map, as
          # opposed to a null-string, is assumed to be 'Y', based on Unicode
@@ -9005,8 +9058,8 @@ sub output_perl_charnames_line ($$) {
          # file, in any order, interspersed in any way.  The first time a
          # property is seen, it gets information about that property and
          # caches it for quick retrieval later.  It also normalizes the maps
-        # so that only one of many synonym is stored.  The Unicode input files
-        # do use some multiple synonyms.
+        # so that only one of many synonyms is stored.  The Unicode input
+        # files do use some multiple synonyms.
  
          my $file = shift;
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
@@ -9315,19 +9368,17 @@ END
  
                  # If the map begins with a special command to us (enclosed in
                  # delimiters), extract the command(s).
-                if (substr($map, 0, 1) eq $CMD_DELIM) {
-                    while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
-                        my $command = $1;
-                        if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
-                            $replace = $1;
-                        }
-                        elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
-                            $map_type = $1;
-                        }
-                        else {
-                           $file->carp_bad_line("Unknown command line: '$1'");
-                           next LINE;
-                        }
+                while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
+                    my $command = $1;
+                    if ($command =~  / ^ $REPLACE_CMD= (.*) /x) {
+                        $replace = $1;
+                    }
+                    elsif ($command =~  / ^ $MAP_TYPE_CMD= (.*) /x) {
+                        $map_type = $1;
+                    }
+                    else {
+                        $file->carp_bad_line("Unknown command line: '$1'");
+                        next LINE;
                      }
                  }
              }
@@ -9519,7 +9570,7 @@ END
      # the code point and name on each line.  This was actually the hardest
      # thing to design around.  The code points in those ranges may actually
      # have real maps not given by these two lines.  These maps will either
-    # be algorthimically determinable, or in the extracted files furnished
+    # be algorithmically determinable, or in the extracted files furnished
      # with the UCD.  In the event of conflicts between these extracted files,
      # and this one, Unicode says that this one prevails.  But it shouldn't
      # prevail for conflicts that occur in these ranges.  The data from the
@@ -10138,6 +10189,32 @@ END
          }
          return;
      }
+
+    sub filter_v6_ucd {
+
+        # Unicode 6.0 co-opted the name BELL for U+1F514, so pretend U+0007
+        # is named ALERT, and, for Perl 5.14 (before 5.15), give U+1F514 no
+        # name, so that the old usage can be deprecated for one cycle.  Also
+        # sets the $BIDI field of U+070F to AL, per Unicode Corrigendum #8.
+
+        return if $_ !~ /^(?:0007|1F514|070F);/;    # Other lines are unchanged
+
+        my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
+        if ($code_point eq '0007') {
+            $fields[$CHARNAME] = "ALERT";
+        }
+        elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
+                            # http://www.unicode.org/versions/corrigendum8.html
+            $fields[$BIDI] = "AL";
+        }
+        elsif ($^V lt v5.15.0) { # For 5.16 will convert to use Unicode's name
+            $fields[$CHARNAME] = "";
+        }
+
+        $_ = join ';', $code_point, @fields;    # Rebuild the line in place
+
+        return;
+    }
  } # End closure for UnicodeData
  
  sub process_GCB_test {
@@ -10308,7 +10385,7 @@ sub filter_special_casing_line {
      # implemented, it would be by hard-coding in the casing functions in the
      # Perl core, not through tables.  But if there is a new condition we don't
      # know about, output a warning.  We know about all the conditions through
-    # 5.2
+    # 6.0
      if ($fields[4] ne "") {
          my @conditions = split ' ', $fields[4];
          if ($conditions[0] ne 'tr'  # We know that these languages have
@@ -10805,7 +10882,7 @@ sub filter_blocks_lines {
          #                                one.
          #   Titlecase                    duplicates UnicodeData.txt: gc=lt
          #   Unassigned Code Value        duplicates UnicodeData.txt: gc=cc
-        #   Zero-width                   never made into offical property;
+        #   Zero-width                   never made into official property;
          #                                subset of gc=cf
          # Most of the properties have the same names in this file as in later
          # versions, but a couple do not.
@@ -10946,7 +11023,8 @@ sub finish_Unicode() {
  
                  # Add mappings to the property for each code point in the list
                  foreach my $range ($list->ranges) {
-                    $property->add_map($range->start, $range->end, $default);
+                    $property->add_map($range->start, $range->end, $default,
+                    Replace => $CROAK);
                  }
              }
  
@@ -10973,11 +11051,12 @@ sub finish_Unicode() {
          }
  
          # Add any remaining code points to the mapping, using the default for
-        # missing code points
+        # missing code points.
          if (defined (my $default_map = $property->default_map)) {
-            foreach my $range ($property->inverse_list->ranges) {
-                $property->add_map($range->start, $range->end, $default_map);
-            }
+
+            # This fills in any missing values with the default.
+            $property->add_map(0, $LAST_UNICODE_CODEPOINT,
+                               $default_map, Replace => $NO);
  
              # Make sure there is a match table for the default
              if (! defined $property->table($default_map)) {
@@ -11075,14 +11154,6 @@ END
      $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
  
      my $Cs = $gc->table('Cs');
-    if (defined $Cs) {
-        $Cs->add_note('Mostly not usable in Perl.');
-        $Cs->add_comment(join_lines(<<END
-Surrogates are used exclusively for I/O in UTF-16, and should not appear in
-Unicode text, and hence their use will generate (usually fatal) messages
-END
-        ));
-    }
  
  
      # Folding information was introduced later into Unicode data.  To get
@@ -11323,7 +11394,8 @@ sub compile_perl() {
                              );
  
      my $Word = $perl->add_match_table('Word',
-                                Description => '\w, including beyond ASCII',
+                                Description => '\w, including beyond ASCII;'
+                                            . ' = \p{Alnum} + \pM + \p{Pc}',
                                  Initialize => $Alnum + $gc->table('Mark'),
                                  );
      $Word->add_alias('XPosixWord');
@@ -11682,7 +11754,7 @@ END
          my $description_start = "Code point's usage introduced in version ";
          $first_age->add_description($description_start . $first_age->name);
  
-        # To construct the accumlated values, for each of the age tables
+        # To construct the accumulated values, for each of the age tables
          # starting with the 2nd earliest, merge the earliest with it, to get
          # all those code points existing in the 2nd earliest.  Repeat merging
          # the new 2nd earliest with the 3rd earliest to get all those existing
@@ -12019,7 +12091,7 @@ END
  
  sub register_file_for_name($$$) {
      # Given info about a table and a datafile that it should be associated
-    # with, register that assocation
+    # with, register that association
  
      my $table = shift;
      my $directory_ref = shift;   # Array of the directory path for the file
@@ -12069,14 +12141,12 @@ sub register_file_for_name($$$) {
          # goes through all aliases in the UCD that we generate regex match
          # files for
          foreach my $alias ($table->aliases) {
-            my $name = $alias->name;
+            my $standard = utf8_heavy_name($table, $alias);
  
              # Generate an entry in either the loose or strict hashes, which
              # will translate the property and alias names combination into the
              # file where the table for them is stored.
-            my $standard;
              if ($alias->loose_match) {
-                $standard = $property . standardize($alias->name);
                  if (exists $loose_to_file_of{$standard}) {
                      Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
                  }
@@ -12085,7 +12155,6 @@ sub register_file_for_name($$$) {
                  }
              }
              else {
-                $standard = lc ($property . $name);
                  if (exists $stricter_to_file_of{$standard}) {
                      Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
                  }
@@ -12098,7 +12167,7 @@ sub register_file_for_name($$$) {
                      # will work.  Also note that this assumes that such a
                      # number is matched strictly; so if that were to change,
                      # this would be wrong.
-                    if ((my $integer_name = $name)
+                    if ((my $integer_name = $alias->name)
                              =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
                      {
                          $stricter_to_file_of{$property . $integer_name}
@@ -12523,7 +12592,6 @@ sub make_table_pod_entries($) {
                  # expression, but with only one of 'Single', 'Short' if there
                  # are both items.
                  if ($short_name || $single_form || $table->conflicting) {
-                    $parenthesized .= '(';
                      $parenthesized .= "Short: $short_name" if $short_name;
                      if ($short_name && $single_form) {
                          $parenthesized .= ', ';
@@ -12543,18 +12611,16 @@ sub make_table_pod_entries($) {
              # to go on every entry.
              my $conflicting = join " NOR ", $table->conflicting;
              if ($conflicting) {
-                $parenthesized .= '(' if ! $parenthesized;
-                $parenthesized .=  '; ' if $parenthesized ne '(';
+                $parenthesized .=  '; ' if $parenthesized ne "";
                  $parenthesized .= "NOT $conflicting";
              }
-            $parenthesized .= ')' if $parenthesized;
  
-            push @info, $parenthesized if $parenthesized;
+            push @info, "($parenthesized)" if $parenthesized;
  
              if ($table_property != $perl && $table->perl_extension) {
                  push @info, '(Perl extension)';
              }
-            push @info, "($string_count)" if $output_range_counts;
+            push @info, "($string_count)";
  
              # Now, we have both the entry and info so add them to the
              # list of all the properties.
@@ -12865,7 +12931,7 @@ both single and compound forms.
  B<Compound forms> consist of two components, separated by an equals sign or a
  colon.  The first component is the property name, and the second component is
  the particular value of the property to match against, for example,
-'\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
+'\\p{Script: Greek}' and '\\p{Script=Greek}' both mean to match characters
  whose Script property is Greek.
  
  B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
@@ -12917,29 +12983,28 @@ adjacent to (but within) the braces and the colon or equal sign.
  =back
  
  Some properties are considered obsolete, but still available.  There are
-several varieties of obsolesence:
+several varieties of obsolescence:
  
  =over 4
  
  =item Obsolete
  
  Properties marked with $a_bold_obsolete in the table are considered
-obsolete.  At the time of this writing (Unicode version 5.2) there is no
-information in the Unicode standard about the implications of a property being
  obsolete.
  
  =item Stabilized
  
-Obsolete properties may be stabilized.  This means that they are not actively
-maintained by Unicode, and will not be extended as new characters are added to
-the standard.  Such properties are marked with $a_bold_stabilized in the
-table.  At the time of this writing (Unicode version 5.2) there is no further
-information in the Unicode standard about the implications of a property being
-stabilized.
+Obsolete properties may be stabilized.  Such a determination does not indicate
+that the property should or should not be used; instead it is a declaration
+that the property will be neither maintained nor extended for newly encoded
+characters.  Such properties are marked with $a_bold_stabilized in the
+table.
  
  =item Deprecated
  
-Obsolete properties may be deprecated.  This means that their use is strongly
+An obsolete property may be deprecated, perhaps because its original intent
+has been replaced by another property or because its specification was somehow
+defective.  This means that its use is strongly
  discouraged, so much so that a warning will be issued if used, unless the
  regular expression is in the scope of a C<S<no warnings 'deprecated'>>
  statement.  $A_bold_deprecated flags each such entry in the table, and
@@ -12962,7 +13027,7 @@ flags each such entry in the table.
  @block_warning
  
  The table below has two columns.  The left column contains the \\p{}
-constructs to look up, possibly preceeded by the flags mentioned above; and
+constructs to look up, possibly preceded by the flags mentioned above; and
  the right column contains information about them, like a description, or
  synonyms.  It shows both the single and compound forms for each property that
  has them.  If the left column is a short name for a property, the right column
@@ -13064,8 +13129,9 @@ the properties are listed enclosed in (parentheses).
  =back
  
  An installation can choose to allow any of these to be matched by changing the
-controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
-and then re-running F<$0>.  (C<\%Config> is available from the Config module).
+controlling lists contained in the program
+C<\$Config{privlib}>/F<unicore/mktables> and then re-running F<mktables>.
+(C<\%Config> is available from the Config module).
  
  =head1 Files in the I<To> directory (for serious hackers only)
  
@@ -13094,8 +13160,8 @@ names in parentheses), and any flags or comments about them, are:
  @map_tables_actually_output
  
  An installation can choose to change which files are generated by changing the
-controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
-and then re-running F<$0>.
+controlling lists contained in the program
+C<\$Config{privlib}>/F<unicore/mktables> and then re-running F<mktables>.
  
  Each of these files defines two hash entries to help reading programs decipher
  it.  One of them looks like this:
@@ -13514,7 +13580,7 @@ sub write_all_tables() {
              $filename = $table->file;
          }
  
-        # Use specified filename if avaliable, or default to property's
+        # Use specified filename if available, or default to property's
          # shortest name.  We need an 8.3 safe filename (which in effect means
          # an "8 safe" one, since the 'pl' after the dot is under 3 chars).
          # The 2nd parameter is if the filename shouldn't be changed, and
@@ -14065,7 +14131,12 @@ my @input_file_objects = (
                                              ? \&filter_v1_ucd
                                              : ($v_version eq v2.1.5)
                                                  ? \&filter_v2_1_5_ucd
-                                                : undef),
+
+                                                # Also, for 5.14 Perls with
+                                                # Unicode 6.0, make changes
+                                                : ($v_version ge v6.0.0)
+                                                    ? \&filter_v6_ucd
+                                                    : undef),
  
                                              # And the main filter
                                              \&filter_UnicodeData_line,
@@ -14477,7 +14548,7 @@ if ( $file_list and $make_list ) {
  #
  # - First section is input files
  #   ($0 itself is not listed but is automatically considered an input)
-# - Section seperator is /^=+\$/
+# - Section separator is /^=+\$/
  # - Second section is a list of output files.
  # - Lines matching /^\\s*#/ are treated as comments
  #   which along with blank lines are ignored.