mktables: Suppress invalid warning on early Unicode releases

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 8d38d0e..58f3678 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -608,6 +608,7 @@ sub uniques {
  $0 = File::Spec->canonpath($0);
  
  my $make_test_script = 0;      # ? Should we output a test script
+my $make_norm_test_script = 0; # ? Should we output a normalization test script
  my $write_unchanged_files = 0; # ? Should we update the output files even if
                                 #    we don't think they have changed
  my $use_directory = "";        # ? Should we chdir somewhere.
@@ -668,6 +669,10 @@ while (@ARGV) {
      {
          $make_test_script = 1;
      }
+    elsif ($arg eq '-makenormtest')
+    {
+        $make_norm_test_script = 1;
+    }
      elsif ($arg eq '-makelist') {
          $make_list = 1;
      }
@@ -1345,6 +1350,9 @@ my %loose_names_ending_in_code_point;   # Same as above, but has blanks, dashes
  # anonymous hash.
  my @code_points_ending_in_code_point;
  
+# To hold Unicode's normalization test suite
+my @normalization_tests;
+
  # Boolean: does this Unicode version have the hangul syllables, and are we
  # writing out a table for them?
  my $has_hangul_syllables = 0;
@@ -2174,6 +2182,7 @@ sub trace { return main::trace(@_); }
          fallback => 0,
          qw("") => "_operator_stringify",
          "." => \&main::_operator_dot,
+        ".=" => \&main::_operator_dot_equal,
      ;
  
      sub _operator_stringify {
@@ -2851,6 +2860,7 @@ sub trace { return main::trace(@_); }
          fallback => 0,
          qw("") => "_operator_stringify",
          "." => \&main::_operator_dot,
+        ".=" => \&main::_operator_dot_equal,
      ;
  
      sub _operator_stringify {
@@ -3001,6 +3011,7 @@ sub trace { return main::trace(@_); }
          fallback => 0,
          qw("") => "_operator_stringify",
          "." => \&main::_operator_dot,
+        ".=" => \&main::_operator_dot_equal,
      ;
  
      sub _operator_stringify {
@@ -3394,9 +3405,17 @@ sub trace { return main::trace(@_); }
          #                         multiple times.  They are stored LIFO, so
          #                         that the final one inserted is the first one
          #                         returned in an ordered search of the table.
+        #                         If this is an exact duplicate, including the
+        #                         value, the original will be moved to be
+        #                         first, before any other duplicate ranges
+        #                         with different values.
          #       => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
          #                         FIFO, so that this one is inserted after all
-        #                         others that currently exist.
+        #                         others that currently exist.  If this is an
+        #                         exact duplicate, including value, of an
+        #                         existing range, this one is discarded
+        #                         (leaving the existing one in its original,
+        #                         higher priority position
          #       => anything else  is the same as => $IF_NOT_EQUIVALENT
          #
          # "same value" means identical for non-type-0 ranges, and it means
@@ -3436,6 +3455,9 @@ sub trace { return main::trace(@_); }
              Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . ").  No action taken.");
              return;
          }
+        if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
+            Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ".  Adding it anyway");
+        }
          #local $to_trace = 1 if main::DEBUG;
  
          if ($operation eq '-') {
@@ -3678,7 +3700,10 @@ sub trace { return main::trace(@_); }
  
              # If to place this new record after, move to beyond all existing
              # ones; but don't add this one if identical to any of them, as it
-            # isn't really a multiple
+            # isn't really a multiple.  This leaves the original order, so
+            # that the current request is ignored.  The reasoning is that the
+            # previous request that wanted this record to have high priority
+            # should have precedence.
              if ($replace == $MULTIPLE_AFTER) {
                  while ($i < @$r && $r->[$i]->start == $start) {
                      return if $value eq $r->[$i]->value
@@ -3686,6 +3711,22 @@ sub trace { return main::trace(@_); }
                      $i++;
                  }
              }
+            else {
+                # If instead we are to place this new record before any
+                # existing ones, remove any identical ones that come after it.
+                # This changes the existing order so that the new one is
+                # first, as is being requested.
+                for (my $j = $i + 1;
+                     $j < @$r && $r->[$j]->start == $start;
+                     $j++)
+                {
+                    if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
+                        splice @$r, $j, 1;
+                        last;   # There should only be one instance, so no
+                                # need to keep looking
+                    }
+                }
+            }
  
              trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
              my @return = splice @$r,
@@ -4147,6 +4188,21 @@ sub trace { return main::trace(@_); }
  
                      return $self->_union($other)
                  },
+        '+=' => sub { my $self = shift;
+                    my $other = shift;
+                    my $reversed = shift;
+
+                    if ($reversed) {
+                        Carp::my_carp_bug("Bad news.  Can't cope with '"
+                        . ref($other)
+                        . ' += '
+                        . ref($self)
+                        . "'.  undef returned.");
+                        return;
+                    }
+
+                    return $self->_union($other)
+                },
          '&' => sub { my $self = shift;
                      my $other = shift;
  
@@ -4833,6 +4889,7 @@ END
      use overload
          fallback => 0,
          "." => \&main::_operator_dot,
+        ".=" => \&main::_operator_dot_equal,
          '!=' => \&main::_operator_not_equal,
          '==' => \&main::_operator_equal,
      ;
@@ -6151,7 +6208,7 @@ END
          my $comment = "";
  
          my $status = $self->status;
-        if ($status) {
+        if ($status && $status ne $PLACEHOLDER) {
              my $warn = uc $status_past_participles{$status};
              $comment .= <<END;
  
@@ -6770,6 +6827,16 @@ sub trace { return main::trace(@_); }
          '+=' => sub {
                          my $self = shift;
                          my $other = shift;
+                        my $reversed = shift;
+
+                        if ($reversed) {
+                            Carp::my_carp_bug("Bad news.  Can't cope with '"
+                            . ref($other)
+                            . ' += '
+                            . ref($self)
+                            . "'.  undef returned.");
+                            return;
+                        }
  
                          return if $self->carp_if_locked;
  
@@ -6787,6 +6854,14 @@ sub trace { return main::trace(@_); }
                          }
                          return $self;
                      },
+        '&=' => sub {
+                        my $self = shift;
+                        my $other = shift;
+
+                        return if $self->carp_if_locked;
+                        $self->_set_range_list($self->_range_list & $other);
+                        return $self;
+                    },
          '-' => sub { my $self = shift;
                      my $other = shift;
                      my $reversed = shift;
@@ -7236,7 +7311,9 @@ END
                      my $flag = $property->status
                                  || $table->status
                                  || $table_alias_object->status;
-                    $flags{$flag} = $status_past_participles{$flag} if $flag;
+                    if ($flag && $flag ne $PLACEHOLDER) {
+                        $flags{$flag} = $status_past_participles{$flag};
+                    }
  
                      $loose_count++;
  
@@ -7634,6 +7711,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
          fallback => 0,
          qw("") => "_operator_stringify",
          "." => \&main::_operator_dot,
+        ".=" => \&main::_operator_dot_equal,
          '==' => \&main::_operator_equal,
          '!=' => \&main::_operator_not_equal,
          '=' => sub { return shift },
@@ -8725,6 +8803,24 @@ sub _operator_dot {
              : "$self$other";
  }
  
+sub _operator_dot_equal {
+    # Overloaded '.=' method that is common to all packages.
+
+    my $self = shift;
+    my $other = shift;
+    my $reversed = shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    $other = "" unless defined $other;
+
+    if ($reversed) {
+        return $other .= "$self";
+    }
+    else {
+        return "$self" . "$other";
+    }
+}
+
  sub _operator_equal {
      # Generic overloaded '==' routine.  To be equal, they must be the exact
      # same object
@@ -9129,6 +9225,9 @@ sub process_PropValueAliases {
      # Process each line of the file ...
      while ($file->next_line) {
  
+        # Fix typo in input file
+        s/CCC133/CCC132/g if $v_version eq v6.1.0;
+
          my ($property, @data) = split /\s*;\s*/;
  
          # The ccc property has an extra field at the beginning, which is the
@@ -9523,6 +9622,33 @@ END
      return @return;
  }
  
+sub process_NormalizationsTest {
+
+    # Each line looks like:
+    #      source code point; NFC; NFD; NFKC; NFKD
+    # e.g.
+    #       1E0A;1E0A;0044 0307;1E0A;0044 0307;
+
+    my $file= shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    # Process each line of the file ...
+    while ($file->next_line) {
+
+        next if /^@/;
+
+        my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
+
+        foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
+            $$var = pack "U0U*", map { hex } split " ", $$var;
+            $$var =~ s/(\\)/$1$1/g;
+        }
+
+        push @normalization_tests,
+                "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
+    } # End of looping through the file
+}
+
  sub output_perl_charnames_line ($$) {
  
      # Output the entries in Perl_charnames specially, using 5 digits instead
@@ -10294,6 +10420,7 @@ END
              $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number.  Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
              if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
                  $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC].  Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
+                $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'.  Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
                  $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
              }
              elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
@@ -10549,7 +10676,7 @@ END
          Carp::carp_extra_args(\@_) if main::DEBUG && @_;
  
          # Flush the buffers.
-        foreach my $i (1 .. $last_field) {
+        foreach my $i (0 .. $last_field) {
              $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
          }
  
@@ -10731,7 +10858,7 @@ END
                              # http://www.unicode.org/versions/corrigendum8.html
              $fields[$BIDI] = "AL";
          }
-        elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
+        elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
              $fields[$CHARNAME] = "";
          }
  
@@ -11628,15 +11755,273 @@ sub  filter_script_extensions_line {
  }
  
  sub setup_early_name_alias {
+    my $file= shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    # This has the effect of pretending that the Name_Alias property was
+    # available in all Unicode releases.  Strictly speaking, this property
+    # should not be availabe in early releases, but doing this allows
+    # charnames.pm to work on older releases without change.  Prior to v5.16
+    # it had these names hard-coded inside it.  Unicode 6.1 came along and
+    # created these names, and so they were removed from charnames.
+
      my $aliases = property_ref('Name_Alias');
-    $aliases = Property->new('Name_Alias') if ! defined $aliases;
+    if (! defined $aliases) {
+        $aliases = Property->new('Name_Alias', Default_Map => "");
+    }
+
+    $file->insert_lines(get_old_name_aliases());
  
-    # Before 6.0, this wasn't a problem, and after it, this alias is part of
-    # the Unicode-delivered file.
-    $aliases->add_map(7, 7, "ALERT: control") if $v_version eq v6.0.0;
      return;
  }
  
+sub get_old_name_aliases () {
+
+    # The Unicode_1_Name field, contains most of these names.  One would
+    # expect, given the field's name, that its values would be fixed across
+    # versions, giving the true Unicode version 1 name for the character.
+    # Sadly, this is not the case.  Actually Version 1.1.5 had no names for
+    # any of the controls; Version 2.0 introduced names for the C0 controls,
+    # and 3.0 introduced C1 names.  3.0.1 removed the name INDEX; and 3.2
+    # changed some names: it
+    #   changed to parenthesized versions like "NEXT LINE" to
+    #       "NEXT LINE (NEL)";
+    #   changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
+    #   changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
+    #   changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
+    # This list contains all the names that were defined so that
+    # charnames::vianame(), etc. understand them all EVEN if this version of
+    # Unicode didn't specify them (this could be construed as a bug).
+    # mktables elsewhere gives preference to the Unicode_1_Name field over
+    # these names, so that viacode() will return the correct value for that
+    # version of Unicode, except when that version doesn't define a name,
+    # viacode() will return one anyway (this also could be construed as a
+    # bug).  But these potential "bugs" allow for the smooth working of code
+    # on earlier Unicode releases.
+
+    my @return = split /\n/, <<'END';
+0000;NULL;control
+0000;NUL;abbreviation
+0001;START OF HEADING;control
+0001;SOH;abbreviation
+0002;START OF TEXT;control
+0002;STX;abbreviation
+0003;END OF TEXT;control
+0003;ETX;abbreviation
+0004;END OF TRANSMISSION;control
+0004;EOT;abbreviation
+0005;ENQUIRY;control
+0005;ENQ;abbreviation
+0006;ACKNOWLEDGE;control
+0006;ACK;abbreviation
+0007;BELL;control
+0007;BEL;abbreviation
+0008;BACKSPACE;control
+0008;BS;abbreviation
+0009;CHARACTER TABULATION;control
+0009;HORIZONTAL TABULATION;control
+0009;HT;abbreviation
+0009;TAB;abbreviation
+000A;LINE FEED;control
+000A;LINE FEED (LF);control
+000A;NEW LINE;control
+000A;END OF LINE;control
+000A;LF;abbreviation
+000A;NL;abbreviation
+000A;EOL;abbreviation
+000B;LINE TABULATION;control
+000B;VERTICAL TABULATION;control
+000B;VT;abbreviation
+000C;FORM FEED;control
+000C;FORM FEED (FF);control
+000C;FF;abbreviation
+000D;CARRIAGE RETURN;control
+000D;CARRIAGE RETURN (CR);control
+000D;CR;abbreviation
+000E;SHIFT OUT;control
+000E;LOCKING-SHIFT ONE;control
+000E;SO;abbreviation
+000F;SHIFT IN;control
+000F;LOCKING-SHIFT ZERO;control
+000F;SI;abbreviation
+0010;DATA LINK ESCAPE;control
+0010;DLE;abbreviation
+0011;DEVICE CONTROL ONE;control
+0011;DC1;abbreviation
+0012;DEVICE CONTROL TWO;control
+0012;DC2;abbreviation
+0013;DEVICE CONTROL THREE;control
+0013;DC3;abbreviation
+0014;DEVICE CONTROL FOUR;control
+0014;DC4;abbreviation
+0015;NEGATIVE ACKNOWLEDGE;control
+0015;NAK;abbreviation
+0016;SYNCHRONOUS IDLE;control
+0016;SYN;abbreviation
+0017;END OF TRANSMISSION BLOCK;control
+0017;ETB;abbreviation
+0018;CANCEL;control
+0018;CAN;abbreviation
+0019;END OF MEDIUM;control
+0019;EOM;abbreviation
+001A;SUBSTITUTE;control
+001A;SUB;abbreviation
+001B;ESCAPE;control
+001B;ESC;abbreviation
+001C;INFORMATION SEPARATOR FOUR;control
+001C;FILE SEPARATOR;control
+001C;FS;abbreviation
+001D;INFORMATION SEPARATOR THREE;control
+001D;GROUP SEPARATOR;control
+001D;GS;abbreviation
+001E;INFORMATION SEPARATOR TWO;control
+001E;RECORD SEPARATOR;control
+001E;RS;abbreviation
+001F;INFORMATION SEPARATOR ONE;control
+001F;UNIT SEPARATOR;control
+001F;US;abbreviation
+0020;SP;abbreviation
+007F;DELETE;control
+007F;DEL;abbreviation
+0080;PADDING CHARACTER;figment
+0080;PAD;abbreviation
+0081;HIGH OCTET PRESET;figment
+0081;HOP;abbreviation
+0082;BREAK PERMITTED HERE;control
+0082;BPH;abbreviation
+0083;NO BREAK HERE;control
+0083;NBH;abbreviation
+0084;INDEX;control
+0084;IND;abbreviation
+0085;NEXT LINE;control
+0085;NEXT LINE (NEL);control
+0085;NEL;abbreviation
+0086;START OF SELECTED AREA;control
+0086;SSA;abbreviation
+0087;END OF SELECTED AREA;control
+0087;ESA;abbreviation
+0088;CHARACTER TABULATION SET;control
+0088;HORIZONTAL TABULATION SET;control
+0088;HTS;abbreviation
+0089;CHARACTER TABULATION WITH JUSTIFICATION;control
+0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
+0089;HTJ;abbreviation
+008A;LINE TABULATION SET;control
+008A;VERTICAL TABULATION SET;control
+008A;VTS;abbreviation
+008B;PARTIAL LINE FORWARD;control
+008B;PARTIAL LINE DOWN;control
+008B;PLD;abbreviation
+008C;PARTIAL LINE BACKWARD;control
+008C;PARTIAL LINE UP;control
+008C;PLU;abbreviation
+008D;REVERSE LINE FEED;control
+008D;REVERSE INDEX;control
+008D;RI;abbreviation
+008E;SINGLE SHIFT TWO;control
+008E;SINGLE-SHIFT-2;control
+008E;SS2;abbreviation
+008F;SINGLE SHIFT THREE;control
+008F;SINGLE-SHIFT-3;control
+008F;SS3;abbreviation
+0090;DEVICE CONTROL STRING;control
+0090;DCS;abbreviation
+0091;PRIVATE USE ONE;control
+0091;PRIVATE USE-1;control
+0091;PU1;abbreviation
+0092;PRIVATE USE TWO;control
+0092;PRIVATE USE-2;control
+0092;PU2;abbreviation
+0093;SET TRANSMIT STATE;control
+0093;STS;abbreviation
+0094;CANCEL CHARACTER;control
+0094;CCH;abbreviation
+0095;MESSAGE WAITING;control
+0095;MW;abbreviation
+0096;START OF GUARDED AREA;control
+0096;START OF PROTECTED AREA;control
+0096;SPA;abbreviation
+0097;END OF GUARDED AREA;control
+0097;END OF PROTECTED AREA;control
+0097;EPA;abbreviation
+0098;START OF STRING;control
+0098;SOS;abbreviation
+0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
+0099;SGC;abbreviation
+009A;SINGLE CHARACTER INTRODUCER;control
+009A;SCI;abbreviation
+009B;CONTROL SEQUENCE INTRODUCER;control
+009B;CSI;abbreviation
+009C;STRING TERMINATOR;control
+009C;ST;abbreviation
+009D;OPERATING SYSTEM COMMAND;control
+009D;OSC;abbreviation
+009E;PRIVACY MESSAGE;control
+009E;PM;abbreviation
+009F;APPLICATION PROGRAM COMMAND;control
+009F;APC;abbreviation
+00A0;NBSP;abbreviation
+00AD;SHY;abbreviation
+200B;ZWSP;abbreviation
+200C;ZWNJ;abbreviation
+200D;ZWJ;abbreviation
+200E;LRM;abbreviation
+200F;RLM;abbreviation
+202A;LRE;abbreviation
+202B;RLE;abbreviation
+202C;PDF;abbreviation
+202D;LRO;abbreviation
+202E;RLO;abbreviation
+FEFF;BYTE ORDER MARK;alternate
+FEFF;BOM;abbreviation
+FEFF;ZWNBSP;abbreviation
+END
+
+    if ($v_version ge v3.0.0) {
+        push @return, split /\n/, <<'END';
+180B; FVS1; abbreviation
+180C; FVS2; abbreviation
+180D; FVS3; abbreviation
+180E; MVS; abbreviation
+202F; NNBSP; abbreviation
+END
+    }
+
+    if ($v_version ge v3.2.0) {
+        push @return, split /\n/, <<'END';
+034F; CGJ; abbreviation
+205F; MMSP; abbreviation
+2060; WJ; abbreviation
+END
+        # Add in VS1..VS16
+        my $cp = 0xFE00 - 1;
+        for my $i (1..16) {
+            push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+        }
+    }
+    if ($v_version ge v4.0.0) { # Add in VS17..VS256
+        my $cp = 0xE0100 - 17;
+        for my $i (17..256) {
+            push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
+        }
+    }
+
+    # ALERT did not come along until 6.0, at which point it became preferred
+    # over BELL, and was never in the Unicode_1_Name field.  For the same
+    # reasons, that the other names are made known to all releases by this
+    # function, we make ALERT known too.  By inserting it
+    # last in early releases, BELL is preferred over it; and vice-vers in 6.0
+    my $alert = '0007; ALERT; control';
+    if ($v_version lt v6.0.0) {
+        push @return, $alert;
+    }
+    else {
+        unshift @return, $alert;
+    }
+
+    return @return;
+}
+
  sub filter_later_version_name_alias_line {
  
      # This file has an extra entry per line for the alias type.  This is
@@ -11661,8 +12046,10 @@ sub filter_later_version_name_alias_line {
  sub filter_early_version_name_alias_line {
  
      # Early versions did not have the trailing alias type field; implicitly it
-    # was 'correction'
-    $_ .= "; correction";
+    # was 'correction'.   But our synthetic lines we add in this program do
+    # have it, so test for the type field.
+    $_ .= "; correction" if $_ !~ /;.*;/;
+
      filter_later_version_name_alias_line;
      return;
  }
@@ -11670,16 +12057,37 @@ sub filter_early_version_name_alias_line {
  sub finish_Unicode() {
      # This routine should be called after all the Unicode files have been read
      # in.  It:
-    # 1) Adds the mappings for code points missing from the files which have
+    # 1) Creates properties that are missing from the version of Unicode being
+    #    compiled, and which, for whatever reason, are needed for the Perl
+    #    core to function properly.  These are minimally populated as
+    #    necessary.
+    # 2) Adds the mappings for code points missing from the files which have
      #    defaults specified for them.
-    # 2) At this this point all mappings are known, so it computes the type of
+    # 3) At this this point all mappings are known, so it computes the type of
      #    each property whose type hasn't been determined yet.
-    # 3) Calculates all the regular expression match tables based on the
+    # 4) Calculates all the regular expression match tables based on the
      #    mappings.
-    # 3) Calculates and adds the tables which are defined by Unicode, but
+    # 5) Calculates and adds the tables which are defined by Unicode, but
      #    which aren't derived by them, and certain derived tables that Perl
      #    uses.
  
+    # GCB and hst are not in early Unicode releases; create dummy ones if
+    # they don't exist, as the core needs tables generated from them.
+    my $gcb = property_ref('Grapheme_Cluster_Break');
+    if (! defined $gcb) {
+        $gcb = Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
+                             Status => $PLACEHOLDER,
+                             Type => $ENUM,
+                             Default_Map => 'Other');
+    }
+    my $hst = property_ref('Hangul_Syllable_Type');
+    if (!defined $hst) {
+        $hst = Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
+                             Status => $PLACEHOLDER,
+                             Type => $ENUM,
+                             Default_Map => 'Not_Applicable');
+    }
+
      # For each property, fill in any missing mappings, and calculate the re
      # match tables.  If a property has more than one missing mapping, the
      # default is a reference to a data structure, and requires data from other
@@ -12113,7 +12521,12 @@ sub compile_perl() {
      # identical code points, but their caseless equivalents are not the same,
      # one being 'Cased' and the other being 'LC', and so now must be kept as
      # separate entities.
-    $Title += $lt if defined $lt;
+    if (defined $lt) {
+        $Title += $lt;
+    }
+    else {
+        push @tables_that_may_be_empty, $Title->complete_name;
+    }
  
      # If this Unicode version doesn't have Cased, set up our own.  From
      # Unicode 5.1: Definition D120: A character C is defined to be cased if
@@ -12279,11 +12692,12 @@ sub compile_perl() {
      # Perl's traditional space doesn't include Vertical Tab
      my $XPerlSpace = $perl->add_match_table('XPerlSpace',
                                    Description => '\s, including beyond ASCII',
-                                  Initialize => $Space - 0x000B,
+                                  #Initialize => $Space - 0x000B,
+                                  Initialize => $Space,
                                  );
      $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
      my $PerlSpace = $perl->add_match_table('PerlSpace',
-                        Description => '\s, restricted to ASCII = [ \f\n\r\t]',
+                        Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
                          Initialize => $XPerlSpace & $ASCII,
                              );
  
@@ -12430,9 +12844,9 @@ sub compile_perl() {
  
      # The 'extended' grapheme cluster came in 5.1.  The non-extended
      # definition differs too much from the traditional Perl one to use.
-    if (defined $gcb && defined $gcb->table('SpacingMark')) {
+    if (defined $gcb->table('SpacingMark')) {
  
-        # Note that assumes HST is defined; it came in an earlier release than
+        # Note that assumes hst is defined; it came in an earlier release than
          # GCB.  In the line below, two negatives means: yes hangul
          $begin += ~ property_ref('Hangul_Syllable_Type')
                                                      ->table('Not_Applicable')
@@ -12455,16 +12869,7 @@ sub compile_perl() {
          # We set things up so the Perl core degrades gracefully, possibly with
          # placeholders that match nothing.
  
-        if (! defined $gcb) {
-            $gcb = Property->new('GCB', Status => $PLACEHOLDER);
-        }
-        my $hst = property_ref('HST');
-        if (!defined $hst) {
-            $hst = Property->new('HST', Status => $PLACEHOLDER);
-            $hst->add_match_table('Not_Applicable',
-                                Initialize => $Any,
-                                Matches_All => 1);
-        }
+        my $hst = property_ref('Hangul_Syllable_Type');
  
          # On some releases, here we may not have the needed tables for the
          # perl core, in some releases we may.
@@ -12475,11 +12880,9 @@ sub compile_perl() {
                  push @tables_that_may_be_empty, $table->complete_name;
              }
  
-            # The HST property predates the GCB one, and has identical tables
+            # The hst property predates the GCB one, and has identical tables
              # for some of them, so use it if we can.
-            if ($table->is_empty
-                && defined $hst
-                && defined $hst->table($name))
+            if ($table->is_empty && defined $hst->table($name))
              {
                  $table += $hst->table($name);
              }
@@ -12496,7 +12899,7 @@ sub compile_perl() {
          push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
      } else {
          $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
-        $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
+        $lv_lvt_v->add_comment('For use in \X; matches: hst=LV | hst=LVT | hst=V');
      }
  
      # Was previously constructed to contain both Name and Unicode_1_Name
@@ -12510,95 +12913,93 @@ sub compile_perl() {
      }
  
      my $alias_sentence = "";
+    my %abbreviations;
      my $alias = property_ref('Name_Alias');
-    if (defined $alias) {
-        push @composition, 'Name_Alias';
-        $perl_charname->set_proxy_for('Name_Alias');
-        my %abbreviations;
-
-        # Add each entry in Name_Alias to Perl_Charnames.  Where these go with
-        # respect to any existing entry depends on the entry type.
-        # Corrections go before said entry, as they should be returned in
-        # preference over the existing entry.  (A correction to a correction
-        # should be later in the Name_Alias table, so it will correctly
-        # precede the erroneous correction in Perl_Charnames.)
-        #
-        # Abbreviations go after everything else, so they are saved
-        # temporarily in a hash for later.
-        #
-        # Controls are currently added afterwards.  This is because Perl has
-        # previously used the Unicode1 name, and so should still use that.
-        # (Most of them will be the same anyway, in which case we don't add a
-        # duplicate)
-
-        $alias->reset_each_range;
-        while (my ($range) = $alias->each_range) {
-            next if $range->value eq "";
-            my $code_point = $range->start;
-            if ($code_point != $range->end) {
-                Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
-            }
-            my ($value, $type) = split ': ', $range->value;
-            my $replace_type;
-            if ($type eq 'correction') {
-                $replace_type = $MULTIPLE_BEFORE;
-            }
-            elsif ($type eq 'abbreviation') {
-
-                # Save for later
-                $abbreviations{$value} = $code_point;
-                next;
-            }
-            elsif ($type eq 'control') {
-                $replace_type = $MULTIPLE_AFTER;
-            }
-            else {
-                $replace_type = $MULTIPLE_AFTER;
-            }
-
-            # Actually add; before or after current entry(ies) as determined
-            # above.
-            $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+    push @composition, 'Name_Alias';
+    $perl_charname->set_proxy_for('Name_Alias');
+
+    # Add each entry in Name_Alias to Perl_Charnames.  Where these go with
+    # respect to any existing entry depends on the entry type.  Corrections go
+    # before said entry, as they should be returned in preference over the
+    # existing entry.  (A correction to a correction should be later in the
+    # Name_Alias table, so it will correctly precede the erroneous correction
+    # in Perl_Charnames.)
+    #
+    # Abbreviations go after everything else, so they are saved temporarily in
+    # a hash for later.
+    #
+    # Controls are currently added afterwards.  This is because Perl has
+    # previously used the Unicode1 name, and so should still use that.  (Most
+    # of them will be the same anyway, in which case we don't add a duplicate)
+
+    $alias->reset_each_range;
+    while (my ($range) = $alias->each_range) {
+        next if $range->value eq "";
+        my $code_point = $range->start;
+        if ($code_point != $range->end) {
+            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
+        }
+        my ($value, $type) = split ': ', $range->value;
+        my $replace_type;
+        if ($type eq 'correction') {
+            $replace_type = $MULTIPLE_BEFORE;
+        }
+        elsif ($type eq 'abbreviation') {
+
+            # Save for later
+            $abbreviations{$value} = $code_point;
+            next;
          }
-
-        # Now add the Unicode_1 names for the controls.  These come after the
-        # official names, as they are only recommended (by TR18; unclear as of
-        # this writing if that recommendation will be withdrawn, but if it is,
-        # we want to add them anyway for backwards compatibility).  Only a few
-        # differ from the official names.
-        foreach my $range (property_ref('Unicode_1_Name')->ranges) {
-            my $code_point = $range->start;
-            my $unicode_1_value = $range->value;
-            next if $unicode_1_value eq "";     # Skip if name doesn't exist.
-
-            if ($code_point != $range->end) {
-                Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
-            }
-
-            # To handle EBCDIC, we don't hard code in the code points of the
-            # controls; instead realizing that all of them are below 256.
-            last if $code_point > 255;
-
-            # We only add in the controls.
-            next if $gc->value_of($code_point) ne 'Cc';
-
-            # This won't add an exact duplicate.
-            $perl_charname->add_duplicate($code_point, $unicode_1_value,
-                                          Replace => $MULTIPLE_AFTER);
+        else {
+            $replace_type = $MULTIPLE_AFTER;
          }
  
-        # Now that have everything added, add in abbreviations after
-        # everything else.
-        foreach my $value (keys %abbreviations) {
-            $perl_charname->add_duplicate($abbreviations{$value}, $value, Replace => $MULTIPLE_AFTER);
-        }
-        $alias_sentence = <<END;
+        # Actually add; before or after current entry(ies) as determined
+        # above.
+
+        $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
+    }
+    $alias_sentence = <<END;
  The Name_Alias property adds duplicate code point entries that are
  alternatives to the original name.  If an addition is a corrected
  name, it will be physically first in the table.  The original (less correct,
  but still valid) name will be next; then any alternatives, in no particular
  order; and finally any abbreviations, again in no particular order.
  END
+
+    # Now add the Unicode_1 names for the controls.  The Unicode_1 names had
+    # precedence before 6.1, so should be first in the file; the other names
+    # have precedence starting in 6.1,
+    my $before_or_after = ($v_version lt v6.1.0)
+                          ? $MULTIPLE_BEFORE
+                          : $MULTIPLE_AFTER;
+
+    foreach my $range (property_ref('Unicode_1_Name')->ranges) {
+        my $code_point = $range->start;
+        my $unicode_1_value = $range->value;
+        next if $unicode_1_value eq "";     # Skip if name doesn't exist.
+
+        if ($code_point != $range->end) {
+            Carp::my_carp_bug("Bad News.  Expecting only one code point in the range $range.  Just to keep going, using only the first code point;");
+        }
+
+        # To handle EBCDIC, we don't hard code in the code points of the
+        # controls; instead realizing that all of them are below 256.
+        last if $code_point > 255;
+
+        # We only add in the controls.
+        next if $gc->value_of($code_point) ne 'Cc';
+
+        # This won't add an exact duplicate.
+        $perl_charname->add_duplicate($code_point, $unicode_1_value,
+                                        Replace => $before_or_after);
+    }
+
+    # Now that have everything added, add in abbreviations after
+    # everything else.
+    foreach my $value (keys %abbreviations) {
+        $perl_charname->add_duplicate($abbreviations{$value}, $value,
+                                        Replace => $MULTIPLE_AFTER);
      }
  
      my $comment;
@@ -14451,7 +14852,8 @@ Certain properties are accessible also via core function calls.  These are:
   Uppercase_Mapping          uc()
  
  Also, Case_Folding is accessible through the C</i> modifier in regular
-expressions.
+expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
+operator.
  
  And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
  interpolation in double-quoted strings and regular expressions; and functions
@@ -14942,33 +15344,36 @@ sub make_UCD () {
  
      # Make a list of all combinations of properties/values that are suppressed.
      my @suppressed;
-    foreach my $property_name (keys %why_suppressed) {
+    if (! $debug_skip) {    # This tends to fail in this debug mode
+        foreach my $property_name (keys %why_suppressed) {
  
-        # Just the value
-        my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
+            # Just the value
+            my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
  
-        # The hash may contain properties not in this release of Unicode
-        next unless defined (my $property = property_ref($property_name));
+            # The hash may contain properties not in this release of Unicode
+            next unless defined (my $property = property_ref($property_name));
  
-        # Find all combinations
-        foreach my $prop_alias ($property->aliases) {
-            my $prop_alias_name = standardize($prop_alias->name);
+            # Find all combinations
+            foreach my $prop_alias ($property->aliases) {
+                my $prop_alias_name = standardize($prop_alias->name);
  
-            # If no =value, there's just one combination possibe for this
-            if (! $value_name) {
+                # If no =value, there's just one combination possibe for this
+                if (! $value_name) {
  
-                # The property may be suppressed, but there may be a proxy for
-                # it, so it shouldn't be listed as suppressed
-                next if $prop_alias->ucd;
-                push @suppressed, $prop_alias_name;
-            }
-            else {  # Otherwise
-                foreach my $value_alias ($property->table($value_name)->aliases)
-                {
-                    next if $value_alias->ucd;
+                    # The property may be suppressed, but there may be a proxy
+                    # for it, so it shouldn't be listed as suppressed
+                    next if $prop_alias->ucd;
+                    push @suppressed, $prop_alias_name;
+                }
+                else {  # Otherwise
+                    foreach my $value_alias
+                                    ($property->table($value_name)->aliases)
+                    {
+                        next if $value_alias->ucd;
  
-                    push @suppressed, "$prop_alias_name="
-                                      .  standardize($value_alias->name);
+                        push @suppressed, "$prop_alias_name="
+                                        .  standardize($value_alias->name);
+                    }
                  }
              }
          }
@@ -15299,10 +15704,11 @@ sub write_all_tables() {
                      }
                  }
              }
-            elsif ($count == $MAX_UNICODE_CODEPOINTS) {
-                if ($table == $property || $table->leader == $table) {
+            elsif ($count == $MAX_UNICODE_CODEPOINTS
+                   && ($table == $property || $table->leader == $table)
+                   && $table->property->status != $PLACEHOLDER)
+            {
                      Carp::my_carp("$table unexpectedly matches all Unicode code points.  Proceeding anyway.");
-                }
              }
  
              if ($table->fate == $SUPPRESSED) {
@@ -15575,6 +15981,7 @@ sub write_all_tables() {
      make_UCD;
  
      make_property_test_script() if $make_test_script;
+    make_normalization_test_script() if $make_norm_test_script;
      return;
  }
  
@@ -16025,6 +16432,82 @@ sub make_property_test_script() {
      return;
  }
  
+sub make_normalization_test_script() {
+    print "Making normalization test script\n" if $verbosity >= $PROGRESS;
+
+    my $n_path = 'TestNorm.pl';
+
+    unshift @normalization_tests, <<'END';
+use utf8;
+use Test::More;
+
+sub ord_string {    # Convert packed ords to printable string
+    use charnames ();
+    return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
+                                                unpack "U*", shift) .  "'";
+    #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) .  "'";
+}
+
+sub Test_N {
+    my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
+    my $display_source = ord_string($source);
+    my $display_nfc = ord_string($nfc);
+    my $display_nfd = ord_string($nfd);
+    my $display_nfkc = ord_string($nfkc);
+    my $display_nfkd = ord_string($nfkd);
+
+    use Unicode::Normalize;
+    #    NFC
+    #      nfc ==  toNFC(source) ==  toNFC(nfc) ==  toNFC(nfd)
+    #      nfkc ==  toNFC(nfkc) ==  toNFC(nfkd)
+    #
+    #    NFD
+    #      nfd ==  toNFD(source) ==  toNFD(nfc) ==  toNFD(nfd)
+    #      nfkd ==  toNFD(nfkc) ==  toNFD(nfkd)
+    #
+    #    NFKC
+    #      nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
+    #      toNFKC(nfkc) == toNFKC(nfkd)
+    #
+    #    NFKD
+    #      nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
+    #      toNFKD(nfkc) == toNFKD(nfkd)
+
+    is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
+    is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
+    is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
+    is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
+    is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
+
+    is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
+    is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
+    is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
+    is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
+    is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
+
+    is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
+    is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
+    is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
+    is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
+    is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
+
+    is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
+    is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
+    is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
+    is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
+    is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
+}
+END
+
+    &write($n_path,
+           1,           # Is utf8;
+           [
+            @normalization_tests,
+            'done_testing();'
+            ]);
+    return;
+}
+
  # This is a list of the input files and how to handle them.  The files are
  # processed in their order in this list.  Some reordering is possible if
  # desired, but the v0 files should be first, and the extracted before the
@@ -16175,8 +16658,9 @@ my @input_file_objects = (
      Input_file->new('BidiMirroring.txt', v3.0.1,
                      Property => 'Bidi_Mirroring_Glyph',
                      ),
-    Input_file->new("NormalizationTest.txt", v3.0.1,
-                    Skip => 'Validation Tests',
+    Input_file->new("NormTest.txt", v3.0.0,
+                     Handler => \&process_NormalizationsTest,
+                     Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
                      ),
      Input_file->new('CaseFolding.txt', v3.0.1,
                      Pre_Handler => \&setup_case_folding,
@@ -16234,7 +16718,7 @@ my @input_file_objects = (
      Input_file->new('NamedSequences.txt', v4.1.0,
                      Handler => \&process_NamedSequences
                      ),
-    Input_file->new('NameAliases.txt', v5.0.0,
+    Input_file->new('NameAliases.txt', v0,
                      Property => 'Name_Alias',
                      Pre_Handler => ($v_version le v6.0.0)
                                     ? \&setup_early_name_alias