Unicode/UCD.pm: Clarify pod

[perl5.git] / lib / Unicode / UCD.pm
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index 074284f..a882ab5 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -4,11 +4,8 @@ use strict;
  use warnings;
  no warnings 'surrogate';    # surrogates can be inputs to this
  use charnames ();
-use Unicode::Normalize qw(getCombinClass NFD);
  
-our $VERSION = '0.40';
-
-use Storable qw(dclone);
+our $VERSION = '0.46';
  
  require Exporter;
  
@@ -20,7 +17,7 @@ our @EXPORT_OK = qw(charinfo
                     charinrange
                     general_categories bidi_types
                     compexcl
-                   casefold casespec
+                   casefold all_casefolds casespec
                     namedseq
                      num
                      prop_aliases
@@ -44,6 +41,9 @@ Unicode::UCD - Unicode character database
      use Unicode::UCD 'casefold';
      my $casefold = casefold(0xFB00);
  
+    use Unicode::UCD 'all_casefolds';
+    my $all_casefolds_ref = all_casefolds();
+
      use Unicode::UCD 'casespec';
      my $casespec = casespec(0xFB00);
  
@@ -114,6 +114,7 @@ my $VERSIONFH;
  my $CASEFOLDFH;
  my $CASESPECFH;
  my $NAMEDSEQFH;
+my $v_unicode_version;  # v-string.
  
  sub openunicode {
      my ($rfh, @path) = @_;
@@ -132,6 +133,35 @@ sub openunicode {
      return $f;
  }
  
+sub _dclone ($) {   # Use Storable::dclone if available; otherwise emulate it.
+    # Import Storable's dclone only if DynaLoader's boot routine is defined.
+    use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);
+    # Returns a deep copy of the single argument; prefer the real dclone.
+    return dclone(shift) if defined &dclone;
+    # Fallback: handles plain scalars, ARRAY refs and HASH refs only.
+    my $arg = shift;
+    my $type = ref $arg;
+    return $arg unless $type;   # No deep cloning needed for scalars
+    # Containers are rebuilt element by element, recursing into each value.
+    if ($type eq 'ARRAY') {
+        my @return;
+        foreach my $element (@$arg) {
+            push @return, &_dclone($element);   # '&' call form bypasses the ($) prototype
+        }
+        return \@return;
+    }
+    elsif ($type eq 'HASH') {
+        my %return;
+        foreach my $key (keys %$arg) {
+            $return{$key} = &_dclone($arg->{$key});   # keys are copied as-is, values cloned
+        }
+        return \%return;
+    }
+    else {
+        croak "_dclone can't handle " . $type;   # any other ref type (CODE, a blessed object, ...)
+    }
+}
+
  =head2 B<charinfo()>
  
      use Unicode::UCD 'charinfo';
@@ -303,30 +333,7 @@ my %SIMPLE_LOWER;
  my %SIMPLE_TITLE;
  my %SIMPLE_UPPER;
  my %UNICODE_1_NAMES;
-
-sub _charinfo_case {
-
-    # Returns the value to set into one of the case fields in the charinfo
-    # structure.
-    #   $char is the character,
-    #   $cased is the case-changed character
-    #   $file is the file in lib/unicore/To/$file that contains the data
-    #       needed for this, in the form that _search() understands.
-    #   $hash_ref points to the hash holding the contents of $file.  It will
-    #       be populated if empty.
-    # By using the 'uc', etc. functions, we avoid loading more files into
-    # memory except for those rare cases where the simple casing (which has
-    # been what charinfo() has always returned, is different than the full
-    # casing.
-    my ($char, $cased, $file, $hash_ref) = @_;
-
-    return "" if $cased eq $char;
-
-    return sprintf("%04X", ord $cased) if length($cased) == 1;
-
-    %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
-    return $hash_ref->{ord $char} // "";
-}
+my %ISO_COMMENT;
  
  sub charinfo {
  
@@ -339,6 +346,9 @@ sub charinfo {
  
      use feature 'unicode_strings';
  
+    # Will fail if called under minitest
+    use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD);
+
      my $arg  = shift;
      my $code = _getcode($arg);
      croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
@@ -349,7 +359,7 @@ sub charinfo {
      my %prop;
      my $char = chr($code);
  
-    @CATEGORIES =_read_table("unicore/To/Gc.pl") unless @CATEGORIES;
+    @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
      $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
                          // $utf8::SwashInfo{'ToGc'}{'missing'};
  
@@ -361,7 +371,7 @@ sub charinfo {
  
      $prop{'combining'} = getCombinClass($code);
  
-    @BIDIS =_read_table("unicore/To/Bc.pl") unless @BIDIS;
+    @BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
      $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
                      // $utf8::SwashInfo{'ToBc'}{'missing'};
  
@@ -377,14 +387,15 @@ sub charinfo {
      # Having no decomposition implies an empty field; otherwise, all but
      # "Canonical" imply a compatible decomposition, and the type is prefixed
      # to that, as it is in UnicodeData.txt
-    if ($char =~ /\p{Block=Hangul_Syllables}/) {
+    UnicodeVersion() unless defined $v_unicode_version;
+    if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) {
          # The code points of the decomposition are output in standard Unicode
          # hex format, separated by blanks.
          $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
                                             unpack "U*", NFD($char);
      }
      else {
-        @DECOMPOSITIONS = _read_table("unicore/Decomposition.pl")
+        @DECOMPOSITIONS = _read_table("Decomposition.pl")
                            unless @DECOMPOSITIONS;
          $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
                                                                  $code) // "";
@@ -406,8 +417,7 @@ sub charinfo {
              # e.g., TAMIL NUMBER TEN.
              $prop{'decimal'} = "";
  
-            @NUMERIC_TYPES =_read_table("unicore/To/Nt.pl")
-                                unless @NUMERIC_TYPES;
+            @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
              if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
                  eq 'Digit')
              {
@@ -422,17 +432,34 @@ sub charinfo {
  
      $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
  
-    %UNICODE_1_NAMES =_read_table("unicore/To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
+    %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
      $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
  
-    # This is true starting in 6.0, but, num() also requires 6.0, so
-    # don't need to test for version again here.
-    $prop{'comment'} = "";
+    UnicodeVersion() unless defined $v_unicode_version;
+    if ($v_unicode_version ge v6.0.0) {
+        $prop{'comment'} = "";
+    }
+    else {
+        %ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT;
+        $prop{'comment'} = (defined $ISO_COMMENT{$code})
+                           ? $ISO_COMMENT{$code}
+                           : "";
+    }
+
+    %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
+    $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
+                     ? sprintf("%04X", $SIMPLE_UPPER{$code})
+                     : "";
  
-    $prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \%SIMPLE_UPPER);
-    $prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \%SIMPLE_LOWER);
-    $prop{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
-                                                                \%SIMPLE_TITLE);
+    %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
+    $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
+                     ? sprintf("%04X", $SIMPLE_LOWER{$code})
+                     : "";
+
+    %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
+    $prop{'title'} = (defined $SIMPLE_TITLE{$code})
+                     ? sprintf("%04X", $SIMPLE_TITLE{$code})
+                     : "";
  
      $prop{block}  = charblock($code);
      $prop{script} = charscript($code);
@@ -490,8 +517,17 @@ sub _read_table ($;$) {
      my @return;
      my %return;
      local $_;
+    my $list = do "unicore/$table";
+
+    # Look up if this property requires adjustments, which we do below if it
+    # does.
+    require "unicore/Heavy.pl";
+    my $property = $table =~ s/\.pl//r;
+    $property = $utf8::file_to_swash_name{$property};
+    my $to_adjust = defined $property
+                    && $utf8::SwashInfo{$property}{'format'} eq 'a';
  
-    for (split /^/m, do $table) {
+    for (split /^/m, $list) {
          my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
                                          \s* ( \# .* )?  # Optional comment
                                          $ /x;
@@ -499,11 +535,14 @@ sub _read_table ($;$) {
          my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
          if ($return_hash) {
              foreach my $i ($decimal_start .. $decimal_end) {
-                $return{$i} = $value;
+                $return{$i} = ($to_adjust)
+                              ? $value + $i - $decimal_start
+                              : $value;
              }
          }
-        elsif (@return &&
-               $return[-1][1] == $decimal_start - 1
+        elsif (! $to_adjust
+               && @return
+               && $return[-1][1] == $decimal_start - 1
                 && $return[-1][2] eq $value)
          {
              # If this is merely extending the previous range, do just that.
@@ -539,7 +578,8 @@ With a L</code point argument> charblock() returns the I<block> the code point
  belongs to, e.g.  C<Basic Latin>.  The old-style block name is returned (see
  L</Old-style versus new-style block names>).
  If the code point is unassigned, this returns the block it would belong to if
-it were assigned.
+it were assigned.  (If the Unicode version being used is so early as to not
+have blocks, all code points are considered to be in C<No_Block>.)
  
  See also L</Blocks versus Scripts>.
  
@@ -565,7 +605,13 @@ sub _charblocks {
      # Can't read from the mktables table because it loses the hyphens in the
      # original.
      unless (@BLOCKS) {
-       if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+        UnicodeVersion() unless defined $v_unicode_version;   # sets $v_unicode_version
+        if ($v_unicode_version lt v2.0.0) {   # presumably: releases that predate Blocks.txt
+            my $subrange = [ 0, 0x10FFFF, 'No_Block' ];   # one range spanning every code point
+            push @BLOCKS, $subrange;
+            push @{$BLOCKS{'No_Block'}}, $subrange;   # keyed by the block name in $subrange->[2]
+        }
+        elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
             local $_;
             local $/ = "\n";
             while (<$BLOCKSFH>) {
@@ -594,7 +640,7 @@ sub charblock {
          return 'No_Block';
      }
      elsif (exists $BLOCKS{$arg}) {
-        return dclone $BLOCKS{$arg};
+        return _dclone $BLOCKS{$arg};
      }
  }
  
@@ -610,7 +656,8 @@ sub charblock {
  
  With a L</code point argument> charscript() returns the I<script> the
  code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
-If the code point is unassigned, it returns C<"Unknown">.
+If the code point is unassigned or the Unicode version being used is so early
+that it doesn't have scripts, this function returns C<"Unknown">.
  
  If supplied with an argument that can't be a code point, charscript() tries
  to do the opposite and interpret the argument as a script name. The
@@ -627,7 +674,15 @@ my @SCRIPTS;
  my %SCRIPTS;
  
  sub _charscripts {
-    @SCRIPTS =_read_table("unicore/To/Sc.pl") unless @SCRIPTS;
+    unless (@SCRIPTS) {
+        UnicodeVersion() unless defined $v_unicode_version;
+        if ($v_unicode_version lt v3.1.0) {
+            push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ];
+        }
+        else {
+            @SCRIPTS =_read_table("To/Sc.pl");
+        }
+    }
      foreach my $entry (@SCRIPTS) {
          $entry->[2] =~ s/(_\w)/\L$1/g;  # Preserve old-style casing
          push @{$SCRIPTS{$entry->[2]}}, $entry;
@@ -646,7 +701,7 @@ sub charscript {
          return $result if defined $result;
          return $utf8::SwashInfo{'ToSc'}{'missing'};
      } elsif (exists $SCRIPTS{$arg}) {
-        return dclone $SCRIPTS{$arg};
+        return _dclone $SCRIPTS{$arg};
      }
  
      return;
@@ -673,7 +728,7 @@ See also L</Blocks versus Scripts>.
  
  sub charblocks {    # Returns a reference to a copy of %BLOCKS
      _charblocks() unless %BLOCKS;    # fill %BLOCKS the first time through
-    return dclone \%BLOCKS;
+    return _dclone \%BLOCKS;    # deep copy, so the caller never holds %BLOCKS itself
  }
  
  =head2 B<charscripts()>
@@ -695,7 +750,7 @@ See also L</Blocks versus Scripts>.
  
  sub charscripts {    # Returns a reference to a copy of %SCRIPTS
      _charscripts() unless %SCRIPTS;    # fill %SCRIPTS the first time through
-    return dclone \%SCRIPTS;
+    return _dclone \%SCRIPTS;    # deep copy, so the caller never holds %SCRIPTS itself
  }
  
  =head2 B<charinrange()>
@@ -755,7 +810,7 @@ my %GENERAL_CATEGORIES =
   );
  
  sub general_categories {    # Returns a reference to a copy of %GENERAL_CATEGORIES
-    return dclone \%GENERAL_CATEGORIES;
+    return _dclone \%GENERAL_CATEGORIES;    # deep copy; the file-level table stays untouched
  }
  
  =head2 B<general_categories()>
@@ -823,7 +878,7 @@ the bidi type name.
  =cut
  
  sub bidi_types {    # Returns a reference to a copy of %BIDI_TYPES
-    return dclone \%BIDI_TYPES;
+    return _dclone \%BIDI_TYPES;    # deep copy; the file-level table stays untouched
  }
  
  =head2 B<compexcl()>
@@ -832,7 +887,9 @@ sub bidi_types {
  
      my $compexcl = compexcl(0x09dc);
  
-This routine is included for backwards compatibility, but as of Perl 5.12, for
+This routine returns C<undef> if the Unicode version being used is so early
+that it doesn't have this property.  It is included for backwards
+compatibility, but as of Perl 5.12 and more modern Unicode versions, for
  most purposes it is probably more convenient to use one of the following
  instead:
  
@@ -867,6 +924,9 @@ sub compexcl {
      croak __PACKAGE__, "::compexcl: unknown code '$arg'"
         unless defined $code;
  
+    UnicodeVersion() unless defined $v_unicode_version;
+    return if $v_unicode_version lt v3.0.0;
+
      no warnings "non_unicode";     # So works on non-Unicode code points
      return chr($code) =~ /\p{Composition_Exclusion}/;
  }
@@ -893,9 +953,11 @@ sub compexcl {
      }
  
  This returns the (almost) locale-independent case folding of the
-character specified by the L</code point argument>.
+character specified by the L</code point argument>.  (Starting in Perl v5.16,
+the core function C<fc()> returns the C<full> mapping (described below)
+faster than this does, and for entire strings.)
  
-If there is no case folding for that code point, C<undef> is returned.
+If there is no case folding for the input code point, C<undef> is returned.
  
  If there is a case folding for that code point, a reference to a hash
  with the following fields is returned:
@@ -938,7 +1000,7 @@ Note that this
  describes the contents of I<mapping>.  It is defined primarily for backwards
  compatibility.
  
-On versions 3.1 and earlier of Unicode, I<status> can also be
+For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be
  C<I> which is the same as C<C> but is a special case for dotted uppercase I and
  dotless lowercase i:
  
@@ -967,7 +1029,8 @@ Each code has at least four hexdigits.
  Note that this folding does not maintain canonical equivalence without
  additional processing.
  
-For versions of Unicode 3.1 and earlier, this field is empty unless there is a
+For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
+there is a
  special folding for Turkic languages, in which case I<status> is C<I>, and
  I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.  
  
@@ -997,54 +1060,88 @@ L<http://www.unicode.org/unicode/reports/tr21>
  my %CASEFOLD;
  
  sub _casefold {
-    unless (%CASEFOLD) {
-       if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
-           local $_;
-           local $/ = "\n";
-           while (<$CASEFOLDFH>) {
-               if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
-                   my $code = hex($1);
-                   $CASEFOLD{$code}{'code'} = $1;
-                   $CASEFOLD{$code}{'turkic'} = "" unless
-                                           defined $CASEFOLD{$code}{'turkic'};
-                   if ($2 eq 'C' || $2 eq 'I') {       # 'I' is only on 3.1 and
-                                                       # earlier Unicodes
-                                                       # Both entries there (I
-                                                       # only checked 3.1) are
-                                                       # the same as C, and
-                                                       # there are no other
-                                                       # entries for those
-                                                       # codepoints, so treat
-                                                       # as if C, but override
-                                                       # the turkic one for
-                                                       # 'I'.
-                       $CASEFOLD{$code}{'status'} = $2;
-                       $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
-                       $CASEFOLD{$code}{'mapping'} = $3;
-                       $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
-                   } elsif ($2 eq 'F') {
-                       $CASEFOLD{$code}{'full'} = $3;
-                       unless (defined $CASEFOLD{$code}{'simple'}) {
-                               $CASEFOLD{$code}{'simple'} = "";
-                               $CASEFOLD{$code}{'mapping'} = $3;
-                               $CASEFOLD{$code}{'status'} = $2;
-                       }
-                   } elsif ($2 eq 'S') {
+    unless (%CASEFOLD) {   # Populate the hash
+        my ($full_invlist_ref, $full_invmap_ref, undef, $default)
+                                                = prop_invmap('Case_Folding');
+
+        # Use the recipe given in the prop_invmap() pod to convert the
+        # inversion map into the hash.
+        for my $i (0 .. @$full_invlist_ref - 1 - 1) {
+            next if $full_invmap_ref->[$i] == $default;
+            my $adjust = -1;
+            for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) {
+                $adjust++;
+                if (! ref $full_invmap_ref->[$i]) {
+
+                    # This is a single character mapping
+                    $CASEFOLD{$j}{'status'} = 'C';
+                    $CASEFOLD{$j}{'simple'}
+                        = $CASEFOLD{$j}{'full'}
+                        = $CASEFOLD{$j}{'mapping'}
+                        = sprintf("%04X", $full_invmap_ref->[$i] + $adjust);
+                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                    $CASEFOLD{$j}{'turkic'} = "";
+                }
+                else {  # prop_invmap ensures that $adjust is 0 for a ref
+                    $CASEFOLD{$j}{'status'} = 'F';
+                    $CASEFOLD{$j}{'full'}
+                    = $CASEFOLD{$j}{'mapping'}
+                    = join " ", map { sprintf "%04X", $_ }
+                                                    @{$full_invmap_ref->[$i]};
+                    $CASEFOLD{$j}{'simple'} = "";
+                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                    $CASEFOLD{$j}{'turkic'} = "";
+                }
+            }
+        }
  
+        # We have filled in the full mappings above, assuming there were no
+        # simple ones for the ones with multi-character maps.  Now, we find
+        # and fix the cases where that assumption was false.
+        (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default)
+                                        = prop_invmap('Simple_Case_Folding');
+        for my $i (0 .. @$simple_invlist_ref - 1 - 1) {
+            next if $simple_invmap_ref->[$i] == $default;
+            my $adjust = -1;
+            for my $j ($simple_invlist_ref->[$i]
+                       .. $simple_invlist_ref->[$i+1] -1)
+            {
+                $adjust++;
+                next if $CASEFOLD{$j}{'status'} eq 'C';
+                $CASEFOLD{$j}{'status'} = 'S';
+                $CASEFOLD{$j}{'simple'}
+                    = $CASEFOLD{$j}{'mapping'}
+                    = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust);
+                $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                $CASEFOLD{$j}{'turkic'} = "";
+            }
+        }
  
-                       # There can't be a simple without a full, and simple
-                       # overrides all but full
+        # We hard-code in the turkish rules
+        UnicodeVersion() unless defined $v_unicode_version;
+        if ($v_unicode_version ge v3.2.0) {
  
-                       $CASEFOLD{$code}{'simple'} = $3;
-                       $CASEFOLD{$code}{'mapping'} = $3;
-                       $CASEFOLD{$code}{'status'} = $2;
-                   } elsif ($2 eq 'T') {
-                       $CASEFOLD{$code}{'turkic'} = $3;
-                   } # else can't happen because only [CIFST] are possible
-               }
-           }
-           close($CASEFOLDFH);
-       }
+            # These two code points should already have regular entries, so
+            # just fill in the turkish fields
+            $CASEFOLD{ord('I')}{'turkic'} = '0131';
+            $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
+        }
+        elsif ($v_unicode_version ge v3.1.0) {
+
+            # These two code points don't have entries otherwise.
+            $CASEFOLD{0x130}{'code'} = '0130';
+            $CASEFOLD{0x131}{'code'} = '0131';
+            $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I';
+            $CASEFOLD{0x130}{'turkic'}
+                = $CASEFOLD{0x130}{'mapping'}
+                = $CASEFOLD{0x130}{'full'}
+                = $CASEFOLD{0x130}{'simple'}
+                = $CASEFOLD{0x131}{'turkic'}
+                = $CASEFOLD{0x131}{'mapping'}
+                = $CASEFOLD{0x131}{'full'}
+                = $CASEFOLD{0x131}{'simple'}
+                = sprintf "%04X", ord('i');
+        }
      }
  }
  
@@ -1059,6 +1156,55 @@ sub casefold {
      return $CASEFOLD{$code};
  }
  
+=head2 B<all_casefolds()>
+
+
+    use Unicode::UCD 'all_casefolds';
+
+    my $all_folds_ref = all_casefolds();
+    foreach my $char_with_casefold (sort { $a <=> $b }
+                                    keys %$all_folds_ref)
+    {
+        printf "%04X:", $char_with_casefold;
+        my $casefold = $all_folds_ref->{$char_with_casefold};
+
+        # Get folds for $char_with_casefold
+
+        my @full_fold_hex = split / /, $casefold->{'full'};
+        my $full_fold_string =
+                    join "", map {chr(hex($_))} @full_fold_hex;
+        print " full=", join " ", @full_fold_hex;
+        my @turkic_fold_hex =
+                        split / /, ($casefold->{'turkic'} ne "")
+                                        ? $casefold->{'turkic'}
+                                        : $casefold->{'full'};
+        my $turkic_fold_string =
+                        join "", map {chr(hex($_))} @turkic_fold_hex;
+        print "; turkic=", join " ", @turkic_fold_hex;
+        if (defined $casefold && $casefold->{'simple'} ne "") {
+            my $simple_fold_hex = $casefold->{'simple'};
+            my $simple_fold_string = chr(hex($simple_fold_hex));
+            print "; simple=$simple_fold_hex";
+        }
+        print "\n";
+    }
+
+This returns all the case foldings in the current version of Unicode in the
+form of a reference to a hash.  Each key to the hash is the decimal
+code point of a Unicode character that has a casefold to other than
+itself.  The casefold of a semi-colon is itself, so it isn't in the hash;
+likewise for a lowercase "a", but there is an entry for a capital "A".  The
+hash value for each key is another hash, identical to what is returned by
+L</casefold()> if called with that code point as its argument.  So the value
+C<< all_casefolds()->{ord("A")} >> is equivalent to C<casefold(ord("A"))>.
+
+=cut
+
+sub all_casefolds () {    # Returns a reference to a deep copy of %CASEFOLD
+    %CASEFOLD or _casefold();      # build the table on first use
+    return _dclone(\%CASEFOLD);    # callers get their own copy, not the cached hash
+}
+
  =head2 B<casespec()>
  
      use Unicode::UCD 'casespec';
@@ -1161,15 +1307,25 @@ my %CASESPEC;
  
  sub _casespec {
      unless (%CASESPEC) {
-       if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+        UnicodeVersion() unless defined $v_unicode_version;   # sets $v_unicode_version
+        if ($v_unicode_version lt v2.1.8) {   # presumably: releases that predate SpecialCasing.txt
+            %CASESPEC = ();   # leave the table empty (an empty list, not a hash reference)
+        }
+       elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
             local $_;
             local $/ = "\n";
             while (<$CASESPECFH>) {
                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
+
                     my ($hexcode, $lower, $title, $upper, $condition) =
                         ($1, $2, $3, $4, $5);
                     my $code = hex($hexcode);
-                   if (exists $CASESPEC{$code}) {
+
+                    # In 2.1.8, there were duplicate entries; ignore all but
+                    # the first one -- there were no conditions in the file
+                    # anyway.
+                   if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8)
+                    {
                         if (exists $CASESPEC{$code}->{code}) {
                             my ($oldlower,
                                 $oldtitle,
@@ -1222,7 +1378,7 @@ sub casespec {
  
      _casespec() unless %CASESPEC;
  
-    return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
+    return ref $CASESPEC{$code} ? _dclone $CASESPEC{$code} : $CASESPEC{$code};
  }
  
  =head2 B<namedseq()>
@@ -1307,17 +1463,7 @@ sub namedseq {
  my %NUMERIC;
  
  sub _numeric {
-
-    # Unicode 6.0 instituted the rule that only digits in a consecutive
-    # block of 10 would be considered decimal digits.  Before that, the only
-    # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
-    # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
-    # ONE.  The code could be modified to handle that, but not bothering, as
-    # in TUS 6.0, U+19DA was changed to Nt=Di.
-    if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
-       croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
-    }
-    my @numbers = _read_table("unicore/To/Nv.pl");
+    my @numbers = _read_table("To/Nv.pl");
      foreach my $entry (@numbers) {
          my ($start, $end, $value) = @$entry;
  
@@ -1327,10 +1473,17 @@ sub _numeric {
              my $real = $rational[0] / $rational[1];
              $real_to_rational{$real} = $value;
              $value = $real;
-        }
  
-        for my $i ($start .. $end) {
-            $NUMERIC{$i} = $value;
+            # Should only be single element, but just in case...
+            for my $i ($start .. $end) {
+                $NUMERIC{$i} = $value;
+            }
+        }
+        else {
+            # The values require adjusting, as is in 'a' format
+            for my $i ($start .. $end) {
+                $NUMERIC{$i} = $value + $i - $start;
+            }
          }
      }
  
@@ -1420,14 +1573,43 @@ sub num {
      return if $string =~ /\D/;
      my $first_ord = ord(substr($string, 0, 1));
      my $value = $NUMERIC{$first_ord};
+
+    # To be a valid decimal number, it should be in a block of 10 consecutive
+    # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
+    # value is its offset in that block from the character that means zero.
      my $zero_ord = $first_ord - $value;
  
+    # Unicode 6.0 instituted the rule that only digits in a consecutive
+    # block of 10 would be considered decimal digits.  If this is an earlier
+    # release, we verify that this first character is a member of such a
+    # block.  That is, that the block of characters surrounding this one
+    # consists of all \d characters whose numeric values are the expected
+    # ones.
+    UnicodeVersion() unless defined $v_unicode_version;
+    if ($v_unicode_version lt v6.0.0) {
+        for my $i (0 .. 9) {
+            my $ord = $zero_ord + $i;
+            return unless chr($ord) =~ /\d/;
+            my $numeric = $NUMERIC{$ord};
+            return unless defined $numeric;
+            return unless $numeric == $i;
+        }
+    }
+
      for my $i (1 .. $length -1) {
+
+        # Here we know either by verifying, or by fact of the first character
+        # being a \d in Unicode 6.0 or later, that any character between the
+        # character that means 0, and 9 positions above it must be \d, and
+        # must have its value correspond to its offset from the zero.  Any
+        # characters outside these 10 do not form a legal number for this
+        # function.
          my $ord = ord(substr($string, $i, 1));
          my $digit = $ord - $zero_ord;
          return unless $digit >= 0 && $digit <= 9;
          $value = $value * 10 + $digit;
      }
+
      return $value;
  }
  
@@ -1669,7 +1851,7 @@ sub prop_aliases ($) {
      # The full name is in element 1.
      return $list_ref->[1] unless wantarray;
  
-    return @{dclone $list_ref};
+    return @{_dclone $list_ref};
  }
  
  =pod
@@ -1808,7 +1990,7 @@ sub prop_value_aliases ($$) {
          # The full name is in element 1.
          return $list_ref->[1] unless wantarray;
  
-        return @{dclone $list_ref};
+        return @{_dclone $list_ref};
      }
  
      return $list_ref->[0] unless wantarray;
@@ -1835,7 +2017,8 @@ by the input parameter string:
   prints:
   0, 1114112
  
-An empty list is returned if the input is unknown; the number of elements in
+If the input is unknown, C<undef> is returned in scalar context and an empty
+list in list context.  If the input is known, the number of elements in
  the list is returned if called in scalar context.
  
  L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
@@ -1945,8 +2128,12 @@ properties, and will return C<undef> if called with one of those.
  our %loose_defaults;
  our $MAX_UNICODE_CODEPOINT;
  
-sub prop_invlist ($) {
+sub prop_invlist ($;$) {
      my $prop = $_[0];
+
+    # Undocumented way to get at Perl internal properties
+    my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
+
      return if ! defined $prop;
  
      require "utf8_heavy.pl";
@@ -1963,7 +2150,7 @@ sub prop_invlist ($) {
                || ref $swash eq ""
                || $swash->{'BITS'} != 1
                || $swash->{'USER_DEFINED'}
-              || $prop =~ /^\s*_/;
+              || (! $internal_ok && $prop =~ /^\s*_/);
  
      if ($swash->{'EXTRAS'}) {
          carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
@@ -2190,7 +2377,7 @@ The first line (with Index [0]) means that the value for code point 0 is "Basic
  Latin".  The entry "0x0080" in the @blocks_ranges column in the second line
  means that the value from the first line, "Basic Latin", extends to all code
  points in the range from 0 up to but not including 0x0080, that is, through
-255.  In other words, the code points from 0 to 255 are all in the "Basic
+127.  In other words, the code points from 0 to 127 are all in the "Basic
  Latin" block.  Similarly, all code points in the range from 0x0080 up to (but
  not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
  (Notice that the return is the old-style block names; see L</Old-style versus
@@ -2224,19 +2411,19 @@ There are exceptions to the simple scalar maps.  Some properties have some
  elements in their map list that are themselves lists of scalars; and some
  special strings are returned that are not to be interpreted as-is.  Element
  [2] (placed into C<$format> in the example above) of the returned four element
-list tells you if the map has any of these special elements, as follows:
+list tells you if the map has any of these special elements or not, as follows:
  
  =over
  
-=item C<s>
+=item B<C<s>>
  
  means all the elements of the map array are simple scalars, with no special
  elements.  Almost all properties are like this, like the C<block> example
  above.
  
-=item C<sl>
+=item B<C<sl>>
  
-means that some of the map array elements have the form given by C<s>, and
+means that some of the map array elements have the form given by C<"s">, and
  the rest are lists of scalars.  For example, here is a portion of the output
  of calling C<prop_invmap>() with the "Script Extensions" property:
  
@@ -2247,70 +2434,67 @@ of calling C<prop_invmap>() with the "Script Extensions" property:
        0x0966      Devanagari
        0x0970      Common
  
-Here, the code points 0x964 and 0x965 are used in the Bengali,
-Devanagari, Gurmukhi, and Oriya  scripts.
+Here, the code points 0x964 and 0x965 are both used in Bengali,
+Devanagari, Gurmukhi, and Oriya, but no other scripts.
  
-The Name_Alias property is of this form.  But each scalar consists of two
+The Name_Alias property is also of this form.  But each scalar consists of two
  components:  1) the name, and 2) the type of alias this is.  They are
-separated by a colon and a space.  In Unicode 6.0, there are two alias types:
-C<"correction">, which indicates that the name is a corrected form for the
-original name (which remains valid) for the same code point; and C<"control">,
-which adds a new name for a control character.
+separated by a colon and a space.  In Unicode 6.1, there are several alias types:
  
-For example,
+=over
  
- @aliases_ranges  @alias_maps
-    ...
-    0x01A2        LATIN CAPITAL LETTER GHA: correction
-    0x01A3        LATIN SMALL LETTER GHA: correction
+=item C<correction>
  
-Unicode 6.1 will introduce other types, and some map entries will be lists of
-multiple name-alias pairs for a single code point.
+indicates that the name is a corrected form for the
+original name (which remains valid) for the same code point.
  
-=item C<r>
+=item C<control>
  
-means that all the elements of the map array are either rational numbers or
-the string C<"NaN">, meaning "Not a Number".  A rational number is either an
-integer, or two integers separated by a solidus (C<"/">).  The second integer
-represents the denominator of the division implied by the solidus, and is
-guaranteed not to be 0.  If you want to convert them to scalar numbers, you
-can use something like this:
+adds a new name for a control character.
  
- my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
- if ($format && $format eq "r") {
-     map { $_ = eval $_ } @$invmap_ref;
- }
+=item C<alternate>
  
-Here's some entries from the output of the property "Nv", which has format
-C<"r">.
-
- @numerics_ranges  @numerics_maps        Note
-        0x00             "NaN"
-        0x30             0              DIGIT 0
-        0x31             1
-        0x32             2
-        ...
-        0x37             7
-        0x38             8
-        0x39             9              DIGIT 9
-        0x3A             "NaN"
-        0xB2             2              SUPERSCRIPT 2
-        0xB3             3              SUPERSCRIPT 2
-        0xB4             "NaN"
-        0xB9             1              SUPERSCRIPT 1
-        0xBA             "NaN"
-        0xBC             1/4            VULGAR FRACTION 1/4
-        0xBD             1/2            VULGAR FRACTION 1/2
-        0xBE             3/4            VULGAR FRACTION 3/4
-        0xBF             "NaN"
-        0x660            0              ARABIC-INDIC DIGIT ZERO
-
-=item C<c>
-
-is like C<s> in that all the map array elements are scalars, but some of them
-are the special string S<C<"E<lt>code pointE<gt>">>, meaning that the map of
-each code point in the corresponding range in the inversion list is the code
-point itself.  For example, in:
+is an alternate name for a character.
+
+=item C<figment>
+
+is a name for a character that has been documented but was never in any
+actual standard.
+
+=item C<abbreviation>
+
+is a common abbreviation for a character.
+
+=back
+
+The lists are ordered (roughly) so the most preferred names come before less
+preferred ones.
+
+For example,
+
+ @aliases_ranges        @alias_maps
+    ...
+    0x009E        [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
+    0x009F        [ 'APPLICATION PROGRAM COMMAND: control',
+                    'APC: abbreviation'
+                  ]
+    0x00A0        'NBSP: abbreviation'
+    0x00A1        ""
+    0x00AD        'SHY: abbreviation'
+    0x00AE        ""
+    0x01A2        'LATIN CAPITAL LETTER GHA: correction'
+    0x01A3        'LATIN SMALL LETTER GHA: correction'
+    0x01A4        ""
+    ...
+
+A map to the empty string means that there is no alias defined for the code
+point.
+
+=item B<C<a>>
+
+is like C<"s"> in that all the map array elements are scalars, but here they are
+restricted to all being integers, and some have to be adjusted (hence the name
+C<"a">) to get the correct result.  For example, in:
  
   my ($uppers_ranges_ref, $uppers_maps_ref, $format)
                            = prop_invmap("Simple_Uppercase_Mapping");
@@ -2318,28 +2502,33 @@ point itself.  For example, in:
  the returned arrays look like this:
  
   @$uppers_ranges_ref    @$uppers_maps_ref   Note
-       0                 "<code point>"
-      97                     65          'a' maps to 'A'
-      98                     66          'b' => 'B'
-      99                     67          'c' => 'C'
-      ...
-     120                     88          'x' => 'X'
-     121                     89          'y' => 'Y'
-     122                     90          'z' => 'Z'
-     123                "<code point>"
+       0                      0
+      97                     65          'a' maps to 'A', b => B ...
+     123                      0
       181                    924          MICRO SIGN => Greek Cap MU
-     182                "<code point>"
+     182                      0
       ...
  
-The first line means that the uppercase of code point 0 is 0;
-the uppercase of code point 1 is 1; ...  of code point 96 is 96.  Without the
-C<"E<lt>code_pointE<gt>"> notation, every code point would have to have an
-entry.  This would mean that the arrays would each have more than a million
-entries to list just the legal Unicode code points!
+Let's start with the second line.  It says that the uppercase of code point 97
+is 65; or C<uc("a")> == "A".  But the line is for the entire range of code
+points 97 through 122.  To get the mapping for any code point in a range, you
+take the offset it has from the beginning code point of the range, and add
+that to the mapping for that first code point.  So, the mapping for 122 ("z")
+is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
+yielding 90 ("Z").  Likewise for everything in between.
+
+The first line works the same way.  The first map in a range is always the
+correct value for its code point (because the adjustment is 0).  Thus the
+C<uc(chr(0))> is just itself.  C<uc(chr(1))> is likewise itself, as the
+adjusted value is 0+1-0 = 1; and so on, up through C<uc(chr(96))>, which is 96.
+
+Requiring this simple adjustment allows the returned arrays to be
+significantly smaller than otherwise, up to a factor of 10, speeding up
+searching through them.
  
-=item C<cl>
+=item B<C<al>>
  
-means that some of the map array elements have the form given by C<c>, and
+means that some of the map array elements have the form given by C<"a">, and
  the rest are ordered lists of code points.
  For example, in:
  
@@ -2349,43 +2538,113 @@ For example, in:
  the returned arrays look like this:
  
   @$uppers_ranges_ref    @$uppers_maps_ref
-       0                 "<code point>"
+       0                      0
        97                     65
-     ...
-     122                     90
-     123                "<code point>"
+     123                      0
       181                    924
-     182                "<code point>"
+     182                      0
       ...
      0x0149              [ 0x02BC 0x004E ]
-    0x014A              "<code point>"
-    0x014B                 0x014A
+    0x014A                    0
+    0x014B                  330
       ...
  
  This is the full Uppercase_Mapping property (as opposed to the
-Simple_Uppercase_Mapping given in the example for format C<"c">).  The only
+Simple_Uppercase_Mapping given in the example for format C<"a">).  The only
  difference between the two in the ranges shown is that the code point at
  0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
  characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
  CAPITAL LETTER N).
  
-=item C<cle>
+No adjustments are needed to entries that are references to arrays; each such
+entry will have exactly one element in its range, so the offset is always 0.
+
+=item B<C<ae>>
+
+This is like C<"a">, but some elements are the empty string, and should not be
+adjusted.
+The one internal Perl property accessible by C<prop_invmap> is of this type:
+"Perl_Decimal_Digit" returns an inversion map which gives the numeric values
+that are represented by the Unicode decimal digit characters.  Characters that
+don't represent decimal digits map to the empty string, like so:
+
+ @digits    @values
+ 0x0000       ""
+ 0x0030        0
+ 0x003A       ""
+ 0x0660        0
+ 0x066A       ""
+ 0x06F0        0
+ 0x06FA       ""
+ 0x07C0        0
+ 0x07CA       ""
+ 0x0966        0
+ ...
+
+This means that the code points from 0 to 0x2F do not represent decimal digits;
+the code point 0x30 (DIGIT ZERO) represents 0;  code point 0x31, (DIGIT ONE),
+represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
+... code points 0x3A through 0x65F do not represent decimal digits; 0x660
+(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
+represents 0+1-0 = 1 ...
  
-means that some of the map array elements have the forms given by C<cl>, and
+=item B<C<ale>>
+
+is a combination of the C<"al"> type and the C<"ae"> type.  Some of
+the map array elements have the forms given by C<"al">, and
  the rest are the empty string.  The property C<NFKC_Casefold> has this form.
  An example slice is:
  
   @$ranges_ref  @$maps_ref         Note
      ...
-   0x00AA     0x0061              FEMININE ORDINAL INDICATOR => 'a'
-   0x00AB     <code point>
+   0x00AA       97                FEMININE ORDINAL INDICATOR => 'a'
+   0x00AB        0
     0x00AD                         SOFT HYPHEN => ""
-   0x00AE     <code point>
+   0x00AE        0
     0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
-   0x00B0     <code point>
+   0x00B0        0
     ...
  
-=item C<n>
+=item B<C<ar>>
+
+means that all the elements of the map array are either rational numbers or
+the string C<"NaN">, meaning "Not a Number".  A rational number is either an
+integer, or two integers separated by a solidus (C<"/">).  The second integer
+represents the denominator of the division implied by the solidus, and is
+actually always positive, so it is guaranteed not to be 0 and to not be
+signed.  When the element is a plain integer (without the
+solidus), it may need to be adjusted to get the correct value by adding the
+offset, just as other C<"a"> properties.  No adjustment is needed for
+fractions, as the range is guaranteed to have just a single element, and so
+the offset is always 0.
+
+If you want to convert the returned map to entirely scalar numbers, you
+can use something like this:
+
+ my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
+ if ($format && $format eq "ar") {
+     map { $_ = eval $_ if $_ ne 'NaN' } @$invmap_ref;
+ }
+
+Here's some entries from the output of the property "Nv", which has format
+C<"ar">.
+
+ @numerics_ranges  @numerics_maps       Note
+        0x00           "NaN"
+        0x30             0           DIGIT 0 .. DIGIT 9
+        0x3A           "NaN"
+        0xB2             2           SUPERSCRIPTs 2 and 3
+        0xB4           "NaN"
+        0xB9             1           SUPERSCRIPT 1
+        0xBA           "NaN"
+        0xBC            1/4          VULGAR FRACTION 1/4
+        0xBD            1/2          VULGAR FRACTION 1/2
+        0xBE            3/4          VULGAR FRACTION 3/4
+        0xBF           "NaN"
+        0x660            0           ARABIC-INDIC DIGIT ZERO .. NINE
+        0x66A          "NaN"
+
+=item B<C<n>>
  
  means the Name property.  All the elements of the map array are simple
  scalars, but some of them contain special strings that require more work to
@@ -2397,8 +2656,8 @@ Entries such as:
  
  mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
  with the code point (expressed in hexadecimal) appended to it, like "CJK
-UNIFIED IDEOGRAPH-3403" (similarly for C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
-pointE<gt>>).
+UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
+pointE<gt>>>).
  
  Also, entries like
  
@@ -2409,25 +2668,39 @@ the function L<charnames/charnames::viacode(code)>.
  
  Note that for control characters (C<Gc=cc>), Unicode's data files have the
  string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
-string.  This function returns that real name, the empty string.
+string.  This function returns that real name, the empty string.  (There are
+names for these characters, but they are considered aliases, not the Name
+property name, and are contained in the C<Name_Alias> property.)
  
-=item C<d>
+=item B<C<ad>>
  
-means the Decomposition_Mapping property.  This property is like C<cl>
-properties, except it has an additional entry type:
+means the Decomposition_Mapping property.  This property is like C<"al">
+properties, except that one of the scalar elements is of the form:
  
   <hangul syllable>
  
-for those code points whose decomposition is algorithmically calculated.  (The
-C<n> format has this same entry.)  These can be generated via the function
+This signifies that this entry should be replaced by the decompositions for
+all the code points whose decomposition is algorithmically calculated.  (All
+of them are currently in one range and no others outside the range are likely
+to ever be added to Unicode; the C<"n"> format
+has this same entry.)  These can be generated via the function
  L<Unicode::Normalize::NFD()|Unicode::Normalize>.
  
-
  Note that the mapping is the one that is specified in the Unicode data files,
  and to get the final decomposition, it may need to be applied recursively.
  
  =back
  
+Note that a format begins with the letter "a" if and only if the property it is
+for requires adjustments by adding the offsets in multi-element ranges.  For
+all these properties, an entry should be adjusted only if the map is a scalar
+which is an integer.  That is, it must match the regular expression:
+
+    / ^ -? \d+ $ /xa
+
+Further, the first element in a range never needs adjustment, as the
+adjustment would be just adding 0.
+
  A binary search can be used to quickly find a code point in the inversion
  list, and hence its corresponding mapping.
  
@@ -2441,10 +2714,14 @@ potentially make your data structure much smaller.  As you construct your data
  structure from the one returned by this function, simply ignore those ranges
  that map to this value, generally called the "default" value.  For example, to
  convert to the data structure searchable by L</charinrange()>, you can follow
-this recipe:
+this recipe for properties that don't require adjustments:
  
   my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
   my @range_list;
+
+ # Look at each element in the list, but the -2 is needed because we
+ # look at $i+1 in the loop, and the final element is guaranteed to map
+ # to $missing by prop_invmap(), so we would skip it anyway.
   for my $i (0 .. @$list_ref - 2) {
      next if $map_ref->[$i] eq $missing;
      push @range_list, [ $list_ref->[$i],
@@ -2455,32 +2732,39 @@ this recipe:
  
   print charinrange(\@range_list, $code_point), "\n";
  
-
  With this, C<charinrange()> will return C<undef> if its input code point maps
  to C<$missing>.  You can avoid this by omitting the C<next> statement, and adding
  a line after the loop to handle the final element of the inversion map.
  
-One internal Perl property is accessible by this function.
-"Perl_Decimal_Digit" returns an inversion map in which all the Unicode decimal
-digits map to their numeric values, and everything else to the empty string,
-like so:
+Similarly, this recipe can be used for properties that do require adjustments:
  
- @digits    @values
- 0x0000       ""
- 0x0030       0
- 0x0031       1
- 0x0032       2
- 0x0033       3
- 0x0034       4
- 0x0035       5
- 0x0036       6
- 0x0037       7
- 0x0038       8
- 0x0039       9
- 0x003A       ""
- 0x0660       0
- 0x0661       1
- ...
+ for my $i (0 .. @$list_ref - 2) {
+    next if $map_ref->[$i] eq $missing;
+
+    # prop_invmap() guarantees that if the mapping is to an array, the
+    # range has just one element, so no need to worry about adjustments.
+    if (ref $map_ref->[$i]) {
+        push @range_list,
+                   [ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ];
+    }
+    else {  # Otherwise each element is actually mapped to a separate
+            # value, so the range has to be split into single code point
+            # ranges.
+
+        my $adjustment = 0;
+
+        # For each code point that gets mapped to something...
+        for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) {
+
+            # ... add a range consisting of just it mapping to the
+            # original plus the adjustment, which is incremented for the
+            # next time through the loop, as the offset increases by 1
+            # for each element in the range
+            push @range_list,
+                             [ $j, $j, $map_ref->[$i] + $adjustment++ ];
+        }
+    }
+ }
  
  Note that the inversion maps returned for the C<Case_Folding> and
  C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
@@ -2530,23 +2814,24 @@ sub prop_invmap ($) {
  
  RETRY:
  
+    # If there are multiple entries for a single code point
+    my $has_multiples = 0;
+
      # Try to get the map swash for the property.  They have 'To' prepended to
      # the property name, and 32 means we will accept 32 bit return values.
+    # The 0 means we aren't calling this from tr///.
      my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
  
-    # If there are multiple entries for a single code point;
-    my $has_multiples = 0;
-
      # If didn't find it, could be because needs a proxy.  And if was the
      # 'Block' or 'Name' property, use a proxy even if did find it.  Finding it
-    # would be the result of the installation changing mktables to output the
-    # Block or Name tables.  The Block table gives block names in the
-    # new-style, and this routine is supposed to return old-style block names.
-    # The Name table is valid, but we need to execute the special code below
-    # to add in the algorithmic-defined name entries.
+    # in these cases would be the result of the installation changing mktables
+    # to output the Block or Name tables.  The Block table gives block names
+    # in the new-style, and this routine is supposed to return old-style block
+    # names.  The Name table is valid, but we need to execute the special code
+    # below to add in the algorithmic-defined name entries.
+    # And NFKCCF needs conversion, so handle that here too.
      if (ref $swash eq ""
-        || $swash->{'TYPE'} eq 'ToBlk'
-        || $swash->{'TYPE'} eq 'ToNa')
+        || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
      {
  
          # Get the short name of the input property, in standard form
@@ -2560,51 +2845,20 @@ RETRY:
              $prop = "age";
              goto RETRY;
          }
-        elsif ($second_try eq 'scf') {
+        elsif ($second_try =~ / ^ s ( cf | fc | [ltu] c ) $ /x) {
  
-            # This property uses just the LIST part of cf which includes the
-            # simple folds that are otherwise overridden by the SPECIALS.  So
-            # all we need do is to not look at the SPECIALS; set $overrides to
-            # indicate that
+            # These properties use just the LIST part of the full mapping,
+            # which includes the simple maps that are otherwise overridden by
+            # the SPECIALS.  So all we need do is to not look at the SPECIALS;
+            # set $overrides to indicate that
              $overrides = -1;
-            $prop = "cf";
-            goto RETRY;
-        }
-        elsif ($second_try =~ / ^ s[ltu]c $ /x) {
-
-            # Because some applications may be reading the full mapping
-            # equivalent files directly, they haven't been changed to include
-            # the simple mappings as well, as was done with the cf file (which
-            # doesn't have those backward compatibility issues) in 5.14.
-            # Instead, separate internal-only files were created that
-            # contain just the simple mappings that get overridden by the
-            # SPECIALS.  Thus, these simple case mappings use the LIST part of
-            # their full mapping equivalents; plus the ones that are in those
-            # additional files.  These special files are used by other
-            # functions in this module, so use the same hashes that those
-            # functions use.
-            my $file;
-            if ($second_try eq "suc") {
-                $file = '_suc.pl';
-                $overrides = \%SIMPLE_UPPER;
-            }
-            elsif ($second_try eq "slc") {
-                $file = '_slc.pl';
-                $overrides = \%SIMPLE_LOWER;
-            }
-            else {
-                $file = '_stc.pl';
-                $overrides = \%SIMPLE_TITLE;
-            }
  
-            # The files are already handled by the _read_table() function.
-            # Don't read them in if already done.
-            %$overrides =_read_table("unicore/To/$file", 'use_hash')
-                                                            unless %$overrides;
+            # The full name is the simple name stripped of its initial 's'
+            $prop = $1;
+
+            # .. except for this case
+            $prop = 'cf' if $prop eq 'fc';
  
-            # Convert to the full mapping name, and go handle that; e.g.,
-            # suc => uc.
-            $prop = $second_try =~ s/^s//r;
              goto RETRY;
          }
          elsif ($second_try eq "blk") {
@@ -2658,17 +2912,12 @@ RETRY:
                  map { s/:.*// } @{$aliases{$code_point}};
              }
  
-            # We hold off on adding the next entry to the list until we know,
-            # that the next line isn't for the same code point.  We only
-            # output the final line.  That one is the original Name property
-            # value.  The others are the Name_Alias corrections, which are
-            # listed first in the file.
              my $i = 0;
              foreach my $line (split "\n", $original) {
                  my ($hex_code_point, $name) = split "\t", $line;
  
                  # Weeds out all comments, blank lines, and named sequences
-                next if $hex_code_point =~ /\P{ASCII_HEX_DIGIT}/;
+                next if $hex_code_point =~ /[^[:xdigit:]]/a;
  
                  my $code_point = hex $hex_code_point;
  
@@ -2721,25 +2970,21 @@ RETRY:
                  $decomps{'TYPE'} = "ToDt";
                  $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
                  $utf8::SwashInfo{'ToDt'}{'format'} = "s";
-            }
-            else {
-                $decomps{'TYPE'} = "ToDm";
-                $utf8::SwashInfo{'ToDm'}{'missing'} = "<code point>";
-
-                # Use a special internal-to-this_routine format, 'dm', to
-                # distinguish from 'd', meaning decimal.
-                $utf8::SwashInfo{'ToDm'}{'format'} = "dm";
-            }
+            }   # 'dm' is handled below, with 'nfkccf'
  
              $decomps{'LIST'} = "";
  
              # This property has one special range not in the file: for the
-            # hangul syllables
-            my $done_hangul = 0;    # Have we done the hangul range.
+            # hangul syllables.  But not in Unicode version 1.
+            UnicodeVersion() unless defined $v_unicode_version;
+            my $done_hangul = ($v_unicode_version lt v2.0.0)
+                              ? 1
+                              : 0;    # Have we done the hangul range ?
              foreach my $line (split "\n", $original) {
                  my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
                  my $code_point = hex $hex_lower;
                  my $value;
+                my $redo = 0;
  
                  # The type, enclosed in <...>, precedes the mapping separated
                  # by blanks
@@ -2764,14 +3009,163 @@ RETRY:
                                          : "<hangul syllable>";
                  }
  
+                if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) {
+                    $line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value);
+                    $hex_upper = "";
+                    $redo = 1;
+                }
+
                  # And append this to our constructed LIST.
                  $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
+
+                redo if $redo;
              }
              $swash = \%decomps;
          }
-        else {  # Don't know this property. Fail.
+        elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
              return;
          }
+
+        if ($second_try eq 'nfkccf' || $second_try eq 'dm') {
+
+            # The 'nfkccf' property is stored in the old format for backwards
+            # compatibility for any applications that have read its file
+            # directly before prop_invmap() existed.
+            # And the code above has extracted the 'dm' property from its file
+            # yielding the same format.  So here we convert them to adjusted
+            # format for compatibility with the other properties similar to
+            # them.
+            my %revised_swash;
+
+            # We construct a new converted list.
+            my $list = "";
+
+            my @ranges = split "\n", $swash->{'LIST'};
+            for (my $i = 0; $i < @ranges; $i++) {
+                my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
+
+                # The dm property has maps that are space separated sequences
+                # of code points, as well as the special entry "<hangul
+                # syllable>", which also contains a blank.
+                my @map = split " ", $map;
+                if (@map > 1) {
+
+                    # If it's just the special entry, append as-is.
+                    if ($map eq '<hangul syllable>') {
+                        $list .= "$ranges[$i]\n";
+                    }
+                    else {
+
+                        # These should all be single-element ranges.
+                        croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin;
+
+                        # Convert them to decimal, as that's what's expected.
+                        $list .= "$hex_begin\t\t"
+                            . join(" ", map { hex } @map)
+                            . "\n";
+                    }
+                    next;
+                }
+
+                # Here, the mapping doesn't have a blank, is for a single code
+                # point.
+                my $begin = hex $hex_begin;
+                my $end = (defined $hex_end && $hex_end ne "")
+                        ? hex $hex_end
+                        : $begin;
+
+                # Again, the output is to be in decimal.
+                my $decimal_map = hex $map;
+
+                # We know that multi-element ranges with the same mapping
+                # should not be adjusted, as after the adjustment
+                # multi-element ranges are for consecutive increasing code
+                # points.  Further, the final element in the list won't be
+                # adjusted, as there is nothing after it to include in the
+                # adjustment
+                if ($begin != $end || $i == @ranges -1) {
+
+                    # So just convert these to single-element ranges
+                    foreach my $code_point ($begin .. $end) {
+                        $list .= sprintf("%04X\t\t%d\n",
+                                        $code_point, $decimal_map);
+                    }
+                }
+                else {
+
+                    # Here, we have a candidate for adjusting.  What we do is
+                    # look through the subsequent adjacent elements in the
+                    # input.  If the map to the next one differs by 1 from the
+                    # one before, then we combine into a larger range with the
+                    # initial map.  Loop doing this until we find one that
+                    # can't be combined.
+
+                    my $offset = 0;     # How far away are we from the initial
+                                        # map
+                    my $squished = 0;   # ? Did we squish at least two
+                                        # elements together into one range
+                    for ( ; $i < @ranges; $i++) {
+                        my ($next_hex_begin, $next_hex_end, $next_map)
+                                                = split "\t", $ranges[$i+1];
+
+                        # In the case of 'dm', the map may be a sequence of
+                        # multiple code points, which are never combined with
+                        # another range
+                        last if $next_map =~ / /;
+
+                        $offset++;
+                        my $next_decimal_map = hex $next_map;
+
+                        # If the next map is not next in sequence, it
+                        # shouldn't be combined.
+                        last if $next_decimal_map != $decimal_map + $offset;
+
+                        my $next_begin = hex $next_hex_begin;
+
+                        # Likewise, if the next element isn't adjacent to the
+                        # previous one, it shouldn't be combined.
+                        last if $next_begin != $begin + $offset;
+
+                        my $next_end = (defined $next_hex_end
+                                        && $next_hex_end ne "")
+                                            ? hex $next_hex_end
+                                            : $next_begin;
+
+                        # And finally, if the next element is a multi-element
+                        # range, it shouldn't be combined.
+                        last if $next_end != $next_begin;
+
+                        # Here, we will combine.  Loop to see if we should
+                        # combine the next element too.
+                        $squished = 1;
+                    }
+
+                    if ($squished) {
+
+                        # Here, 'i' is the element number of the last element to
+                        # be combined, and the range is single-element, or we
+                        # wouldn't be combining.  Get its code point.
+                        my ($hex_end, undef, undef) = split "\t", $ranges[$i];
+                        $list .= "$hex_begin\t$hex_end\t$decimal_map\n";
+                    } else {
+
+                        # Here, no combining done.  Just append the initial
+                        # (and current) values.
+                        $list .= "$hex_begin\t\t$decimal_map\n";
+                    }
+                }
+            } # End of loop constructing the converted list
+
+            # Finish up the data structure for our converted swash
+            my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
+            $revised_swash{'LIST'} = $list;
+            $revised_swash{'TYPE'} = $type;
+            $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
+            $swash = \%revised_swash;
+
+            $utf8::SwashInfo{$type}{'missing'} = 0;
+            $utf8::SwashInfo{$type}{'format'} = 'a';
+        }
      }
  
      if ($swash->{'EXTRAS'}) {
@@ -2780,7 +3174,7 @@ RETRY:
      }
  
      # Here, have a valid swash return.  Examine it.
-    my $returned_prop = $swash->{TYPE};
+    my $returned_prop = $swash->{'TYPE'};
  
      # All properties but binary ones should have 'missing' and 'format'
      # entries
@@ -2790,6 +3184,8 @@ RETRY:
      $format = $utf8::SwashInfo{$returned_prop}{'format'};
      $format = 'b' unless defined $format;
  
+    my $requires_adjustment = $format =~ /^a/;
+
      # The LIST input lines look like:
      # ...
      # 0374\t\tCommon
@@ -2829,7 +3225,7 @@ RETRY:
          #
          # Thus, things are set up for the typical case of a new non-adjacent
          # range of non-missings to be added.  But, if the new range is
-        # adjacent, it needs to replace the [-1] elements; and if the new
+        # adjacent, it needs to replace the [-1] element; and if the new
          # range is a multiple value of the previous one, it needs to be added
          # to the [-2] map element.
  
@@ -2855,7 +3251,7 @@ RETRY:
              # This is all we need do for this iteration.
  
              if ($end != $begin) {
-                croak __PACKAGE__, "Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
+                croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
              }
              if (! ref $invmap[-2]) {
                  $invmap[-2] = [ $invmap[-2], $map ];
@@ -2870,10 +3266,12 @@ RETRY:
  
              # If the input isn't in the most compact form, so that there are
              # two adjacent ranges that map to the same thing, they should be
-            # combined.  This happens in our constructed dt mapping, as
-            # Element [-2] is the map for the latest range so far processed.
-            # Just set the beginning point of the map to $missing (in
-            # invlist[-1]) to 1 beyond where this range ends.  For example, in
+            # combined (EXCEPT where the arrays require adjustments, in which
+            # case everything is already set up correctly).  This happens in
+            # our constructed dt mapping, as Element [-2] is the map for the
+            # latest range so far processed.  Just set the beginning point of
+            # the map to $missing (in invlist[-1]) to 1 beyond where this
+            # range ends.  For example, in
              # 12\t13\tXYZ
              # 14\t17\tXYZ
              # we have set it up so that it looks like
@@ -2883,7 +3281,7 @@ RETRY:
              # We now see that it should be
              # 12 => XYZ
              # 18 => $missing
-            if (@invlist > 1 && ( (defined $map)
+            if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
                                    ? $invmap[-2] eq $map
                                    : $invmap[-2] eq 'Y'))
              {
@@ -2901,7 +3299,7 @@ RETRY:
  
          # Add the range beginning, and the range's map.
          push @invlist, $begin;
-        if ($format eq 'dm') {
+        if ($returned_prop eq 'ToDm') {
  
              # The decomposition maps are either a line like <hangul syllable>
              # which are to be taken as is; or a sequence of code points in hex
@@ -2911,7 +3309,7 @@ RETRY:
                  push @invmap, $map;
              }
              else {
-                my @map = map { hex } split " ", $map;
+                my @map = split " ", $map;
                  if (@map == 1) {
                      push @invmap, $map[0];
                  }
@@ -2951,7 +3349,7 @@ RETRY:
          push @invmap, $missing;
      }
  
-    # And add in standard element that all non-Unicode code points map to
+    # And add in standard element that all non-Unicode code points map to:
      # $missing
      push @invlist, $MAX_UNICODE_CODEPOINT + 1;
      push @invmap, $missing;
@@ -2964,16 +3362,16 @@ RETRY:
      if ($overrides) {
  
          # A negative $overrides implies that the SPECIALS should be ignored,
-        # and a simple 'c' list is the value.
+        # and a simple 'a' list is the value.
          if ($overrides < 0) {
-            $format = 'c';
+            $format = 'a';
          }
          else {
  
              # Currently, all overrides are for properties that normally map to
              # single code points, but now some will map to lists of code
              # points (but there is an exception case handled below).
-            $format = 'cl';
+            $format = 'al';
  
              # Look through the overrides.
              foreach my $cp_maybe_utf8 (keys %$overrides) {
@@ -2988,22 +3386,22 @@ RETRY:
  
                      # The empty string will show up unpacked as an empty
                      # array.
-                    $format = 'cle' if @map == 0;
+                    $format = 'ale' if @map == 0;
                  }
                  else {
  
                      # But if we generated the overrides, we didn't bother to
                      # pack them, and we, so far, do this only for properties
-                    # that are 'c' ones.
+                    # that are 'a' ones.
                      $cp = $cp_maybe_utf8;
                      @map = hex $overrides->{$cp};
-                    $format = 'c';
+                    $format = 'a';
                  }
  
                  # Find the range that the override applies to.
                  my $i = _search_invlist(\@invlist, $cp);
                  if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
-                    croak __PACKAGE__, "wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
+                    croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
                  }
  
                  # And what that range currently maps to
@@ -3072,11 +3470,12 @@ RETRY:
      }
      elsif ($format eq 'x') {
  
-        # All hex-valued properties are really to code points
-        $format = 'c';
+        # All hex-valued properties are really to code points, and have been
+        # converted to decimal.
+        $format = 's';
      }
-    elsif ($format eq 'dm') {
-        $format = 'd';
+    elsif ($returned_prop eq 'ToDm') {
+        $format = 'ad';
      }
      elsif ($format eq 'sw') { # blank-separated elements to form a list.
          map { $_ = [ split " ", $_  ] if $_ =~ / / } @invmap;
@@ -3088,13 +3487,22 @@ RETRY:
          # could
          $format = 'sl';
      }
-    elsif ($format ne 'n' && $format ne 'r') {
+    elsif ($returned_prop eq 'ToPerlDecimalDigit') {
+        $format = 'ae';
+    }
+    elsif ($returned_prop eq 'ToNv') {
+
+        # The one property that has this format is stored as a delta, so the
+        # format needs to indicate that the code point must be added to it.
+        $format = 'ar';
+    }
+    elsif ($format ne 'n' && $format ne 'a') {
  
          # All others are simple scalars
          $format = 's';
      }
      if ($has_multiples &&  $format !~ /l/) {
-       croak __PACKAGE__, "Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
+       croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
      }
  
      return (\@invlist, \@invmap, $format, $missing);
@@ -3119,6 +3527,7 @@ sub UnicodeVersion {
         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
      }
+    $v_unicode_version = pack "C*", split /\./, $UNICODEVERSION;
      return $UNICODEVERSION;
  }
  
@@ -3179,6 +3588,9 @@ To convert from new-style to old-style, follow this recipe:
  gets the lower end of the range (0th element) and then looks up the old name
  for its block using C<charblock>).
  
+Note that starting in Unicode 6.1, many of the block names have shorter
+synonyms.  These are always given in the new style.
+
  =head1 BUGS
  
  Does not yet support EBCDIC platforms.