mktables: Generate native code-point tables

[perl5.git] / lib / Unicode / UCD.pm
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index 343191d..f1b00a4 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -4,11 +4,8 @@ use strict;
  use warnings;
  no warnings 'surrogate';    # surrogates can be inputs to this
  use charnames ();
-use Unicode::Normalize qw(getCombinClass NFD);
  
-our $VERSION = '0.41';
-
-use Storable qw(dclone);
+our $VERSION = '0.53';
  
  require Exporter;
  
@@ -20,13 +17,14 @@ our @EXPORT_OK = qw(charinfo
                     charinrange
                     general_categories bidi_types
                     compexcl
-                   casefold casespec
+                   casefold all_casefolds casespec
                     namedseq
                      num
                      prop_aliases
                      prop_value_aliases
                      prop_invlist
                      prop_invmap
+                    search_invlist
                      MAX_CP
                  );
  
@@ -44,6 +42,9 @@ Unicode::UCD - Unicode character database
      use Unicode::UCD 'casefold';
      my $casefold = casefold(0xFB00);
  
+    use Unicode::UCD 'all_casefolds';
+    my $all_casefolds_ref = all_casefolds();
+
      use Unicode::UCD 'casespec';
      my $casespec = casespec(0xFB00);
  
@@ -80,6 +81,9 @@ Unicode::UCD - Unicode character database
      my ($list_ref, $map_ref, $format, $missing)
                                        = prop_invmap("General Category");
  
+    use Unicode::UCD 'search_invlist';
+    my $index = search_invlist(\@invlist, $code_point);
+
      use Unicode::UCD 'compexcl';
      my $compexcl = compexcl($codepoint);
  
@@ -104,8 +108,16 @@ a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
  followed by hexadecimals designating a Unicode code point.  In other words, if
  you want a code point to be interpreted as a hexadecimal number, you must
  prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
-interpreted as a decimal code point.  Note that the largest code point in
-Unicode is U+10FFFF.
+interpreted as a decimal code point.
+
+Examples:
+
+    223     # Decimal 223
+    0223    # Hexadecimal 223 (= 547 decimal)
+    0xDF    # Hexadecimal DF (= 223 decimal)
+    U+DF    # Hexadecimal DF
+
+Note that the largest code point in Unicode is U+10FFFF.
  
  =cut
  
@@ -114,6 +126,7 @@ my $VERSIONFH;
  my $CASEFOLDFH;
  my $CASESPECFH;
  my $NAMEDSEQFH;
+my $v_unicode_version;  # v-string.
  
  sub openunicode {
      my ($rfh, @path) = @_;
@@ -132,6 +145,35 @@ sub openunicode {
      return $f;
  }
  
+sub _dclone ($) {   # Use Storable::dclone if available; otherwise emulate it.
+
+    use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);
+
+    return dclone(shift) if defined &dclone;
+
+    my $arg = shift;
+    my $type = ref $arg;
+    return $arg unless $type;   # No deep cloning needed for scalars
+
+    if ($type eq 'ARRAY') {
+        my @return;
+        foreach my $element (@$arg) {
+            push @return, &_dclone($element);
+        }
+        return \@return;
+    }
+    elsif ($type eq 'HASH') {
+        my %return;
+        foreach my $key (keys %$arg) {
+            $return{$key} = &_dclone($arg->{$key});
+        }
+        return \%return;
+    }
+    else {
+        croak "_dclone can't handle " . $type;
+    }
+}
+
  =head2 B<charinfo()>
  
      use Unicode::UCD 'charinfo';
@@ -303,6 +345,7 @@ my %SIMPLE_LOWER;
  my %SIMPLE_TITLE;
  my %SIMPLE_UPPER;
  my %UNICODE_1_NAMES;
+my %ISO_COMMENT;
  
  sub charinfo {
  
@@ -315,6 +358,9 @@ sub charinfo {
  
      use feature 'unicode_strings';
  
+    # Will fail if called under minitest
+    use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD);
+
      my $arg  = shift;
      my $code = _getcode($arg);
      croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
@@ -325,7 +371,7 @@ sub charinfo {
      my %prop;
      my $char = chr($code);
  
-    @CATEGORIES =_read_table("unicore/To/Gc.pl") unless @CATEGORIES;
+    @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
      $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
                          // $utf8::SwashInfo{'ToGc'}{'missing'};
  
@@ -337,7 +383,7 @@ sub charinfo {
  
      $prop{'combining'} = getCombinClass($code);
  
-    @BIDIS =_read_table("unicore/To/Bc.pl") unless @BIDIS;
+    @BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
      $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
                      // $utf8::SwashInfo{'ToBc'}{'missing'};
  
@@ -353,14 +399,15 @@ sub charinfo {
      # Having no decomposition implies an empty field; otherwise, all but
      # "Canonical" imply a compatible decomposition, and the type is prefixed
      # to that, as it is in UnicodeData.txt
-    if ($char =~ /\p{Block=Hangul_Syllables}/) {
+    UnicodeVersion() unless defined $v_unicode_version;
+    if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) {
          # The code points of the decomposition are output in standard Unicode
          # hex format, separated by blanks.
          $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
                                             unpack "U*", NFD($char);
      }
      else {
-        @DECOMPOSITIONS = _read_table("unicore/Decomposition.pl")
+        @DECOMPOSITIONS = _read_table("Decomposition.pl")
                            unless @DECOMPOSITIONS;
          $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
                                                                  $code) // "";
@@ -382,8 +429,7 @@ sub charinfo {
              # e.g., TAMIL NUMBER TEN.
              $prop{'decimal'} = "";
  
-            @NUMERIC_TYPES =_read_table("unicore/To/Nt.pl")
-                                unless @NUMERIC_TYPES;
+            @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
              if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
                  eq 'Digit')
              {
@@ -398,29 +444,33 @@ sub charinfo {
  
      $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
  
-    %UNICODE_1_NAMES =_read_table("unicore/To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
+    %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
      $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
  
-    # This is true starting in 6.0, but, num() also requires 6.0, so
-    # don't need to test for version again here.
-    $prop{'comment'} = "";
+    UnicodeVersion() unless defined $v_unicode_version;
+    if ($v_unicode_version ge v6.0.0) {
+        $prop{'comment'} = "";
+    }
+    else {
+        %ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT;
+        $prop{'comment'} = (defined $ISO_COMMENT{$code})
+                           ? $ISO_COMMENT{$code}
+                           : "";
+    }
  
-    %SIMPLE_UPPER = _read_table("unicore/To/Uc.pl", "use_hash")
-                                                           unless %SIMPLE_UPPER;
+    %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
      $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
-                     ? sprintf("%04X", $SIMPLE_UPPER{$code} + $code)
+                     ? sprintf("%04X", $SIMPLE_UPPER{$code})
                       : "";
  
-    %SIMPLE_LOWER = _read_table("unicore/To/Lc.pl", "use_hash")
-                                                           unless %SIMPLE_LOWER;
+    %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
      $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
-                     ? sprintf("%04X", $SIMPLE_LOWER{$code} + $code)
+                     ? sprintf("%04X", $SIMPLE_LOWER{$code})
                       : "";
  
-    %SIMPLE_TITLE = _read_table("unicore/To/Tc.pl", "use_hash")
-                                                           unless %SIMPLE_TITLE;
+    %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
      $prop{'title'} = (defined $SIMPLE_TITLE{$code})
-                     ? sprintf("%04X", $SIMPLE_TITLE{$code} + $code)
+                     ? sprintf("%04X", $SIMPLE_TITLE{$code})
                       : "";
  
      $prop{block}  = charblock($code);
@@ -479,8 +529,17 @@ sub _read_table ($;$) {
      my @return;
      my %return;
      local $_;
+    my $list = do "unicore/$table";
  
-    for (split /^/m, do $table) {
+    # Look up if this property requires adjustments, which we do below if it
+    # does.
+    require "unicore/Heavy.pl";
+    my $property = $table =~ s/\.pl//r;
+    $property = $utf8::file_to_swash_name{$property};
+    my $to_adjust = defined $property
+                    && $utf8::SwashInfo{$property}{'format'} eq 'a';
+
+    for (split /^/m, $list) {
          my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
                                          \s* ( \# .* )?  # Optional comment
                                          $ /x;
@@ -488,11 +547,14 @@ sub _read_table ($;$) {
          my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
          if ($return_hash) {
              foreach my $i ($decimal_start .. $decimal_end) {
-                $return{$i} = $value;
+                $return{$i} = ($to_adjust)
+                              ? $value + $i - $decimal_start
+                              : $value;
              }
          }
-        elsif (@return &&
-               $return[-1][1] == $decimal_start - 1
+        elsif (! $to_adjust
+               && @return
+               && $return[-1][1] == $decimal_start - 1
                 && $return[-1][2] eq $value)
          {
              # If this is merely extending the previous range, do just that.
@@ -528,7 +590,8 @@ With a L</code point argument> charblock() returns the I<block> the code point
  belongs to, e.g.  C<Basic Latin>.  The old-style block name is returned (see
  L</Old-style versus new-style block names>).
  If the code point is unassigned, this returns the block it would belong to if
-it were assigned.
+it were assigned.  (If the Unicode version being used is so early as to not
+have blocks, all code points are considered to be in C<No_Block>.)
  
  See also L</Blocks versus Scripts>.
  
@@ -554,7 +617,13 @@ sub _charblocks {
      # Can't read from the mktables table because it loses the hyphens in the
      # original.
      unless (@BLOCKS) {
-       if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+        UnicodeVersion() unless defined $v_unicode_version;
+        if ($v_unicode_version lt v2.0.0) {
+            my $subrange = [ 0, 0x10FFFF, 'No_Block' ];
+            push @BLOCKS, $subrange;
+            push @{$BLOCKS{'No_Block'}}, $subrange;
+        }
+        elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
             local $_;
             local $/ = "\n";
             while (<$BLOCKSFH>) {
@@ -583,7 +652,7 @@ sub charblock {
          return 'No_Block';
      }
      elsif (exists $BLOCKS{$arg}) {
-        return dclone $BLOCKS{$arg};
+        return _dclone $BLOCKS{$arg};
      }
  }
  
@@ -599,7 +668,8 @@ sub charblock {
  
  With a L</code point argument> charscript() returns the I<script> the
  code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
-If the code point is unassigned, it returns C<"Unknown">.
+If the code point is unassigned or the Unicode version being used is so early
+that it doesn't have scripts, this function returns C<"Unknown">.
  
  If supplied with an argument that can't be a code point, charscript() tries
  to do the opposite and interpret the argument as a script name. The
@@ -616,7 +686,15 @@ my @SCRIPTS;
  my %SCRIPTS;
  
  sub _charscripts {
-    @SCRIPTS =_read_table("unicore/To/Sc.pl") unless @SCRIPTS;
+    unless (@SCRIPTS) {
+        UnicodeVersion() unless defined $v_unicode_version;
+        if ($v_unicode_version lt v3.1.0) {
+            push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ];
+        }
+        else {
+            @SCRIPTS =_read_table("To/Sc.pl");
+        }
+    }
      foreach my $entry (@SCRIPTS) {
          $entry->[2] =~ s/(_\w)/\L$1/g;  # Preserve old-style casing
          push @{$SCRIPTS{$entry->[2]}}, $entry;
@@ -635,7 +713,7 @@ sub charscript {
          return $result if defined $result;
          return $utf8::SwashInfo{'ToSc'}{'missing'};
      } elsif (exists $SCRIPTS{$arg}) {
-        return dclone $SCRIPTS{$arg};
+        return _dclone $SCRIPTS{$arg};
      }
  
      return;
@@ -662,7 +740,7 @@ See also L</Blocks versus Scripts>.
  
  sub charblocks {
      _charblocks() unless %BLOCKS;
-    return dclone \%BLOCKS;
+    return _dclone \%BLOCKS;
  }
  
  =head2 B<charscripts()>
@@ -684,7 +762,7 @@ See also L</Blocks versus Scripts>.
  
  sub charscripts {
      _charscripts() unless %SCRIPTS;
-    return dclone \%SCRIPTS;
+    return _dclone \%SCRIPTS;
  }
  
  =head2 B<charinrange()>
@@ -744,7 +822,7 @@ my %GENERAL_CATEGORIES =
   );
  
  sub general_categories {
-    return dclone \%GENERAL_CATEGORIES;
+    return _dclone \%GENERAL_CATEGORIES;
  }
  
  =head2 B<general_categories()>
@@ -812,7 +890,7 @@ the bidi type name.
  =cut
  
  sub bidi_types {
-    return dclone \%BIDI_TYPES;
+    return _dclone \%BIDI_TYPES;
  }
  
  =head2 B<compexcl()>
@@ -821,7 +899,9 @@ sub bidi_types {
  
      my $compexcl = compexcl(0x09dc);
  
-This routine is included for backwards compatibility, but as of Perl 5.12, for
+This routine returns C<undef> if the Unicode version being used is so early
+that it doesn't have this property.  It is included for backwards
+compatibility, but as of Perl 5.12 and more modern Unicode versions, for
  most purposes it is probably more convenient to use one of the following
  instead:
  
@@ -856,6 +936,9 @@ sub compexcl {
      croak __PACKAGE__, "::compexcl: unknown code '$arg'"
         unless defined $code;
  
+    UnicodeVersion() unless defined $v_unicode_version;
+    return if $v_unicode_version lt v3.0.0;
+
      no warnings "non_unicode";     # So works on non-Unicode code points
      return chr($code) =~ /\p{Composition_Exclusion}/;
  }
@@ -882,9 +965,11 @@ sub compexcl {
      }
  
  This returns the (almost) locale-independent case folding of the
-character specified by the L</code point argument>.
+character specified by the L</code point argument>.  (Starting in Perl v5.16,
+the core function C<fc()> returns the C<full> mapping (described below)
+faster than this does, and for entire strings.)
  
-If there is no case folding for that code point, C<undef> is returned.
+If there is no case folding for the input code point, C<undef> is returned.
  
  If there is a case folding for that code point, a reference to a hash
  with the following fields is returned:
@@ -927,18 +1012,18 @@ Note that this
  describes the contents of I<mapping>.  It is defined primarily for backwards
  compatibility.
  
-On versions 3.1 and earlier of Unicode, I<status> can also be
+For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be
  C<I> which is the same as C<C> but is a special case for dotted uppercase I and
  dotless lowercase i:
  
  =over
  
-=item B<*> If you use this C<I> mapping
+=item Z<>B<*> If you use this C<I> mapping
  
  the result is case-insensitive,
  but dotless and dotted I's are not distinguished
  
-=item B<*> If you exclude this C<I> mapping
+=item Z<>B<*> If you exclude this C<I> mapping
  
  the result is not fully case-insensitive, but
  dotless and dotted I's are distinguished
@@ -956,7 +1041,8 @@ Each code has at least four hexdigits.
  Note that this folding does not maintain canonical equivalence without
  additional processing.
  
-For versions of Unicode 3.1 and earlier, this field is empty unless there is a
+For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
+there is a
  special folding for Turkic languages, in which case I<status> is C<I>, and
  I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.  
  
@@ -986,54 +1072,88 @@ L<http://www.unicode.org/unicode/reports/tr21>
  my %CASEFOLD;
  
  sub _casefold {
-    unless (%CASEFOLD) {
-       if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
-           local $_;
-           local $/ = "\n";
-           while (<$CASEFOLDFH>) {
-               if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
-                   my $code = hex($1);
-                   $CASEFOLD{$code}{'code'} = $1;
-                   $CASEFOLD{$code}{'turkic'} = "" unless
-                                           defined $CASEFOLD{$code}{'turkic'};
-                   if ($2 eq 'C' || $2 eq 'I') {       # 'I' is only on 3.1 and
-                                                       # earlier Unicodes
-                                                       # Both entries there (I
-                                                       # only checked 3.1) are
-                                                       # the same as C, and
-                                                       # there are no other
-                                                       # entries for those
-                                                       # codepoints, so treat
-                                                       # as if C, but override
-                                                       # the turkic one for
-                                                       # 'I'.
-                       $CASEFOLD{$code}{'status'} = $2;
-                       $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
-                       $CASEFOLD{$code}{'mapping'} = $3;
-                       $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
-                   } elsif ($2 eq 'F') {
-                       $CASEFOLD{$code}{'full'} = $3;
-                       unless (defined $CASEFOLD{$code}{'simple'}) {
-                               $CASEFOLD{$code}{'simple'} = "";
-                               $CASEFOLD{$code}{'mapping'} = $3;
-                               $CASEFOLD{$code}{'status'} = $2;
-                       }
-                   } elsif ($2 eq 'S') {
+    unless (%CASEFOLD) {   # Populate the hash
+        my ($full_invlist_ref, $full_invmap_ref, undef, $default)
+                                                = prop_invmap('Case_Folding');
+
+        # Use the recipe given in the prop_invmap() pod to convert the
+        # inversion map into the hash.
+        for my $i (0 .. @$full_invlist_ref - 1 - 1) {
+            next if $full_invmap_ref->[$i] == $default;
+            my $adjust = -1;
+            for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) {
+                $adjust++;
+                if (! ref $full_invmap_ref->[$i]) {
+
+                    # This is a single character mapping
+                    $CASEFOLD{$j}{'status'} = 'C';
+                    $CASEFOLD{$j}{'simple'}
+                        = $CASEFOLD{$j}{'full'}
+                        = $CASEFOLD{$j}{'mapping'}
+                        = sprintf("%04X", $full_invmap_ref->[$i] + $adjust);
+                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                    $CASEFOLD{$j}{'turkic'} = "";
+                }
+                else {  # prop_invmap ensures that $adjust is 0 for a ref
+                    $CASEFOLD{$j}{'status'} = 'F';
+                    $CASEFOLD{$j}{'full'}
+                    = $CASEFOLD{$j}{'mapping'}
+                    = join " ", map { sprintf "%04X", $_ }
+                                                    @{$full_invmap_ref->[$i]};
+                    $CASEFOLD{$j}{'simple'} = "";
+                    $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                    $CASEFOLD{$j}{'turkic'} = "";
+                }
+            }
+        }
  
+        # We have filled in the full mappings above, assuming there were no
+        # simple ones for the ones with multi-character maps.  Now, we find
+        # and fix the cases where that assumption was false.
+        (my ($simple_invlist_ref, $simple_invmap_ref, undef), $default)
+                                        = prop_invmap('Simple_Case_Folding');
+        for my $i (0 .. @$simple_invlist_ref - 1 - 1) {
+            next if $simple_invmap_ref->[$i] == $default;
+            my $adjust = -1;
+            for my $j ($simple_invlist_ref->[$i]
+                       .. $simple_invlist_ref->[$i+1] -1)
+            {
+                $adjust++;
+                next if $CASEFOLD{$j}{'status'} eq 'C';
+                $CASEFOLD{$j}{'status'} = 'S';
+                $CASEFOLD{$j}{'simple'}
+                    = $CASEFOLD{$j}{'mapping'}
+                    = sprintf("%04X", $simple_invmap_ref->[$i] + $adjust);
+                $CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
+                $CASEFOLD{$j}{'turkic'} = "";
+            }
+        }
  
-                       # There can't be a simple without a full, and simple
-                       # overrides all but full
+        # We hard-code in the turkish rules
+        UnicodeVersion() unless defined $v_unicode_version;
+        if ($v_unicode_version ge v3.2.0) {
  
-                       $CASEFOLD{$code}{'simple'} = $3;
-                       $CASEFOLD{$code}{'mapping'} = $3;
-                       $CASEFOLD{$code}{'status'} = $2;
-                   } elsif ($2 eq 'T') {
-                       $CASEFOLD{$code}{'turkic'} = $3;
-                   } # else can't happen because only [CIFST] are possible
-               }
-           }
-           close($CASEFOLDFH);
-       }
+            # These two code points should already have regular entries, so
+            # just fill in the turkish fields
+            $CASEFOLD{ord('I')}{'turkic'} = '0131';
+            $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
+        }
+        elsif ($v_unicode_version ge v3.1.0) {
+
+            # These two code points don't have entries otherwise.
+            $CASEFOLD{0x130}{'code'} = '0130';
+            $CASEFOLD{0x131}{'code'} = '0131';
+            $CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I';
+            $CASEFOLD{0x130}{'turkic'}
+                = $CASEFOLD{0x130}{'mapping'}
+                = $CASEFOLD{0x130}{'full'}
+                = $CASEFOLD{0x130}{'simple'}
+                = $CASEFOLD{0x131}{'turkic'}
+                = $CASEFOLD{0x131}{'mapping'}
+                = $CASEFOLD{0x131}{'full'}
+                = $CASEFOLD{0x131}{'simple'}
+                = sprintf "%04X", ord('i');
+        }
      }
  }
  
@@ -1048,6 +1168,55 @@ sub casefold {
      return $CASEFOLD{$code};
  }
  
+=head2 B<all_casefolds()>
+
+
+    use Unicode::UCD 'all_casefolds';
+
+    my $all_folds_ref = all_casefolds();
+    foreach my $char_with_casefold (sort { $a <=> $b }
+                                    keys %$all_folds_ref)
+    {
+        printf "%04X:", $char_with_casefold;
+        my $casefold = $all_folds_ref->{$char_with_casefold};
+
+        # Get folds for $char_with_casefold
+
+        my @full_fold_hex = split / /, $casefold->{'full'};
+        my $full_fold_string =
+                    join "", map {chr(hex($_))} @full_fold_hex;
+        print " full=", join " ", @full_fold_hex;
+        my @turkic_fold_hex =
+                        split / /, ($casefold->{'turkic'} ne "")
+                                        ? $casefold->{'turkic'}
+                                        : $casefold->{'full'};
+        my $turkic_fold_string =
+                        join "", map {chr(hex($_))} @turkic_fold_hex;
+        print "; turkic=", join " ", @turkic_fold_hex;
+        if (defined $casefold && $casefold->{'simple'} ne "") {
+            my $simple_fold_hex = $casefold->{'simple'};
+            my $simple_fold_string = chr(hex($simple_fold_hex));
+            print "; simple=$simple_fold_hex";
+        }
+        print "\n";
+    }
+
+This returns all the case foldings in the current version of Unicode in the
+form of a reference to a hash.  Each key to the hash is the decimal
+representation of a Unicode character that has a casefold to other than
+itself.  The casefold of a semi-colon is itself, so it isn't in the hash;
+likewise for a lowercase "a", but there is an entry for a capital "A".  The
+hash value for each key is another hash, identical to what is returned by
+L</casefold()> if called with that code point as its argument.  So the value
+C<< all_casefolds()->{ord("A")} >> is equivalent to C<casefold(ord("A"))>.
+
+=cut
+
+sub all_casefolds () {
+    _casefold() unless %CASEFOLD;
+    return _dclone \%CASEFOLD;
+}
+
  =head2 B<casespec()>
  
      use Unicode::UCD 'casespec';
@@ -1150,15 +1319,25 @@ my %CASESPEC;
  
  sub _casespec {
      unless (%CASESPEC) {
-       if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+        UnicodeVersion() unless defined $v_unicode_version;
+        if ($v_unicode_version lt v2.1.8) {
+            %CASESPEC = {};
+        }
+       elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
             local $_;
             local $/ = "\n";
             while (<$CASESPECFH>) {
                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
+
                     my ($hexcode, $lower, $title, $upper, $condition) =
                         ($1, $2, $3, $4, $5);
                     my $code = hex($hexcode);
-                   if (exists $CASESPEC{$code}) {
+
+                    # In 2.1.8, there were duplicate entries; ignore all but
+                    # the first one -- there were no conditions in the file
+                    # anyway.
+                   if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8)
+                    {
                         if (exists $CASESPEC{$code}->{code}) {
                             my ($oldlower,
                                 $oldtitle,
@@ -1211,7 +1390,7 @@ sub casespec {
  
      _casespec() unless %CASESPEC;
  
-    return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
+    return ref $CASESPEC{$code} ? _dclone $CASESPEC{$code} : $CASESPEC{$code};
  }
  
  =head2 B<namedseq()>
@@ -1296,17 +1475,7 @@ sub namedseq {
  my %NUMERIC;
  
  sub _numeric {
-
-    # Unicode 6.0 instituted the rule that only digits in a consecutive
-    # block of 10 would be considered decimal digits.  Before that, the only
-    # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
-    # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
-    # ONE.  The code could be modified to handle that, but not bothering, as
-    # in TUS 6.0, U+19DA was changed to Nt=Di.
-    if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
-       croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
-    }
-    my @numbers = _read_table("unicore/To/Nv.pl");
+    my @numbers = _read_table("To/Nv.pl");
      foreach my $entry (@numbers) {
          my ($start, $end, $value) = @$entry;
  
@@ -1316,10 +1485,17 @@ sub _numeric {
              my $real = $rational[0] / $rational[1];
              $real_to_rational{$real} = $value;
              $value = $real;
-        }
  
-        for my $i ($start .. $end) {
-            $NUMERIC{$i} = $value;
+            # Should only be single element, but just in case...
+            for my $i ($start .. $end) {
+                $NUMERIC{$i} = $value;
+            }
+        }
+        else {
+            # The values require adjusting, as is in 'a' format
+            for my $i ($start .. $end) {
+                $NUMERIC{$i} = $value + $i - $start;
+            }
          }
      }
  
@@ -1409,14 +1585,43 @@ sub num {
      return if $string =~ /\D/;
      my $first_ord = ord(substr($string, 0, 1));
      my $value = $NUMERIC{$first_ord};
+
+    # To be a valid decimal number, it should be in a block of 10 consecutive
+    # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
+    # value is its offset in that block from the character that means zero.
      my $zero_ord = $first_ord - $value;
  
+    # Unicode 6.0 instituted the rule that only digits in a consecutive
+    # block of 10 would be considered decimal digits.  If this is an earlier
+    # release, we verify that this first character is a member of such a
+    # block.  That is, that the block of characters surrounding this one
+    # consists of all \d characters whose numeric values are the expected
+    # ones.
+    UnicodeVersion() unless defined $v_unicode_version;
+    if ($v_unicode_version lt v6.0.0) {
+        for my $i (0 .. 9) {
+            my $ord = $zero_ord + $i;
+            return unless chr($ord) =~ /\d/;
+            my $numeric = $NUMERIC{$ord};
+            return unless defined $numeric;
+            return unless $numeric == $i;
+        }
+    }
+
      for my $i (1 .. $length -1) {
+
+        # Here we know either by verifying, or by fact of the first character
+        # being a \d in Unicode 6.0 or later, that any character between the
+        # character that means 0, and 9 positions above it must be \d, and
+        # must have its value correspond to its offset from the zero.  Any
+        # characters outside these 10 do not form a legal number for this
+        # function.
          my $ord = ord(substr($string, $i, 1));
          my $digit = $ord - $zero_ord;
          return unless $digit >= 0 && $digit <= 9;
          $value = $value * 10 + $digit;
      }
+
      return $value;
  }
  
@@ -1486,7 +1691,7 @@ Those discouraged forms are accepted as input to C<prop_aliases>, but are not
  returned in the lists.  C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
  which are old synonyms for C<"Is_LC"> and should not be used in new code, are
  examples of this.  These both return C<(Is_LC, Cased_Letter)>.  Thus this
-function allows you to take a discourarged form, and find its acceptable
+function allows you to take a discouraged form, and find its acceptable
  alternatives.  The same goes with single-form Block property equivalences.
  Only the forms that begin with C<"In_"> are not discouraged; if you pass
  C<prop_aliases> a discouraged form, you will get back the equivalent ones that
@@ -1658,7 +1863,7 @@ sub prop_aliases ($) {
      # The full name is in element 1.
      return $list_ref->[1] unless wantarray;
  
-    return @{dclone $list_ref};
+    return @{_dclone $list_ref};
  }
  
  =pod
@@ -1797,7 +2002,7 @@ sub prop_value_aliases ($$) {
          # The full name is in element 1.
          return $list_ref->[1] unless wantarray;
  
-        return @{dclone $list_ref};
+        return @{_dclone $list_ref};
      }
  
      return $list_ref->[0] unless wantarray;
@@ -1824,7 +2029,8 @@ by the input parameter string:
   prints:
   0, 1114112
  
-An empty list is returned if the input is unknown; the number of elements in
+If the input is unknown, C<undef> is returned in scalar context, or an empty list
+in list context.  If the input is known, the number of elements in
  the list is returned if called in scalar context.
  
  L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
@@ -1885,7 +2091,7 @@ the same result:
  And both raise a warning that a Unicode property is being used on a
  non-Unicode code point.  It is arguable as to which is the correct thing to do
  here.  This function has chosen the way opposite to the Perl regular
-expression behavior.  This allows you to easily flip to to the Perl regular
+expression behavior.  This allows you to easily flip to the Perl regular
  expression way (for you to go in the other direction would be far harder).
  Simply add 0x110000 at the end of the non-empty returned list if it isn't
  already that value; and pop that value if it is; like:
@@ -1922,6 +2128,9 @@ code points that have the property-value:
  C<prop_invlist> does not know about any user-defined nor Perl internal-only
  properties, and will return C<undef> if called with one of those.
  
+The L</search_invlist()> function is provided for finding a code point within
+an inversion list.
+
  =cut
  
  # User-defined properties could be handled with some changes to utf8_heavy.pl;
@@ -1934,8 +2143,12 @@ properties, and will return C<undef> if called with one of those.
  our %loose_defaults;
  our $MAX_UNICODE_CODEPOINT;
  
-sub prop_invlist ($) {
+sub prop_invlist ($;$) {
      my $prop = $_[0];
+
+    # Undocumented way to get at Perl internal properties
+    my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
+
      return if ! defined $prop;
  
      require "utf8_heavy.pl";
@@ -1952,7 +2165,7 @@ sub prop_invlist ($) {
                || ref $swash eq ""
                || $swash->{'BITS'} != 1
                || $swash->{'USER_DEFINED'}
-              || $prop =~ /^\s*_/;
+              || (! $internal_ok && $prop =~ /^\s*_/);
  
      if ($swash->{'EXTRAS'}) {
          carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
@@ -2055,67 +2268,12 @@ sub prop_invlist ($) {
      return @invlist;
  }
  
-sub _search_invlist {
-    # Find the range in the inversion list which contains a code point; that
-    # is, find i such that l[i] <= code_point < l[i+1]
-
-    # If this is ever made public, could use to speed up .t specials.  Would
-    # need to use code point argument, as in other functions in this pm
-
-    my $list_ref = shift;
-    my $code_point = shift;
-    # Verify non-neg numeric  XXX
-
-    my $max_element = @$list_ref - 1;
-    return if ! $max_element < 0;     # Undef if list is empty.
-
-    # Short cut something at the far-end of the table.  This also allows us to
-    # refer to element [$i+1] without fear of being out-of-bounds in the loop
-    # below.
-    return $max_element if $code_point >= $list_ref->[$max_element];
-
-    use integer;        # want integer division
-
-    my $i = $max_element / 2;
-
-    my $lower = 0;
-    my $upper = $max_element;
-    while (1) {
-
-        if ($code_point >= $list_ref->[$i]) {
-
-            # Here we have met the lower constraint.  We can quit if we
-            # also meet the upper one.
-            last if $code_point < $list_ref->[$i+1];
-
-            $lower = $i;        # Still too low.
-
-        }
-        else {
-
-            # Here, $code_point < $list_ref[$i], so look lower down.
-            $upper = $i;
-        }
-
-        # Split search domain in half to try again.
-        my $temp = ($upper + $lower) / 2;
-
-        # No point in continuing unless $i changes for next time
-        # in the loop.
-        return $i if $temp == $i;
-        $i = $temp;
-    } # End of while loop
-
-    # Here we have found the offset
-    return $i;
-}
-
  =pod
  
  =head2 B<prop_invmap()>
  
   use Unicode::UCD 'prop_invmap';
- my ($list_ref, $map_ref, $format, $missing)
+ my ($list_ref, $map_ref, $format, $default)
                                        = prop_invmap("General Category");
  
  C<prop_invmap> is used to get the complete mapping definition for a property,
@@ -2140,10 +2298,13 @@ properties acceptable as inputs to this function.
  
  It is a fatal error to call this function except in list context.
  
-In addition to the the two arrays that form the inversion map, C<prop_invmap>
+In addition to the two arrays that form the inversion map, C<prop_invmap>
  returns two other values; one is a scalar that gives some details as to the
-format of the entries of the map array; the other is used for specialized
-purposes, described at the end of this section.
+format of the entries of the map array; the other is a default value, useful
+in maps whose format name begins with the letter C<"a">, as described
+L<below in its subsection|/a>; and for specialized purposes, such as
+converting to another data structure, described at the end of this main
+section.
  
  This means that C<prop_invmap> returns a 4 element list.  For example,
  
@@ -2179,7 +2340,7 @@ The first line (with Index [0]) means that the value for code point 0 is "Basic
  Latin".  The entry "0x0080" in the @blocks_ranges column in the second line
  means that the value from the first line, "Basic Latin", extends to all code
  points in the range from 0 up to but not including 0x0080, that is, through
-255.  In other words, the code points from 0 to 255 are all in the "Basic
+127.  In other words, the code points from 0 to 127 are all in the "Basic
  Latin" block.  Similarly, all code points in the range from 0x0080 up to (but
  not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
  (Notice that the return is the old-style block names; see L</Old-style versus
@@ -2203,7 +2364,8 @@ that, instead of treating these as unassigned Unicode code points, the value
  for this range should be C<undef>.  If you wish, you can change the returned
  arrays accordingly.
  
-The maps are almost always simple scalars that should be interpreted as-is.
+The maps for almost all properties are simple scalars that should be
+interpreted as-is.
  These values are those given in the Unicode-supplied data files, which may be
  inconsistent as to capitalization and as to which synonym for a property-value
  is given.  The results may be normalized by using the L</prop_value_aliases()>
@@ -2213,7 +2375,7 @@ There are exceptions to the simple scalar maps.  Some properties have some
  elements in their map list that are themselves lists of scalars; and some
  special strings are returned that are not to be interpreted as-is.  Element
  [2] (placed into C<$format> in the example above) of the returned four element
-list tells you if the map has any of these special elements, as follows:
+list tells you if the map has any of these special elements or not, as follows:
  
  =over
  
@@ -2225,7 +2387,7 @@ above.
  
  =item B<C<sl>>
  
-means that some of the map array elements have the form given by C<s>, and
+means that some of the map array elements have the form given by C<"s">, and
  the rest are lists of scalars.  For example, here is a portion of the output
  of calling C<prop_invmap>() with the "Script Extensions" property:
  
@@ -2236,10 +2398,10 @@ of calling C<prop_invmap>() with the "Script Extensions" property:
        0x0966      Devanagari
        0x0970      Common
  
-Here, the code points 0x964 and 0x965 are both used in the Bengali,
-Devanagari, Gurmukhi, and Oriya  scripts, and no other scripts.
+Here, the code points 0x964 and 0x965 are both used in Bengali,
+Devanagari, Gurmukhi, and Oriya, but no other scripts.
  
-The Name_Alias property is of this form.  But each scalar consists of two
+The Name_Alias property is also of this form.  But each scalar consists of two
  components:  1) the name, and 2) the type of alias this is.  They are
  separated by a colon and a space.  In Unicode 6.1, there are several alias types:
  
@@ -2292,76 +2454,84 @@ For example,
  A map to the empty string means that there is no alias defined for the code
  point.
  
-=item B<C<c>>
+=item B<C<a>>
  
-is like C<s> in that all the map array elements are scalars, but here they are
-restricted to all being integers, and each has to be tweaked to get the correct
-result by adding the code point number to it.  For example, in:
+is like C<"s"> in that all the map array elements are scalars, but here they are
+restricted to all being integers, and some have to be adjusted (hence the name
+C<"a">) to get the correct result.  For example, in:
  
- my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
                            = prop_invmap("Simple_Uppercase_Mapping");
  
  the returned arrays look like this:
  
   @$uppers_ranges_ref    @$uppers_maps_ref   Note
         0                      0
-      97                    -32          'a' maps to 'A', b => B ...
+      97                     65          'a' maps to 'A', b => B ...
       123                      0
-     181                    743          MICRO SIGN => Greek Cap MU
+     181                    924          MICRO SIGN => Greek Cap MU
       182                      0
       ...
  
-The first line means that the uppercase of code point 0 is 0+0; the uppercase
-of code point 1 is 1+0; ...  of code point 96 is 96+0.  In other words, the
-uppercase of each of the first 0..96 code points is itself.  The second line
-means that code point 97 maps to 97-32 (=65) or the uppercase of 'a' is 'A';
-98 => 98-32 (=66) or the uppercase of 'b' is 'B'; ... 122 => 122-32 (=90) or
-the uppercase of 'z' is 'Z'.
+and C<$default> is 0.
+
+Let's start with the second line.  It says that the uppercase of code point 97
+is 65; or C<uc("a") eq "A">.  But the line is for the entire range of code
+points 97 through 122.  To get the mapping for any code point in this range,
+you take the offset it has from the beginning code point of the range, and add
+that to the mapping for that first code point.  So, the mapping for 122 ("z")
+is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
+yielding 90 ("Z").  Likewise for everything in between.
+
+Requiring this simple adjustment allows the returned arrays to be
+significantly smaller than otherwise, up to a factor of 10, speeding up
+searching through them.
  
-By requiring adding the code point to the returned result, the arrays are made
-significantly smaller.
+Ranges that map to C<$default>, C<"0">, behave somewhat differently.  For
+these, each code point maps to itself.  So, in the first line in the example,
+S<C<ord(uc(chr(0)))>> is 0, S<C<ord(uc(chr(1)))>> is 1, ...
+S<C<ord(uc(chr(96)))>> is 96.
  
-=item B<C<cl>>
+=item B<C<al>>
  
-means that some of the map array elements have the form given by C<c>, and
+means that some of the map array elements have the form given by C<"a">, and
  the rest are ordered lists of code points.
  For example, in:
  
- my ($uppers_ranges_ref, $uppers_maps_ref, $format)
+ my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
                                   = prop_invmap("Uppercase_Mapping");
  
  the returned arrays look like this:
  
   @$uppers_ranges_ref    @$uppers_maps_ref
         0                      0
-      97                    -32
+      97                     65
       123                      0
-     181                    743
+     181                    924
       182                      0
       ...
      0x0149              [ 0x02BC 0x004E ]
      0x014A                    0
-    0x014B                   -1
+    0x014B                  330
       ...
  
  This is the full Uppercase_Mapping property (as opposed to the
-Simple_Uppercase_Mapping given in the example for format C<"c">).  The only
+Simple_Uppercase_Mapping given in the example for format C<"a">).  The only
  difference between the two in the ranges shown is that the code point at
  0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
  characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
  CAPITAL LETTER N).
  
-Yes, there is an inconsistency here.  When the map is a single element the
-correct value must be derived by adding the code point number to it; when the
-map is a list of code points, they are the final correct values.  The reason
-for forcing the addition is to make the returned map array significantly more
-compact.  There is no such advantage to doing the same thing to the elements
-that are lists, and the addition is extra work.
+No adjustments are needed to entries that are references to arrays; each such
+entry will have exactly one element in its range, so the offset is always 0.
  
-=item B<C<ce>>
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is 0.
  
-This is like C<c>, but some elements are the empty string, so not all are
-integers.
+=item B<C<ae>>
+
+This is like C<"a">, but some elements are the empty string, and should not be
+adjusted.
  The one internal Perl property accessible by C<prop_invmap> is of this type:
  "Perl_Decimal_Digit" returns an inversion map which gives the numeric values
  that are represented by the Unicode decimal digit characters.  Characters that
@@ -2369,79 +2539,88 @@ don't represent decimal digits map to the empty string, like so:
  
   @digits    @values
   0x0000       ""
- 0x0030       -48
+ 0x0030        0
   0x003A:      ""
- 0x0660:    -1632
+ 0x0660:       0
   0x066A:      ""
- 0x06F0:    -1776
+ 0x06F0:       0
   0x06FA:      ""
- 0x07C0:    -1984
+ 0x07C0:       0
   0x07CA:      ""
- 0x0966:    -2406
+ 0x0966:       0
   ...
  
  This means that the code points from 0 to 0x2F do not represent decimal digits;
-the code point 0x30 (DIGIT ZERO, =48 decimal) represents 48-48 = 0;  code
-point 0x31, (DIGIT ONE), represents 49-48 = 1; ... code point 0x39, (DIGIT
-NINE), represents 57-48 = 9; ... code points 0x3A through 0x65F do not
-represent decimal digits; 0x660 (ARABIC-INDIC DIGIT ZERO, =1632 decimal),
-represents 1632-1632 = 0; ... 0x07C1 (NKO DIGIT ONE, = 1985), represents
-1985-1984 = 1 ...
+the code point 0x30 (DIGIT ZERO) represents 0;  code point 0x31, (DIGIT ONE),
+represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
+... code points 0x3A through 0x65F do not represent decimal digits; 0x660
+(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
+represents 0+1-0 = 1 ...
+
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is the empty string.
  
-=item B<C<cle>>
+=item B<C<ale>>
  
-is a combination of the C<cl> type and the C<e> type.  Some of
-the map array elements have the forms given by C<cl>, and
+is a combination of the C<"al"> type and the C<"ae"> type.  Some of
+the map array elements have the forms given by C<"al">, and
  the rest are the empty string.  The property C<NFKC_Casefold> has this form.
  An example slice is:
  
   @$ranges_ref  @$maps_ref         Note
      ...
-   0x00AA     -73                 FEMININE ORDINAL INDICATOR => 'a'
-   0x00AB       0
+   0x00AA       97                FEMININE ORDINAL INDICATOR => 'a'
+   0x00AB        0
     0x00AD                         SOFT HYPHEN => ""
-   0x00AE       0
+   0x00AE        0
     0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
-   0x00B0       0
+   0x00B0        0
     ...
  
-=item B<C<r>>
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is 0.
+
+=item B<C<ar>>
  
  means that all the elements of the map array are either rational numbers or
  the string C<"NaN">, meaning "Not a Number".  A rational number is either an
  integer, or two integers separated by a solidus (C<"/">).  The second integer
  represents the denominator of the division implied by the solidus, and is
-guaranteed not to be 0.  If you want to convert them to scalar numbers, you
+actually always positive, so it is guaranteed not to be 0 and to not be
+signed.  When the element is a plain integer (without the
+solidus), it may need to be adjusted to get the correct value by adding the
+offset, just as other C<"a"> properties.  No adjustment is needed for
+fractions, as the range is guaranteed to have just a single element, and so
+the offset is always 0.
+
+If you want to convert the returned map to entirely scalar numbers, you
  can use something like this:
  
   my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
- if ($format && $format eq "r") {
-     map { $_ = eval $_ } @$invmap_ref;
+ if ($format && $format eq "ar") {
+     map { $_ = eval $_ if $_ ne 'NaN' } @$invmap_ref;
   }
  
  Here's some entries from the output of the property "Nv", which has format
-C<"r">.
+C<"ar">.
  
- @numerics_ranges  @numerics_maps        Note
+ @numerics_ranges  @numerics_maps       Note
          0x00           "NaN"
-        0x30             0              DIGIT 0
-        0x31             1
-        0x32             2
-        ...
-        0x37             7
-        0x38             8
-        0x39             9              DIGIT 9
+        0x30             0           DIGIT 0 .. DIGIT 9
          0x3A           "NaN"
-        0xB2             2              SUPERSCRIPT 2
-        0xB3             3              SUPERSCRIPT 2
+        0xB2             2           SUPERSCRIPTs 2 and 3
          0xB4           "NaN"
-        0xB9             1              SUPERSCRIPT 1
+        0xB9             1           SUPERSCRIPT 1
          0xBA           "NaN"
-        0xBC            1/4             VULGAR FRACTION 1/4
-        0xBD            1/2             VULGAR FRACTION 1/2
-        0xBE            3/4             VULGAR FRACTION 3/4
+        0xBC            1/4          VULGAR FRACTION 1/4
+        0xBD            1/2          VULGAR FRACTION 1/2
+        0xBE            3/4          VULGAR FRACTION 3/4
          0xBF           "NaN"
-        0x660            0              ARABIC-INDIC DIGIT ZERO
+        0x660            0           ARABIC-INDIC DIGIT ZERO .. NINE
+        0x66A          "NaN"
+
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is C<"NaN">.
  
  =item B<C<n>>
  
@@ -2455,8 +2634,8 @@ Entries such as:
  
  mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
  with the code point (expressed in hexadecimal) appended to it, like "CJK
-UNIFIED IDEOGRAPH-3403" (similarly for C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
-pointE<gt>>).
+UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
+pointE<gt>>>).
  
  Also, entries like
  
@@ -2468,46 +2647,66 @@ the function L<charnames/charnames::viacode(code)>.
  Note that for control characters (C<Gc=cc>), Unicode's data files have the
  string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
  string.  This function returns that real name, the empty string.  (There are
-names for these characters, but they are aliases, not the real name, and are
-contained in the C<Name_Alias> property.)
+names for these characters, but they are considered aliases, not the Name
+property name, and are contained in the C<Name_Alias> property.)
  
-=item B<C<d>>
+=item B<C<ad>>
  
-means the Decomposition_Mapping property.  This property is like C<cl>
+means the Decomposition_Mapping property.  This property is like C<"al">
  properties, except that one of the scalar elements is of the form:
  
   <hangul syllable>
  
  This signifies that this entry should be replaced by the decompositions for
  all the code points whose decomposition is algorithmically calculated.  (All
-of them are currently in one range and likely to remain so; the C<n> format
+of them are currently in one range and no others outside the range are likely
+to ever be added to Unicode; the C<"n"> format
  has this same entry.)  These can be generated via the function
  L<Unicode::Normalize::NFD()|Unicode::Normalize>.
  
  Note that the mapping is the one that is specified in the Unicode data files,
  and to get the final decomposition, it may need to be applied recursively.
  
+The fourth (index [3]) element (C<$default>) in the list returned for this
+format is 0.
+
  =back
  
-A binary search can be used to quickly find a code point in the inversion
-list, and hence its corresponding mapping.
+Note that a format begins with the letter "a" if and only if the property it is
+for requires adjustments by adding the offsets in multi-element ranges.  For
+all these properties, an entry should be adjusted only if the map is a scalar
+which is an integer.  That is, it must match the regular expression:
+
+    / ^ -? \d+ $ /xa
  
-The final element (index [3], assigned to C<$default> in the "block" example) in
-the four element list returned by this function may be useful for applications
+Further, the first element in a range never needs adjustment, as the
+adjustment would be just adding 0.
+
+A binary search such as that provided by L</search_invlist()>, can be used to
+quickly find a code point in the inversion list, and hence its corresponding
+mapping.
+
+The final, fourth element (index [3], assigned to C<$default> in the "block"
+example) in the four element list returned by this function is used with the
+C<"a"> format types; it may also be useful for applications
  that wish to convert the returned inversion map data structure into some
  other, such as a hash.  It gives the mapping that most code points map to
  under the property.  If you establish the convention that any code point not
  explicitly listed in your data structure maps to this value, you can
  potentially make your data structure much smaller.  As you construct your data
  structure from the one returned by this function, simply ignore those ranges
-that map to this value, generally called the "default" value.  For example, to
+that map to this value.  For example, to
  convert to the data structure searchable by L</charinrange()>, you can follow
-this recipe:
+this recipe for properties that don't require adjustments:
  
- my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
+ my ($list_ref, $map_ref, $format, $default) = prop_invmap($property);
   my @range_list;
+
+ # Look at each element in the list, but the -2 is needed because we
+ # look at $i+1 in the loop, and the final element is guaranteed to map
+ # to $default by prop_invmap(), so we would skip it anyway.
   for my $i (0 .. @$list_ref - 2) {
-    next if $map_ref->[$i] eq $missing;
+    next if $map_ref->[$i] eq $default;
      push @range_list, [ $list_ref->[$i],
                          $list_ref->[$i+1],
                          $map_ref->[$i]
@@ -2516,11 +2715,39 @@ this recipe:
  
   print charinrange(\@range_list, $code_point), "\n";
  
-
  With this, C<charinrange()> will return C<undef> if its input code point maps
-to C<$missing>.  You can avoid this by omitting the C<next> statement, and adding
+to C<$default>.  You can avoid this by omitting the C<next> statement, and adding
  a line after the loop to handle the final element of the inversion map.
  
+Similarly, this recipe can be used for properties that do require adjustments:
+
+ for my $i (0 .. @$list_ref - 2) {
+    next if $map_ref->[$i] eq $default;
+
+    # prop_invmap() guarantees that if the mapping is to an array, the
+    # range has just one element, so no need to worry about adjustments.
+    if (ref $map_ref->[$i]) {
+        push @range_list,
+                   [ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ];
+    }
+    else {  # Otherwise each element is actually mapped to a separate
+            # value, so the range has to be split into single code point
+            # ranges.
+
+        my $adjustment = 0;
+
+        # For each code point that gets mapped to something...
+        for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) {
+
+            # ... add a range consisting of just it mapping to the
+            # original plus the adjustment, which is incremented for the
+            # next time through the loop, as the offset increases by 1
+            # for each element in the range
+            push @range_list,
+                             [ $j, $j, $map_ref->[$i] + $adjustment++ ];
+        }
+    }
+ }
  
  Note that the inversion maps returned for the C<Case_Folding> and
  C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
@@ -2561,7 +2788,7 @@ sub prop_invmap ($) {
  
      # The swash has two components we look at, the base list, and a hash,
      # named 'SPECIALS', containing any additional members whose mappings don't
-    # fit into the the base list scheme of things.  These generally 'override'
+    # fit into the base list scheme of things.  These generally 'override'
      # any value in the base list for the same code point.
      my $overrides;
  
@@ -2570,20 +2797,21 @@ sub prop_invmap ($) {
  
  RETRY:
  
+    # If there are multiple entries for a single code point
+    my $has_multiples = 0;
+
      # Try to get the map swash for the property.  They have 'To' prepended to
      # the property name, and 32 means we will accept 32 bit return values.
+    # The 0 means we aren't calling this from tr///.
      my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
  
-    # If there are multiple entries for a single code point;
-    my $has_multiples = 0;
-
      # If didn't find it, could be because needs a proxy.  And if was the
      # 'Block' or 'Name' property, use a proxy even if did find it.  Finding it
-    # would be the result of the installation changing mktables to output the
-    # Block or Name tables.  The Block table gives block names in the
-    # new-style, and this routine is supposed to return old-style block names.
-    # The Name table is valid, but we need to execute the special code below
-    # to add in the algorithmic-defined name entries.
+    # in these cases would be the result of the installation changing mktables
+    # to output the Block or Name tables.  The Block table gives block names
+    # in the new-style, and this routine is supposed to return old-style block
+    # names.  The Name table is valid, but we need to execute the special code
+    # below to add in the algorithmic-defined name entries.
      # And NFKCCF needs conversion, so handle that here too.
      if (ref $swash eq ""
          || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
@@ -2600,7 +2828,7 @@ RETRY:
              $prop = "age";
              goto RETRY;
          }
-        elsif ($second_try =~ / ^ s ( cf | [ltu] c ) $ /x) {
+        elsif ($second_try =~ / ^ s ( cf | fc | [ltu] c ) $ /x) {
  
              # These properties use just the LIST part of the full mapping,
              # which includes the simple maps that are otherwise overridden by
@@ -2609,7 +2837,11 @@ RETRY:
              $overrides = -1;
  
              # The full name is the simple name stripped of its initial 's'
-            $prop = $second_try =~ s/^s//r;
+            $prop = $1;
+
+            # .. except for this case
+            $prop = 'cf' if $prop eq 'fc';
+
              goto RETRY;
          }
          elsif ($second_try eq "blk") {
@@ -2663,17 +2895,12 @@ RETRY:
                  map { s/:.*// } @{$aliases{$code_point}};
              }
  
-            # We hold off on adding the next entry to the list until we know,
-            # that the next line isn't for the same code point.  We only
-            # output the final line.  That one is the original Name property
-            # value.  The others are the Name_Alias corrections, which are
-            # listed first in the file.
              my $i = 0;
              foreach my $line (split "\n", $original) {
                  my ($hex_code_point, $name) = split "\t", $line;
  
                  # Weeds out all comments, blank lines, and named sequences
-                next if $hex_code_point =~ /\P{ASCII_HEX_DIGIT}/;
+                next if $hex_code_point =~ /[^[:xdigit:]]/a;
  
                  my $code_point = hex $hex_code_point;
  
@@ -2726,22 +2953,16 @@ RETRY:
                  $decomps{'TYPE'} = "ToDt";
                  $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
                  $utf8::SwashInfo{'ToDt'}{'format'} = "s";
-            }
-            else {
-                $decomps{'TYPE'} = "ToDm";
-                $utf8::SwashInfo{'ToDm'}{'missing'} = "0";
-                $utf8::SwashInfo{'ToDm'}{'format'} = 'i';
-
-                # Use a special internal-to-this_routine format, 'dm', to
-                # distinguish from 'd', meaning decimal.
-                $utf8::SwashInfo{'ToDm'}{'format'} = "dm";
-            }
+            }   # 'dm' is handled below, with 'nfkccf'
  
              $decomps{'LIST'} = "";
  
              # This property has one special range not in the file: for the
-            # hangul syllables
-            my $done_hangul = 0;    # Have we done the hangul range.
+            # hangul syllables.  But not in Unicode version 1.
+            UnicodeVersion() unless defined $v_unicode_version;
+            my $done_hangul = ($v_unicode_version lt v2.0.0)
+                              ? 1
+                              : 0;    # Have we done the hangul range ?
              foreach my $line (split "\n", $original) {
                  my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
                  my $code_point = hex $hex_lower;
@@ -2758,35 +2979,6 @@ RETRY:
                               ? "Canonical" :
                               $type_and_map;
                  }
-                if ($second_try eq 'dm') {
-                    my @map = map { hex } split " ", $value;
-
-                    if (@map == 1) {
-
-                        # Single character maps are converted to deltas, as
-                        # this file is stored, for backwards compatibility,
-                        # not using them.
-                        $value = $map[0] - $code_point;
-
-                        # If this is a multi-char range, process the rest of
-                        # it by doing a 'redo' after this line is done.  Fix
-                        # up the line to contain the rest of the range for
-                        # that redo.
-                        if ($hex_upper ne "" && hex $hex_upper != $code_point) {
-                            $line = sprintf("%04X\t%s\t%s",
-                                            $code_point + 1,
-                                            $hex_upper,
-                                            $type_and_map);
-                            $redo = 1;
-
-                            # Pretend that this is a single element range.
-                            $hex_upper = $hex_lower;
-                        }
-                    }
-                    else {
-                        $value = join " ", @map;
-                    }
-                }
  
                  # Insert the hangul range at the appropriate spot.
                  if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
@@ -2800,6 +2992,12 @@ RETRY:
                                          : "<hangul syllable>";
                  }
  
+                if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) {
+                    $line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value);
+                    $hex_upper = "";
+                    $redo = 1;
+                }
+
                  # And append this to our constructed LIST.
                  $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
  
@@ -2807,37 +3005,149 @@ RETRY:
              }
              $swash = \%decomps;
          }
-        elsif ($second_try eq 'nfkccf') {
+        elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
+            return;
+        }
  
-            # This property is stored in the old format for backwards
-            # compatibility for any applications that read its file directly.
-            # So here we convert it to delta format for compatibility with the
-            # other properties similar to it.
-            my %nfkccf;
+        if ($second_try eq 'nfkccf' || $second_try eq 'dm') {
  
-            # Create a new LIST with deltas instead of code points.
+            # The 'nfkccf' property is stored in the old format for backwards
+            # compatibility for any applications that have read its file
+            # directly before prop_invmap() existed.
+            # And the code above has extracted the 'dm' property from its file
+            # yielding the same format.  So here we convert them to adjusted
+            # format for compatibility with the other properties similar to
+            # them.
+            my %revised_swash;
+
+            # We construct a new converted list.
              my $list = "";
-            foreach my $range (split "\n", $swash->{'LIST'}) {
-                my ($hex_begin, $hex_end, $map) = split "\t", $range;
+
+            my @ranges = split "\n", $swash->{'LIST'};
+            for (my $i = 0; $i < @ranges; $i++) {
+                my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
+
+                # The dm property has maps that are space separated sequences
+                # of code points, as well as the special entry "<hangul
+                # syllable>", which also contains a blank.
+                my @map = split " ", $map;
+                if (@map > 1) {
+
+                    # If it's just the special entry, append as-is.
+                    if ($map eq '<hangul syllable>') {
+                        $list .= "$ranges[$i]\n";
+                    }
+                    else {
+
+                        # These should all be single-element ranges.
+                        croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin;
+
+                        # Convert them to decimal, as that's what's expected.
+                        $list .= "$hex_begin\t\t"
+                            . join(" ", map { hex } @map)
+                            . "\n";
+                    }
+                    next;
+                }
+
+                # Here, the mapping doesn't have a blank, and is for a single
+                # code point.
                  my $begin = hex $hex_begin;
                  my $end = (defined $hex_end && $hex_end ne "")
                          ? hex $hex_end
                          : $begin;
+
+                # Again, the output is to be in decimal.
                  my $decimal_map = hex $map;
-                foreach my $code_point ($begin .. $end) {
-                    $list .= sprintf("%04X\t\t%d\n", $code_point, $decimal_map - $code_point);
+
+                # We know that multi-element ranges with the same mapping
+                # should not be adjusted, as after the adjustment
+                # multi-element ranges are for consecutive increasing code
+                # points.  Further, the final element in the list won't be
+                # adjusted, as there is nothing after it to include in the
+                # adjustment
+                if ($begin != $end || $i == @ranges -1) {
+
+                    # So just convert these to single-element ranges
+                    foreach my $code_point ($begin .. $end) {
+                        $list .= sprintf("%04X\t\t%d\n",
+                                        $code_point, $decimal_map);
+                    }
                  }
-            }
+                else {
  
-            $nfkccf{'LIST'} = $list;
-            $nfkccf{'TYPE'} = "ToNFKCCF";
-            $nfkccf{'SPECIALS'} = $swash->{'SPECIALS'};
-            $swash = \%nfkccf;
-            $utf8::SwashInfo{'ToNFKCCF'}{'missing'} = 0;
-            $utf8::SwashInfo{'ToNFKCCF'}{'format'} = 'i';
-        }
-        else {  # Don't know this property. Fail.
-            return;
+                    # Here, we have a candidate for adjusting.  What we do is
+                    # look through the subsequent adjacent elements in the
+                    # input.  If the map to the next one differs by 1 from the
+                    # one before, then we combine into a larger range with the
+                    # initial map.  Loop doing this until we find one that
+                    # can't be combined.
+
+                    my $offset = 0;     # How far away are we from the initial
+                                        # map
+                    my $squished = 0;   # ? Did we squish at least two
+                                        # elements together into one range
+                    for ( ; $i < @ranges; $i++) {
+                        my ($next_hex_begin, $next_hex_end, $next_map)
+                                                = split "\t", $ranges[$i+1];
+
+                        # In the case of 'dm', the map may be a sequence of
+                        # multiple code points, which are never combined with
+                        # another range
+                        last if $next_map =~ / /;
+
+                        $offset++;
+                        my $next_decimal_map = hex $next_map;
+
+                        # If the next map is not next in sequence, it
+                        # shouldn't be combined.
+                        last if $next_decimal_map != $decimal_map + $offset;
+
+                        my $next_begin = hex $next_hex_begin;
+
+                        # Likewise, if the next element isn't adjacent to the
+                        # previous one, it shouldn't be combined.
+                        last if $next_begin != $begin + $offset;
+
+                        my $next_end = (defined $next_hex_end
+                                        && $next_hex_end ne "")
+                                            ? hex $next_hex_end
+                                            : $next_begin;
+
+                        # And finally, if the next element is a multi-element
+                        # range, it shouldn't be combined.
+                        last if $next_end != $next_begin;
+
+                        # Here, we will combine.  Loop to see if we should
+                        # combine the next element too.
+                        $squished = 1;
+                    }
+
+                    if ($squished) {
+
+                        # Here, 'i' is the element number of the last element to
+                        # be combined, and the range is single-element, or we
+                        # wouldn't be combining.  Get its code point.
+                        my ($hex_end, undef, undef) = split "\t", $ranges[$i];
+                        $list .= "$hex_begin\t$hex_end\t$decimal_map\n";
+                    } else {
+
+                        # Here, no combining done.  Just append the initial
+                        # (and current) values.
+                        $list .= "$hex_begin\t\t$decimal_map\n";
+                    }
+                }
+            } # End of loop constructing the converted list
+
+            # Finish up the data structure for our converted swash
+            my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
+            $revised_swash{'LIST'} = $list;
+            $revised_swash{'TYPE'} = $type;
+            $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
+            $swash = \%revised_swash;
+
+            $utf8::SwashInfo{$type}{'missing'} = 0;
+            $utf8::SwashInfo{$type}{'format'} = 'a';
          }
      }
  
@@ -2857,6 +3167,8 @@ RETRY:
      $format = $utf8::SwashInfo{$returned_prop}{'format'};
      $format = 'b' unless defined $format;
  
+    my $requires_adjustment = $format =~ /^a/;
+
      # The LIST input lines look like:
      # ...
      # 0374\t\tCommon
@@ -2922,7 +3234,7 @@ RETRY:
              # This is all we need do for this iteration.
  
              if ($end != $begin) {
-                croak __PACKAGE__, "Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
+                croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
              }
              if (! ref $invmap[-2]) {
                  $invmap[-2] = [ $invmap[-2], $map ];
@@ -2937,10 +3249,12 @@ RETRY:
  
              # If the input isn't in the most compact form, so that there are
              # two adjacent ranges that map to the same thing, they should be
-            # combined.  This happens in our constructed dt mapping, as
-            # Element [-2] is the map for the latest range so far processed.
-            # Just set the beginning point of the map to $missing (in
-            # invlist[-1]) to 1 beyond where this range ends.  For example, in
+            # combined (EXCEPT where the arrays require adjustments, in which
+            # case everything is already set up correctly).  This happens in
+            # our constructed dt mapping, as Element [-2] is the map for the
+            # latest range so far processed.  Just set the beginning point of
+            # the map to $missing (in invlist[-1]) to 1 beyond where this
+            # range ends.  For example, in
              # 12\t13\tXYZ
              # 14\t17\tXYZ
              # we have set it up so that it looks like
@@ -2950,7 +3264,7 @@ RETRY:
              # We now see that it should be
              # 12 => XYZ
              # 18 => $missing
-            if (@invlist > 1 && ( (defined $map)
+            if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
                                    ? $invmap[-2] eq $map
                                    : $invmap[-2] eq 'Y'))
              {
@@ -2968,7 +3282,7 @@ RETRY:
  
          # Add the range beginning, and the range's map.
          push @invlist, $begin;
-        if ($format eq 'dm') {
+        if ($returned_prop eq 'ToDm') {
  
              # The decomposition maps are either a line like <hangul syllable>
              # which are to be taken as is; or a sequence of code points in hex
@@ -3018,7 +3332,7 @@ RETRY:
          push @invmap, $missing;
      }
  
-    # And add in standard element that all non-Unicode code points map to
+    # And add in standard element that all non-Unicode code points map to:
      # $missing
      push @invlist, $MAX_UNICODE_CODEPOINT + 1;
      push @invmap, $missing;
@@ -3031,16 +3345,16 @@ RETRY:
      if ($overrides) {
  
          # A negative $overrides implies that the SPECIALS should be ignored,
-        # and a simple 'c' list is the value.
+        # and a simple 'a' list is the value.
          if ($overrides < 0) {
-            $format = 'c';
+            $format = 'a';
          }
          else {
  
              # Currently, all overrides are for properties that normally map to
              # single code points, but now some will map to lists of code
              # points (but there is an exception case handled below).
-            $format = 'cl';
+            $format = 'al';
  
              # Look through the overrides.
              foreach my $cp_maybe_utf8 (keys %$overrides) {
@@ -3055,22 +3369,22 @@ RETRY:
  
                      # The empty string will show up unpacked as an empty
                      # array.
-                    $format = 'cle' if @map == 0;
+                    $format = 'ale' if @map == 0;
                  }
                  else {
  
                      # But if we generated the overrides, we didn't bother to
                      # pack them, and we, so far, do this only for properties
-                    # that are 'c' ones.
+                    # that are 'a' ones.
                      $cp = $cp_maybe_utf8;
                      @map = hex $overrides->{$cp};
-                    $format = 'c';
+                    $format = 'a';
                  }
  
                  # Find the range that the override applies to.
-                my $i = _search_invlist(\@invlist, $cp);
+                my $i = search_invlist(\@invlist, $cp);
                  if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
-                    croak __PACKAGE__, "wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
+                    croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
                  }
  
                  # And what that range currently maps to
@@ -3139,11 +3453,12 @@ RETRY:
      }
      elsif ($format eq 'x') {
  
-        # All hex-valued properties are really to code points
-        $format = 'i';
+        # All hex-valued properties are really to code points, and have been
+        # converted to decimal.
+        $format = 's';
      }
-    elsif ($format eq 'dm') {
-        $format = 'd';
+    elsif ($returned_prop eq 'ToDm') {
+        $format = 'ad';
      }
      elsif ($format eq 'sw') { # blank-separated elements to form a list.
          map { $_ = [ split " ", $_  ] if $_ =~ / / } @invmap;
@@ -3156,20 +3471,120 @@ RETRY:
          $format = 'sl';
      }
      elsif ($returned_prop eq 'ToPerlDecimalDigit') {
-        $format = 'ce';
+        $format = 'ae';
      }
-    elsif ($format ne 'n' && $format ne 'r') {
+    elsif ($returned_prop eq 'ToNv') {
+
+        # The one property that has this format is stored as a delta, so the
+        # format needs to indicate that the code point must be added to it.
+        $format = 'ar';
+    }
+    elsif ($format ne 'n' && $format ne 'a') {
  
          # All others are simple scalars
          $format = 's';
      }
      if ($has_multiples &&  $format !~ /l/) {
-       croak __PACKAGE__, "Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
+       croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
      }
  
      return (\@invlist, \@invmap, $format, $missing);
  }
  
+sub search_invlist {
+
+=pod
+
+=head2 B<search_invlist()>
+
+ use Unicode::UCD qw(prop_invmap prop_invlist);
+ use Unicode::UCD 'search_invlist';
+
+ my @invlist = prop_invlist($property_name);
+ print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2)
+                     ? " isn't"
+                     : " is",
+     " in $property_name\n";
+
+ my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block");
+ my $index = search_invlist($blocks_ranges_ref, $code_point);
+ print "$code_point is in block ", $blocks_map_ref->[$index], "\n";
+
+C<search_invlist> looks up a L</code point argument> in an inversion list,
+such as one returned by C<prop_invlist> or C<prop_invmap>.
+
+C<undef> is returned when the code point cannot be located in the list.  This
+happens when the argument is not a legal L</code point argument> (in which
+case a warning is also raised), when the list is empty, or when the code point
+is less than the list's first element.
+
+Otherwise, the return value is the index into the list of the range that
+contains the code point; that is, the C<i> such that
+
+    list[i] <= code_point < list[i+1]
+
+A code point at or beyond the list's final element yields the index of that
+final element.
+
+As explained in L</prop_invlist()>, whether a code point is in the list or not
+depends on if the index is even (in) or odd (not in).  And as explained in
+L</prop_invmap()>, the index is used with the returned parallel array to find
+the mapping.
+
+=cut
+
+
+    my ($list_ref, $input_code_point) = @_;
+
+    # An undefined result from _getcode() means the argument wasn't
+    # understood; warn the caller and give up.
+    my $code_point = _getcode($input_code_point);
+    unless (defined $code_point) {
+        carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'";
+        return;
+    }
+
+    my $final_index = scalar(@{$list_ref}) - 1;
+
+    # Nothing can be found in an empty list, nor before the first element.
+    return if $final_index < 0 || $code_point < $list_ref->[0];
+
+    # Anything at or past the final element belongs to that element's range.
+    # Disposing of this case now also lets the loop below look at element
+    # [$probe + 1] without fear of running off the end of the list.
+    return $final_index if $code_point >= $list_ref->[$final_index];
+
+    use integer;        # the halving below relies on integer division
+
+    my ($floor, $ceiling) = (0, $final_index);
+    my $probe = $final_index / 2;
+
+    while (1) {
+        if ($code_point < $list_ref->[$probe]) {
+
+            # The probe is too high; continue the search below it.
+            $ceiling = $probe;
+        }
+        elsif ($code_point < $list_ref->[$probe + 1]) {
+
+            # Both the lower and the upper constraints are met, so this is
+            # the range that contains the code point.
+            return $probe;
+        }
+        else {
+
+            # The probe is still too low; continue the search above it.
+            $floor = $probe;
+        }
+
+        # Bisect what remains of the search domain.  Should that fail to
+        # move the probe, iterating further would be pointless, so settle
+        # for the current position.
+        my $next_probe = ($ceiling + $floor) / 2;
+        return $probe if $next_probe == $probe;
+        $probe = $next_probe;
+    }
+}
+
  =head2 Unicode::UCD::UnicodeVersion
  
  This returns the version of the Unicode Character Database, in other words, the
@@ -3189,6 +3604,7 @@ sub UnicodeVersion {
         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
      }
+    $v_unicode_version = pack "C*", split /\./, $UNICODEVERSION;
      return $UNICODEVERSION;
  }
  
@@ -3197,7 +3613,8 @@ sub UnicodeVersion {
  The difference between a block and a script is that scripts are closer
  to the linguistic notion of a set of code points required to present
  languages, while block is more of an artifact of the Unicode code point
-numbering and separation into blocks of (mostly) 256 code points.
+numbering and separation into blocks of consecutive code points (so far the
+size of a block is some multiple of 16, like 128 or 256).
  
  For example the Latin B<script> is spread over several B<blocks>, such
  as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and