lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5 no warnings 'surrogate';    # surrogates can be inputs to this
   6 use charnames ();
   7
   8 our $VERSION = '0.61';
   9
  10 require Exporter;
  11
  12 our @ISA = qw(Exporter);
  13
  14 our @EXPORT_OK = qw(charinfo
  15                     charblock charscript
  16                     charblocks charscripts
  17                     charinrange
  18                     general_categories bidi_types
  19                     compexcl
  20                     casefold all_casefolds casespec
  21                     namedseq
  22                     num
  23                     prop_aliases
  24                     prop_value_aliases
  25                     prop_values
  26                     prop_invlist
  27                     prop_invmap
  28                     search_invlist
  29                     MAX_CP
  30                 );
  31
  32 use Carp;
  33
  34 sub IS_ASCII_PLATFORM { ord("A") == 65 }
  35
  36 =head1 NAME
  37
  38 Unicode::UCD - Unicode character database
  39
  40 =head1 SYNOPSIS
  41
  42     use Unicode::UCD 'charinfo';
  43     my $charinfo   = charinfo($codepoint);
  44
  45     use Unicode::UCD 'casefold';
  46     my $casefold = casefold($codepoint);
  47
  48     use Unicode::UCD 'all_casefolds';
  49     my $all_casefolds_ref = all_casefolds();
  50
  51     use Unicode::UCD 'casespec';
  52     my $casespec = casespec($codepoint);
  53
  54     use Unicode::UCD 'charblock';
  55     my $charblock  = charblock($codepoint);
  56
  57     use Unicode::UCD 'charscript';
  58     my $charscript = charscript($codepoint);
  59
  60     use Unicode::UCD 'charblocks';
  61     my $charblocks = charblocks();
  62
  63     use Unicode::UCD 'charscripts';
  64     my $charscripts = charscripts();
  65
  66     use Unicode::UCD qw(charscript charinrange);
  67     my $range = charscript($script);
  68     print "looks like $script\n" if charinrange($range, $codepoint);
  69
  70     use Unicode::UCD qw(general_categories bidi_types);
  71     my $categories = general_categories();
  72     my $types = bidi_types();
  73
  74     use Unicode::UCD 'prop_aliases';
  75     my @space_names = prop_aliases("space");
  76
  77     use Unicode::UCD 'prop_value_aliases';
  78     my @gc_punct_names = prop_value_aliases("Gc", "Punct");
  79
  80     use Unicode::UCD 'prop_values';
  81     my @all_EA_short_names = prop_values("East_Asian_Width");
  82
  83     use Unicode::UCD 'prop_invlist';
  84     my @puncts = prop_invlist("gc=punctuation");
  85
  86     use Unicode::UCD 'prop_invmap';
  87     my ($list_ref, $map_ref, $format, $missing)
  88                                       = prop_invmap("General Category");
  89
  90     use Unicode::UCD 'search_invlist';
  91     my $index = search_invlist(\@invlist, $code_point);
  92
  93     use Unicode::UCD 'compexcl';
  94     my $compexcl = compexcl($codepoint);
  95
  96     use Unicode::UCD 'namedseq';
  97     my $namedseq = namedseq($named_sequence_name);
  98
  99     my $unicode_version = Unicode::UCD::UnicodeVersion();
 100
 101     my $convert_to_numeric =
 102               Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
 103
 104 =head1 DESCRIPTION
 105
 106 The Unicode::UCD module offers a series of functions that
 107 provide a simple interface to the Unicode
 108 Character Database.
 109
 110 =head2 code point argument
 111
 112 Some of the functions are called with a I<code point argument>, which is either
 113 a decimal or a hexadecimal scalar designating a code point in the platform's
 114 native character set (extended to Unicode), or a string containing C<U+>
 115 followed by hexadecimals
 116 designating a Unicode code point.  A leading 0 will force a hexadecimal
 117 interpretation, as will a hexadecimal digit that isn't a decimal digit.
 118
 119 Examples:
 120
 121     223     # Decimal 223 in native character set
 122     0223    # Hexadecimal 223, native (= 547 decimal)
 123     0xDF    # Hexadecimal DF, native (= 223 decimal)
 124     'U+DF'  # Hexadecimal DF, in Unicode's character set
 125                               (= LATIN SMALL LETTER SHARP S)
 126
 127 Note that the largest code point in Unicode is U+10FFFF.
 128
 129 =cut
 130
 131 my $BLOCKSFH;
 132 my $VERSIONFH;
 133 my $CASEFOLDFH;
 134 my $CASESPECFH;
 135 my $NAMEDSEQFH;
 136 my $v_unicode_version;  # v-string.
 137
 138 sub openunicode {
 139     my ($rfh, @path) = @_;
 140     my $f;
 141     unless (defined $$rfh) {
 142         for my $d (@INC) {
 143             use File::Spec;
 144             $f = File::Spec->catfile($d, "unicore", @path);
 145             last if open($$rfh, $f);
 146             undef $f;
 147         }
 148         croak __PACKAGE__, ": failed to find ",
 149               File::Spec->catfile(@path), " in @INC"
 150             unless defined $f;
 151     }
 152     return $f;
 153 }
 154
 155 sub _dclone ($) {   # Use Storable::dclone if available; otherwise emulate it.
 156
 157     use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);
 158
 159     return dclone(shift) if defined &dclone;
 160
 161     my $arg = shift;
 162     my $type = ref $arg;
 163     return $arg unless $type;   # No deep cloning needed for scalars
 164
 165     if ($type eq 'ARRAY') {
 166         my @return;
 167         foreach my $element (@$arg) {
 168             push @return, &_dclone($element);
 169         }
 170         return \@return;
 171     }
 172     elsif ($type eq 'HASH') {
 173         my %return;
 174         foreach my $key (keys %$arg) {
 175             $return{$key} = &_dclone($arg->{$key});
 176         }
 177         return \%return;
 178     }
 179     else {
 180         croak "_dclone can't handle " . $type;
 181     }
 182 }
 183
 184 =head2 B<charinfo()>
 185
 186     use Unicode::UCD 'charinfo';
 187
 188     my $charinfo = charinfo(0x41);
 189
 190 This returns information about the input L</code point argument>
 191 as a reference to a hash of fields as defined by the Unicode
 192 standard.  If the L</code point argument> is not assigned in the standard
 193 (i.e., has the general category C<Cn> meaning C<Unassigned>)
 194 or is a non-character (meaning it is guaranteed to never be assigned in
 195 the standard),
 196 C<undef> is returned.
 197
 198 Fields that aren't applicable to the particular code point argument exist in the
 199 returned hash, and are empty.
 200
 201 The keys in the hash with the meanings of their values are:
 202
 203 =over
 204
 205 =item B<code>
 206
 207 the input native L</code point argument> expressed in hexadecimal, with
 208 leading zeros
 209 added if necessary to make it contain at least four hexdigits
 210
 211 =item B<name>
 212
 213 name of I<code>, all IN UPPER CASE.
 214 Some control-type code points do not have names.
 215 This field will be empty for C<Surrogate> and C<Private Use> code points,
 216 and for the others without a name,
 217 it will contain a description enclosed in angle brackets, like
 218 C<E<lt>controlE<gt>>.
 219
 220
 221 =item B<category>
 222
 223 The short name of the general category of I<code>.
 224 This will match one of the keys in the hash returned by L</general_categories()>.
 225
 226 The L</prop_value_aliases()> function can be used to get all the synonyms
 227 of the category name.
 228
 229 =item B<combining>
 230
 231 the combining class number for I<code> used in the Canonical Ordering Algorithm.
 232 For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
 233 available at
 234 L<http://www.unicode.org/versions/Unicode5.1.0/>
 235
 236 The L</prop_value_aliases()> function can be used to get all the synonyms
 237 of the combining class number.
 238
 239 =item B<bidi>
 240
 241 bidirectional type of I<code>.
 242 This will match one of the keys in the hash returned by L</bidi_types()>.
 243
 244 The L</prop_value_aliases()> function can be used to get all the synonyms
 245 of the bidi type name.
 246
 247 =item B<decomposition>
 248
 249 is empty if I<code> has no decomposition; or is one or more codes
 250 (separated by spaces) that, taken in order, represent a decomposition for
 251 I<code>.  Each has at least four hexdigits.
 252 The codes may be preceded by a word enclosed in angle brackets, then a space,
 253 like C<E<lt>compatE<gt> >, giving the type of decomposition.
 254
 255 This decomposition may be an intermediate one whose components are also
 256 decomposable.  Use L<Unicode::Normalize> to get the final decomposition in one
 257 step.
 258
 259 =item B<decimal>
 260
 261 if I<code> represents a decimal digit this is its integer numeric value
 262
 263 =item B<digit>
 264
 265 if I<code> represents some other digit-like number, this is its integer
 266 numeric value
 267
 268 =item B<numeric>
 269
 270 if I<code> represents a whole or rational number, this is its numeric value.
 271 Rational values are expressed as a string like C<1/4>.
 272
 273 =item B<mirrored>
 274
 275 C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
 276
 277 =item B<unicode10>
 278
 279 name of I<code> in the Unicode 1.0 standard if one
 280 existed for this code point and is different from the current name
 281
 282 =item B<comment>
 283
 284 As of Unicode 6.0, this is always empty.
 285
 286 =item B<upper>
 287
 288 is, if non-empty, the uppercase mapping for I<code> expressed as at least four
 289 hexdigits.  This indicates that the full uppercase mapping is a single
 290 character, and is identical to the simple (single-character only) mapping.
 291 When this field is empty, it means that the simple uppercase mapping is
 292 I<code> itself; you'll need some other means (like
 293 L</casespec()>) to get the full mapping.
 294
 295 =item B<lower>
 296
 297 is, if non-empty, the lowercase mapping for I<code> expressed as at least four
 298 hexdigits.  This indicates that the full lowercase mapping is a single
 299 character, and is identical to the simple (single-character only) mapping.
 300 When this field is empty, it means that the simple lowercase mapping is
 301 I<code> itself; you'll need some other means (like
 302 L</casespec()>) to get the full mapping.
 303
 304 =item B<title>
 305
 306 is, if non-empty, the titlecase mapping for I<code> expressed as at least four
 307 hexdigits.  This indicates that the full titlecase mapping is a single
 308 character, and is identical to the simple (single-character only) mapping.
 309 When this field is empty, it means that the simple titlecase mapping is
 310 I<code> itself; you'll need some other means (like
 311 L</casespec()>) to get the full mapping.
 312
 313 =item B<block>
 314
 315 the block I<code> belongs to (used in C<\p{Blk=...}>).
 316 The L</prop_value_aliases()> function can be used to get all the synonyms
 317 of the block name.
 318
 319 See L</Blocks versus Scripts>.
 320
 321 =item B<script>
 322
 323 the script I<code> belongs to.
 324 The L</prop_value_aliases()> function can be used to get all the synonyms
 325 of the script name.
 326
 327 See L</Blocks versus Scripts>.
 328
 329 =back
 330
 331 Note that you cannot do (de)composition and casing based solely on the
 332 I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields; you
 333 will need also the L</casespec()> function and the C<Composition_Exclusion>
 334 property.  (Or you could just use the L<lc()|perlfunc/lc>,
 335 L<uc()|perlfunc/uc>, and L<ucfirst()|perlfunc/ucfirst> functions, and the
 336 L<Unicode::Normalize> module.)
 337
 338 =cut
 339
 340 # NB: This function is nearly duplicated in charnames.pm
 341 sub _getcode {
 342     my $arg = shift;
 343
 344     if ($arg =~ /^[1-9]\d*$/) {
 345         return $arg;
 346     }
 347     elsif ($arg =~ /^(?:0[xX])?([[:xdigit:]]+)$/) {
 348         return CORE::hex($1);
 349     }
 350     elsif ($arg =~ /^[Uu]\+([[:xdigit:]]+)$/) { # Is of form U+0000, means
 351                                                 # wants the Unicode code
 352                                                 # point, not the native one
 353         my $decimal = CORE::hex($1);
 354         return $decimal if IS_ASCII_PLATFORM;
 355         return utf8::unicode_to_native($decimal);
 356     }
 357
 358     return;
 359 }
 360
 361 # Populated by _num.  Converts real number back to input rational
 362 my %real_to_rational;
 363
 364 # To store the contents of files found on disk.
 365 my @BIDIS;
 366 my @CATEGORIES;
 367 my @DECOMPOSITIONS;
 368 my @NUMERIC_TYPES;
 369 my %SIMPLE_LOWER;
 370 my %SIMPLE_TITLE;
 371 my %SIMPLE_UPPER;
 372 my %UNICODE_1_NAMES;
 373 my %ISO_COMMENT;
 374
 375 sub charinfo {
 376
 377     # This function has traditionally mimicked what is in UnicodeData.txt,
 378     # warts and all.  This is a re-write that avoids UnicodeData.txt so that
 379     # it can be removed to save disk space.  Instead, this assembles
 380     # information gotten by other methods that get data from various other
 381     # files.  It uses charnames to get the character name; and various
 382     # mktables tables.
 383
 384     use feature 'unicode_strings';
 385
 386     # Will fail if called under minitest
 387     use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD);
 388
 389     my $arg  = shift;
 390     my $code = _getcode($arg);
 391     croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
 392
 393     # Non-unicode implies undef.
 394     return if $code > 0x10FFFF;
 395
 396     my %prop;
 397     my $char = chr($code);
 398
 399     @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
 400     $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
 401                         // $utf8::SwashInfo{'ToGc'}{'missing'};
 402     # Return undef if category value is 'Unassigned' or one of its synonyms
 403     return if grep { lc $_ eq 'unassigned' }
 404                                     prop_value_aliases('Gc', $prop{'category'});
 405
 406     $prop{'code'} = sprintf "%04X", $code;
 407     $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
 408                                            : (charnames::viacode($code) // "");
 409
 410     $prop{'combining'} = getCombinClass($code);
 411
 412     @BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
 413     $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
 414                     // $utf8::SwashInfo{'ToBc'}{'missing'};
 415
 416     # For most code points, we can just read in "unicore/Decomposition.pl", as
 417     # its contents are exactly what should be output.  But that file doesn't
 418     # contain the data for the Hangul syllable decompositions, which can be
 419     # algorithmically computed, and NFD() does that, so we call NFD() for
 420     # those.  We can't use NFD() for everything, as it does a complete
 421     # recursive decomposition, and what this function has always done is to
 422     # return what's in UnicodeData.txt which doesn't show that recursiveness.
 423     # Fortunately, the NFD() of the Hanguls doesn't have any recursion
 424     # issues.
 425     # Having no decomposition implies an empty field; otherwise, all but
 426     # "Canonical" imply a compatible decomposition, and the type is prefixed
 427     # to that, as it is in UnicodeData.txt
 428     UnicodeVersion() unless defined $v_unicode_version;
 429     if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) {
 430         # The code points of the decomposition are output in standard Unicode
 431         # hex format, separated by blanks.
 432         $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
 433                                            unpack "U*", NFD($char);
 434     }
 435     else {
 436         @DECOMPOSITIONS = _read_table("Decomposition.pl")
 437                           unless @DECOMPOSITIONS;
 438         $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
 439                                                                 $code) // "";
 440     }
 441
 442     # Can use num() to get the numeric values, if any.
 443     if (! defined (my $value = num($char))) {
 444         $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
 445     }
 446     else {
 447         if ($char =~ /\d/) {
 448             $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
 449         }
 450         else {
 451
 452             # For non-decimal-digits, we have to read in the Numeric type
 453             # to distinguish them.  It is not just a matter of integer vs.
 454             # rational, as some whole number values are not considered digits,
 455             # e.g., TAMIL NUMBER TEN.
 456             $prop{'decimal'} = "";
 457
 458             @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
 459             if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
 460                 eq 'Digit')
 461             {
 462                 $prop{'digit'} = $prop{'numeric'} = $value;
 463             }
 464             else {
 465                 $prop{'digit'} = "";
 466                 $prop{'numeric'} = $real_to_rational{$value} // $value;
 467             }
 468         }
 469     }
 470
 471     $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
 472
 473     %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
 474     $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
 475
 476     UnicodeVersion() unless defined $v_unicode_version;
 477     if ($v_unicode_version ge v6.0.0) {
 478         $prop{'comment'} = "";
 479     }
 480     else {
 481         %ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT;
 482         $prop{'comment'} = (defined $ISO_COMMENT{$code})
 483                            ? $ISO_COMMENT{$code}
 484                            : "";
 485     }
 486
 487     %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
 488     $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
 489                      ? sprintf("%04X", $SIMPLE_UPPER{$code})
 490                      : "";
 491
 492     %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
 493     $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
 494                      ? sprintf("%04X", $SIMPLE_LOWER{$code})
 495                      : "";
 496
 497     %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
 498     $prop{'title'} = (defined $SIMPLE_TITLE{$code})
 499                      ? sprintf("%04X", $SIMPLE_TITLE{$code})
 500                      : "";
 501
 502     $prop{block}  = charblock($code);
 503     $prop{script} = charscript($code);
 504     return \%prop;
 505 }
 506
 507 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 508     my ($table, $lo, $hi, $code) = @_;
 509
 510     return if $lo > $hi;
 511
 512     my $mid = int(($lo+$hi) / 2);
 513
 514     if ($table->[$mid]->[0] < $code) {
 515         if ($table->[$mid]->[1] >= $code) {
 516             return $table->[$mid]->[2];
 517         } else {
 518             _search($table, $mid + 1, $hi, $code);
 519         }
 520     } elsif ($table->[$mid]->[0] > $code) {
 521         _search($table, $lo, $mid - 1, $code);
 522     } else {
 523         return $table->[$mid]->[2];
 524     }
 525 }
 526
 527 sub _read_table ($;$) {
 528
 529     # Returns the contents of the mktables generated table file located at $1
 530     # in the form of either an array of arrays or a hash, depending on if the
 531     # optional second parameter is true (for hash return) or not.  In the case
 532     # of a hash return, each key is a code point, and its corresponding value
 533     # is what the table gives as the code point's corresponding value.  In the
 534     # case of an array return, each outer array denotes a range with [0] the
 535     # start point of that range; [1] the end point; and [2] the value that
 536     # every code point in the range has.  The hash return is useful for fast
 537     # lookup when the table contains only single code point ranges.  The array
 538     # return takes much less memory when there are large ranges.
 539     #
 540     # This function has the side effect of setting
 541     # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
 542     #                                       table; and
 543     # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
 544     #                                        not listed in the table.
 545     # where $property is the Unicode property name, preceded by 'To' for map
 546     # properties., e.g., 'ToSc'.
 547     #
 548     # Table entries look like one of:
 549     # 0000      0040    Common  # [65]
 550     # 00AA              Latin
 551
 552     my $table = shift;
 553     my $return_hash = shift;
 554     $return_hash = 0 unless defined $return_hash;
 555     my @return;
 556     my %return;
 557     local $_;
 558     my $list = do "unicore/$table";
 559
 560     # Look up if this property requires adjustments, which we do below if it
 561     # does.
 562     require "unicore/Heavy.pl";
 563     my $property = $table =~ s/\.pl//r;
 564     $property = $utf8::file_to_swash_name{$property};
 565     my $to_adjust = defined $property
 566                     && $utf8::SwashInfo{$property}{'format'} =~ / ^ a /x;
 567
 568     for (split /^/m, $list) {
 569         my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
 570                                         \s* ( \# .* )?  # Optional comment
 571                                         $ /x;
 572         my $decimal_start = hex $start;
 573         my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
 574         $value = hex $value if $to_adjust
 575                                && $utf8::SwashInfo{$property}{'format'} eq 'ax';
 576         if ($return_hash) {
 577             foreach my $i ($decimal_start .. $decimal_end) {
 578                 $return{$i} = ($to_adjust)
 579                               ? $value + $i - $decimal_start
 580                               : $value;
 581             }
 582         }
 583         elsif (! $to_adjust
 584                && @return
 585                && $return[-1][1] == $decimal_start - 1
 586                && $return[-1][2] eq $value)
 587         {
 588             # If this is merely extending the previous range, do just that.
 589             $return[-1]->[1] = $decimal_end;
 590         }
 591         else {
 592             push @return, [ $decimal_start, $decimal_end, $value ];
 593         }
 594     }
 595     return ($return_hash) ? %return : @return;
 596 }
 597
 598 sub charinrange {
 599     my ($range, $arg) = @_;
 600     my $code = _getcode($arg);
 601     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 602         unless defined $code;
 603     _search($range, 0, $#$range, $code);
 604 }
 605
 606 =head2 B<charblock()>
 607
 608     use Unicode::UCD 'charblock';
 609
 610     my $charblock = charblock(0x41);
 611     my $charblock = charblock(1234);
 612     my $charblock = charblock(0x263a);
 613     my $charblock = charblock("U+263a");
 614
 615     my $range     = charblock('Armenian');
 616
 617 With a L</code point argument> C<charblock()> returns the I<block> the code point
 618 belongs to, e.g.  C<Basic Latin>.  The old-style block name is returned (see
 619 L</Old-style versus new-style block names>).
 620 The L</prop_value_aliases()> function can be used to get all the synonyms
 621 of the block name.
 622
 623 If the code point is unassigned, this returns the block it would belong to if
 624 it were assigned.  (If the Unicode version being used is so early as to not
 625 have blocks, all code points are considered to be in C<No_Block>.)
 626
 627 See also L</Blocks versus Scripts>.
 628
 629 If supplied with an argument that can't be a code point, C<charblock()> tries to
 630 do the opposite and interpret the argument as an old-style block name.  On an
 631 ASCII platform, the return value is a I<range set> with one range: an
 632 anonymous array with a single element that consists of another anonymous array
 633 whose first element is the first code point in the block, and whose second
 634 element is the final code point in the block.  On an EBCDIC
 635 platform, the first two Unicode blocks are not contiguous.  Their range sets
 636 are lists containing I<start-of-range>, I<end-of-range> code point pairs.  You
 637 can test whether a code point is in a range set using the L</charinrange()>
 638 function.  (To be precise, each I<range set> contains a third array element,
 639 after the range boundary ones: the old-style block name.)
 640
 641 If the argument to C<charblock()> is not a known block, C<undef> is
 642 returned.
 643
 644 =cut
 645
 646 my @BLOCKS;
 647 my %BLOCKS;
 648
 649 sub _charblocks {
 650
 651     # Can't read from the mktables table because it loses the hyphens in the
 652     # original.
 653     unless (@BLOCKS) {
 654         UnicodeVersion() unless defined $v_unicode_version;
 655         if ($v_unicode_version lt v2.0.0) {
 656             my $subrange = [ 0, 0x10FFFF, 'No_Block' ];
 657             push @BLOCKS, $subrange;
 658             push @{$BLOCKS{'No_Block'}}, $subrange;
 659         }
 660         elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 661             local $_;
 662             local $/ = "\n";
 663             while (<$BLOCKSFH>) {
 664                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 665                     my ($lo, $hi) = (hex($1), hex($2));
 666                     my $subrange = [ $lo, $hi, $3 ];
 667                     push @BLOCKS, $subrange;
 668                     push @{$BLOCKS{$3}}, $subrange;
 669                 }
 670             }
 671             close($BLOCKSFH);
 672             if (! IS_ASCII_PLATFORM) {
 673                 # The first two blocks, through 0xFF, are wrong on EBCDIC
 674                 # platforms.
 675
 676                 my @new_blocks = _read_table("To/Blk.pl");
 677
 678                 # Get rid of the first two ranges in the Unicode version, and
 679                 # replace them with the ones computed by mktables.
 680                 shift @BLOCKS;
 681                 shift @BLOCKS;
 682                 delete $BLOCKS{'Basic Latin'};
 683                 delete $BLOCKS{'Latin-1 Supplement'};
 684
 685                 # But there are multiple entries in the computed versions, and
 686                 # we change their names to (which we know) to be the old-style
 687                 # ones.
 688                 for my $i (0.. @new_blocks - 1) {
 689                     if ($new_blocks[$i][2] =~ s/Basic_Latin/Basic Latin/
 690                         or $new_blocks[$i][2] =~
 691                                     s/Latin_1_Supplement/Latin-1 Supplement/)
 692                     {
 693                         push @{$BLOCKS{$new_blocks[$i][2]}}, $new_blocks[$i];
 694                     }
 695                     else {
 696                         splice @new_blocks, $i;
 697                         last;
 698                     }
 699                 }
 700                 unshift @BLOCKS, @new_blocks;
 701             }
 702         }
 703     }
 704 }
 705
 706 sub charblock {
 707     my $arg = shift;
 708
 709     _charblocks() unless @BLOCKS;
 710
 711     my $code = _getcode($arg);
 712
 713     if (defined $code) {
 714         my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
 715         return $result if defined $result;
 716         return 'No_Block';
 717     }
 718     elsif (exists $BLOCKS{$arg}) {
 719         return _dclone $BLOCKS{$arg};
 720     }
 721 }
 722
 723 =head2 B<charscript()>
 724
 725     use Unicode::UCD 'charscript';
 726
 727     my $charscript = charscript(0x41);
 728     my $charscript = charscript(1234);
 729     my $charscript = charscript("U+263a");
 730
 731     my $range      = charscript('Thai');
 732
 733 With a L</code point argument>, C<charscript()> returns the I<script> the
 734 code point belongs to, e.g., C<Latin>, C<Greek>, C<Han>.
 735 If the code point is unassigned or the Unicode version being used is so early
 736 that it doesn't have scripts, this function returns C<"Unknown">.
 737 The L</prop_value_aliases()> function can be used to get all the synonyms
 738 of the script name.
 739
 740 If supplied with an argument that can't be a code point, charscript() tries
 741 to do the opposite and interpret the argument as a script name. The
 742 return value is a I<range set>: an anonymous array of arrays that contain
 743 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 744 code point is in a range set using the L</charinrange()> function.
 745 (To be precise, each I<range set> contains a third array element,
 746 after the range boundary ones: the script name.)
 747
 748 If the C<charscript()> argument is not a known script, C<undef> is returned.
 749
 750 See also L</Blocks versus Scripts>.
 751
 752 =cut
 753
 754 my @SCRIPTS;
 755 my %SCRIPTS;
 756
 757 sub _charscripts {
 758     unless (@SCRIPTS) {
 759         UnicodeVersion() unless defined $v_unicode_version;
 760         if ($v_unicode_version lt v3.1.0) {
 761             push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ];
 762         }
 763         else {
 764             @SCRIPTS =_read_table("To/Sc.pl");
 765         }
 766     }
 767     foreach my $entry (@SCRIPTS) {
 768         $entry->[2] =~ s/(_\w)/\L$1/g;  # Preserve old-style casing
 769         push @{$SCRIPTS{$entry->[2]}}, $entry;
 770     }
 771 }
 772
 773 sub charscript {
 774     my $arg = shift;
 775
 776     _charscripts() unless @SCRIPTS;
 777
 778     my $code = _getcode($arg);
 779
 780     if (defined $code) {
 781         my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 782         return $result if defined $result;
 783         return $utf8::SwashInfo{'ToSc'}{'missing'};
 784     } elsif (exists $SCRIPTS{$arg}) {
 785         return _dclone $SCRIPTS{$arg};
 786     }
 787
 788     return;
 789 }
 790
 791 =head2 B<charblocks()>
 792
 793     use Unicode::UCD 'charblocks';
 794
 795     my $charblocks = charblocks();
 796
 797 C<charblocks()> returns a reference to a hash with the known block names
 798 as the keys, and the code point ranges (see L</charblock()>) as the values.
 799
 800 The names are in the old-style (see L</Old-style versus new-style block
 801 names>).
 802
 803 L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
 804 different type of data structure.
 805
 806 L<prop_values("Block")|/prop_values()> can be used to get all
 807 the known new-style block names as a list, without the code point ranges.
 808
 809 See also L</Blocks versus Scripts>.
 810
 811 =cut
 812
 813 sub charblocks {
 814     _charblocks() unless %BLOCKS;
 815     return _dclone \%BLOCKS;
 816 }
 817
 818 =head2 B<charscripts()>
 819
 820     use Unicode::UCD 'charscripts';
 821
 822     my $charscripts = charscripts();
 823
 824 C<charscripts()> returns a reference to a hash with the known script
 825 names as the keys, and the code point ranges (see L</charscript()>) as
 826 the values.
 827
 828 L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
 829 different type of data structure.
 830
 831 L<C<prop_values("Script")>|/prop_values()> can be used to get all
 832 the known script names as a list, without the code point ranges.
 833
 834 See also L</Blocks versus Scripts>.
 835
 836 =cut
 837
 838 sub charscripts {
 839     _charscripts() unless %SCRIPTS;
 840     return _dclone \%SCRIPTS;
 841 }
 842
 843 =head2 B<charinrange()>
 844
 845 In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
 846 can also test whether a code point is in the I<range> as returned by
 847 L</charblock()> and L</charscript()> or as the values of the hash returned
 848 by L</charblocks()> and L</charscripts()> by using C<charinrange()>:
 849
 850     use Unicode::UCD qw(charscript charinrange);
 851
 852     $range = charscript('Hiragana');
 853     print "looks like hiragana\n" if charinrange($range, $codepoint);
 854
 855 =cut
 856
 857 my %GENERAL_CATEGORIES =
 858  (
 859     'L'  =>         'Letter',
 860     'LC' =>         'CasedLetter',
 861     'Lu' =>         'UppercaseLetter',
 862     'Ll' =>         'LowercaseLetter',
 863     'Lt' =>         'TitlecaseLetter',
 864     'Lm' =>         'ModifierLetter',
 865     'Lo' =>         'OtherLetter',
 866     'M'  =>         'Mark',
 867     'Mn' =>         'NonspacingMark',
 868     'Mc' =>         'SpacingMark',
 869     'Me' =>         'EnclosingMark',
 870     'N'  =>         'Number',
 871     'Nd' =>         'DecimalNumber',
 872     'Nl' =>         'LetterNumber',
 873     'No' =>         'OtherNumber',
 874     'P'  =>         'Punctuation',
 875     'Pc' =>         'ConnectorPunctuation',
 876     'Pd' =>         'DashPunctuation',
 877     'Ps' =>         'OpenPunctuation',
 878     'Pe' =>         'ClosePunctuation',
 879     'Pi' =>         'InitialPunctuation',
 880     'Pf' =>         'FinalPunctuation',
 881     'Po' =>         'OtherPunctuation',
 882     'S'  =>         'Symbol',
 883     'Sm' =>         'MathSymbol',
 884     'Sc' =>         'CurrencySymbol',
 885     'Sk' =>         'ModifierSymbol',
 886     'So' =>         'OtherSymbol',
 887     'Z'  =>         'Separator',
 888     'Zs' =>         'SpaceSeparator',
 889     'Zl' =>         'LineSeparator',
 890     'Zp' =>         'ParagraphSeparator',
 891     'C'  =>         'Other',
 892     'Cc' =>         'Control',
 893     'Cf' =>         'Format',
 894     'Cs' =>         'Surrogate',
 895     'Co' =>         'PrivateUse',
 896     'Cn' =>         'Unassigned',
 897  );
 898
 899 sub general_categories {
 900     return _dclone \%GENERAL_CATEGORIES;
 901 }
 902
 903 =head2 B<general_categories()>
 904
 905     use Unicode::UCD 'general_categories';
 906
 907     my $categories = general_categories();
 908
 909 This returns a reference to a hash which has short
 910 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
 911 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
 912 C<Symbol>) as values.  The hash is reversible in case you need to go
 913 from the long names to the short names.  The general category is the
 914 one returned from
 915 L</charinfo()> under the C<category> key.
 916
 917 The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
 918 alternative to this function; the first returning a simple list of the short
 919 category names; and the second gets all the synonyms of a given category name.
 920
 921 =cut
 922
 923 my %BIDI_TYPES =
 924  (
 925    'L'   => 'Left-to-Right',
 926    'LRE' => 'Left-to-Right Embedding',
 927    'LRO' => 'Left-to-Right Override',
 928    'R'   => 'Right-to-Left',
 929    'AL'  => 'Right-to-Left Arabic',
 930    'RLE' => 'Right-to-Left Embedding',
 931    'RLO' => 'Right-to-Left Override',
 932    'PDF' => 'Pop Directional Format',
 933    'EN'  => 'European Number',
 934    'ES'  => 'European Number Separator',
 935    'ET'  => 'European Number Terminator',
 936    'AN'  => 'Arabic Number',
 937    'CS'  => 'Common Number Separator',
 938    'NSM' => 'Non-Spacing Mark',
 939    'BN'  => 'Boundary Neutral',
 940    'B'   => 'Paragraph Separator',
 941    'S'   => 'Segment Separator',
 942    'WS'  => 'Whitespace',
 943    'ON'  => 'Other Neutrals',
 944  );
 945
 946 =head2 B<bidi_types()>
 947
 948     use Unicode::UCD 'bidi_types';
 949
 950     my $categories = bidi_types();
 951
 952 This returns a reference to a hash which has the short
 953 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
 954 names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
 955 hash is reversible in case you need to go from the long names to the
 956 short names.  The bidi type is the one returned from
 957 L</charinfo()>
 958 under the C<bidi> key.  For the exact meaning of the various bidi classes
 959 the Unicode TR9 is recommended reading:
 960 L<http://www.unicode.org/reports/tr9/>
 961 (as of Unicode 5.0.0)
 962
 963 The L</prop_values()> and L</prop_value_aliases()> functions can be used as an
 964 alternative to this function; the first returning a simple list of the short
 965 bidi type names; and the second gets all the synonyms of a given bidi type
 966 name.
 967
 968 =cut
 969
 970 sub bidi_types {
 971     return _dclone \%BIDI_TYPES;
 972 }
 973
 974 =head2 B<compexcl()>
 975
 976     use Unicode::UCD 'compexcl';
 977
 978     my $compexcl = compexcl(0x09dc);
 979
 980 This routine returns C<undef> if the Unicode version being used is so early
 981 that it doesn't have this property.
 982
 983 C<compexcl()> is included for backwards
 984 compatibility, but as of Perl 5.12 and more modern Unicode versions, for
 985 most purposes it is probably more convenient to use one of the following
 986 instead:
 987
 988     my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex}/;
 989     my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion}/;
 990
 991 or even
 992
 993     my $compexcl = chr(0x09dc) =~ /\p{CE}/;
 994     my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion}/;
 995
 996 The first two forms return B<true> if the L</code point argument> should not
 997 be produced by composition normalization.  For the final two forms to return
 998 B<true>, it is additionally required that this fact not otherwise be
 999 determinable from the Unicode data base.
1000
1001 This routine behaves identically to the final two forms.  That is,
1002 it does not return B<true> if the code point has a decomposition
1003 consisting of another single code point, nor if its decomposition starts
1004 with a code point whose combining class is non-zero.  Code points that meet
1005 either of these conditions should also not be produced by composition
1006 normalization, which is probably why you should use the
1007 C<Full_Composition_Exclusion> property instead, as shown above.
1008
1009 The routine returns B<false> otherwise.
1010
1011 =cut
1012
# Tell whether the given code point argument matches
# \p{Composition_Exclusion}.
#
# Croaks if _getcode() cannot turn the argument into a code point.  Returns
# nothing (undef / empty list) when the Unicode version in use is earlier
# than 3.0.0; otherwise returns the result of the regex match.
sub compexcl {
    my ($arg) = @_;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::compexcl: unknown code '$arg'";

    if (! defined $v_unicode_version) {
        UnicodeVersion();
    }
    if ($v_unicode_version lt v3.0.0) {
        return;
    }

    # The argument may be above the Unicode range; don't warn about that.
    no warnings "non_unicode";
    return chr($code) =~ /\p{Composition_Exclusion}/;
}
1025
1026 =head2 B<casefold()>
1027
1028     use Unicode::UCD 'casefold';
1029
1030     my $casefold = casefold(0xDF);
1031     if (defined $casefold) {
1032         my @full_fold_hex = split / /, $casefold->{'full'};
1033         my $full_fold_string =
1034                     join "", map {chr(hex($_))} @full_fold_hex;
1035         my @turkic_fold_hex =
1036                         split / /, ($casefold->{'turkic'} ne "")
1037                                         ? $casefold->{'turkic'}
1038                                         : $casefold->{'full'};
1039         my $turkic_fold_string =
1040                         join "", map {chr(hex($_))} @turkic_fold_hex;
1041     }
1042     if (defined $casefold && $casefold->{'simple'} ne "") {
1043         my $simple_fold_hex = $casefold->{'simple'};
1044         my $simple_fold_string = chr(hex($simple_fold_hex));
1045     }
1046
1047 This returns the (almost) locale-independent case folding of the
1048 character specified by the L</code point argument>.  (Starting in Perl v5.16,
1049 the core function C<fc()> returns the C<full> mapping (described below)
1050 faster than this does, and for entire strings.)
1051
1052 If there is no case folding for the input code point, C<undef> is returned.
1053
1054 If there is a case folding for that code point, a reference to a hash
1055 with the following fields is returned:
1056
1057 =over
1058
1059 =item B<code>
1060
1061 the input native L</code point argument> expressed in hexadecimal, with
1062 leading zeros
1063 added if necessary to make it contain at least four hexdigits
1064
1065 =item B<full>
1066
1067 one or more codes (separated by spaces) that, taken in order, give the
1068 code points for the case folding for I<code>.
1069 Each has at least four hexdigits.
1070
1071 =item B<simple>
1072
1073 is empty, or is exactly one code with at least four hexdigits which can be used
1074 as an alternative case folding when the calling program cannot cope with the
1075 fold being a sequence of multiple code points.  If I<full> is just one code
1076 point, then I<simple> equals I<full>.  If there is no single code point folding
1077 defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
1078 inferior, but still better-than-nothing alternative folding to I<full>.
1079
1080 =item B<mapping>
1081
1082 is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
1083 otherwise.  It can be considered to be the simplest possible folding for
1084 I<code>.  It is defined primarily for backwards compatibility.
1085
1086 =item B<status>
1087
1088 is C<C> (for C<common>) if the best possible fold is a single code point
1089 (I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
1090 folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
1091 there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
1092 Note that this
1093 describes the contents of I<mapping>.  It is defined primarily for backwards
1094 compatibility.
1095
1096 For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be
1097 C<I> which is the same as C<C> but is a special case for dotted uppercase I and
1098 dotless lowercase i:
1099
1100 =over
1101
1102 =item Z<>B<*> If you use this C<I> mapping
1103
1104 the result is case-insensitive,
1105 but dotless and dotted I's are not distinguished
1106
1107 =item Z<>B<*> If you exclude this C<I> mapping
1108
1109 the result is not fully case-insensitive, but
1110 dotless and dotted I's are distinguished
1111
1112 =back
1113
1114 =item B<turkic>
1115
1116 contains any special folding for Turkic languages.  For versions of Unicode
1117 starting with 3.2, this field is empty unless I<code> has a different folding
1118 in Turkic languages, in which case it is one or more codes (separated by
1119 spaces) that, taken in order, give the code points for the case folding for
1120 I<code> in those languages.
1121 Each code has at least four hexdigits.
1122 Note that this folding does not maintain canonical equivalence without
1123 additional processing.
1124
1125 For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
1126 there is a
1127 special folding for Turkic languages, in which case I<status> is C<I>, and
1128 I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
1129
1130 =back
1131
1132 Programs that want complete generality and the best folding results should use
1133 the folding contained in the I<full> field.  But note that the fold for some
1134 code points will be a sequence of multiple code points.
1135
1136 Programs that can't cope with the fold mapping being multiple code points can
1137 use the folding contained in the I<simple> field, with the loss of some
1138 generality.  In Unicode 5.1, about 7% of the defined foldings have no single
1139 code point folding.
1140
1141 The I<mapping> and I<status> fields are provided for backwards compatibility for
1142 existing programs.  They contain the same values as in previous versions of
1143 this function.
1144
1145 The folding is not completely locale-independent.  The I<turkic> field
1146 contains results to use when the locale is a Turkic language.
1147
1148 For more information about case mappings see
1149 L<http://www.unicode.org/unicode/reports/tr21>
1150
1151 =cut
1152
# Cache of case folding records, keyed by (decimal) code point.  Each value
# is a hash with the keys 'code', 'status', 'full', 'simple', 'mapping' and
# 'turkic'.
my %CASEFOLD;

# Build %CASEFOLD from the inversion maps that prop_invmap() returns for
# Case_Folding and Simple_Case_Folding, then add the hard-coded Turkic rules.
# Does nothing if the cache has already been filled.
sub _casefold {
    return if %CASEFOLD;

    # Pass 1: the full foldings.  Every code point outside the default ranges
    # gets a complete record, on the provisional assumption that a
    # multi-character fold has no simple counterpart.
    my ($full_starts, $full_maps, undef, $default)
                                            = prop_invmap('Case_Folding');

    for my $range (0 .. @$full_starts - 2) {
        my $map = $full_maps->[$range];
        next if $map == $default;

        my $first = $full_starts->[$range];
        for my $cp ($first .. $full_starts->[$range + 1] - 1) {
            my $hex_cp = sprintf("%04X", $cp);

            if (ref $map) {

                # Multi-character fold: the map entry is a list of code
                # points.  (prop_invmap ensures such a range needs no
                # per-code-point adjustment.)
                my $fold = join " ", map { sprintf "%04X", $_ } @{$map};
                $CASEFOLD{$cp} = {
                    'code'    => $hex_cp,
                    'status'  => 'F',
                    'full'    => $fold,
                    'mapping' => $fold,
                    'simple'  => "",
                    'turkic'  => "",
                };
            }
            else {

                # Single character fold.  The map gives the fold of the
                # first code point in the range; each later one is offset by
                # its distance from the start of the range.
                my $fold = sprintf("%04X", $map + ($cp - $first));
                $CASEFOLD{$cp} = {
                    'code'    => $hex_cp,
                    'status'  => 'C',
                    'full'    => $fold,
                    'simple'  => $fold,
                    'mapping' => $fold,
                    'turkic'  => "",
                };
            }
        }
    }

    # Pass 2: correct the records for which the "no simple fold" assumption
    # above was wrong.  Records already marked 'C' need no change.
    my ($simple_starts, $simple_maps, undef, $simple_default)
                                    = prop_invmap('Simple_Case_Folding');

    for my $range (0 .. @$simple_starts - 2) {
        my $map = $simple_maps->[$range];
        next if $map == $simple_default;

        my $first = $simple_starts->[$range];
        for my $cp ($first .. $simple_starts->[$range + 1] - 1) {
            next if $CASEFOLD{$cp}{'status'} eq 'C';

            my $fold = sprintf("%04X", $map + ($cp - $first));
            @{ $CASEFOLD{$cp} }{qw(status simple mapping code turkic)}
                            = ('S', $fold, $fold, sprintf("%04X", $cp), "");
        }
    }

    # Pass 3: the Turkic rules, which are hard-coded here.
    UnicodeVersion() unless defined $v_unicode_version;
    if ($v_unicode_version ge v3.2.0) {

        # Both code points should already have regular records, so only the
        # turkic field needs filling in.
        $CASEFOLD{ord('I')}{'turkic'} = '0131';
        $CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
    }
    elsif ($v_unicode_version ge v3.1.0) {

        # Neither code point has a record otherwise; every fold field of
        # both is lowercase 'i', and the status is the special 'I'.
        my $small_i = sprintf "%04X", ord('i');
        for my $cp (0x130, 0x131) {
            $CASEFOLD{$cp}{'code'} = sprintf("%04X", $cp);
            @{ $CASEFOLD{$cp} }{qw(status turkic mapping full simple)}
                                                = ('I', ($small_i) x 4);
        }
    }

    return;
}
1240
# Return the case folding record for the given code point argument, or undef
# if %CASEFOLD has no entry for it.
#
# Croaks if _getcode() cannot turn the argument into a code point.
#
# The record is passed through _dclone() before being returned, as casespec()
# and all_casefolds() do, so that a caller who modifies the returned hash
# cannot alter the cached entry that later calls will see.
sub casefold {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::casefold: unknown code '$arg'"
        unless defined $code;

    _casefold() unless %CASEFOLD;

    my $entry = $CASEFOLD{$code};
    return ref($entry) ? _dclone($entry) : $entry;
}
1251
1252 =head2 B<all_casefolds()>
1253
1254
1255     use Unicode::UCD 'all_casefolds';
1256
1257     my $all_folds_ref = all_casefolds();
1258     foreach my $char_with_casefold (sort { $a <=> $b }
1259                                     keys %$all_folds_ref)
1260     {
1261         printf "%04X:", $char_with_casefold;
1262         my $casefold = $all_folds_ref->{$char_with_casefold};
1263
1264         # Get folds for $char_with_casefold
1265
1266         my @full_fold_hex = split / /, $casefold->{'full'};
1267         my $full_fold_string =
1268                     join "", map {chr(hex($_))} @full_fold_hex;
1269         print " full=", join " ", @full_fold_hex;
1270         my @turkic_fold_hex =
1271                         split / /, ($casefold->{'turkic'} ne "")
1272                                         ? $casefold->{'turkic'}
1273                                         : $casefold->{'full'};
1274         my $turkic_fold_string =
1275                         join "", map {chr(hex($_))} @turkic_fold_hex;
1276         print "; turkic=", join " ", @turkic_fold_hex;
1277         if (defined $casefold && $casefold->{'simple'} ne "") {
1278             my $simple_fold_hex = $casefold->{'simple'};
1279             my $simple_fold_string = chr(hex($simple_fold_hex));
1280             print "; simple=$simple_fold_hex";
1281         }
1282         print "\n";
1283     }
1284
1285 This returns all the case foldings in the current version of Unicode in the
1286 form of a reference to a hash.  Each key to the hash is the decimal
1287 representation of a Unicode character that has a casefold to other than
1288 itself.  The casefold of a semi-colon is itself, so it isn't in the hash;
1289 likewise for a lowercase "a", but there is an entry for a capital "A".  The
1290 hash value for each key is another hash, identical to what is returned by
1291 L</casefold()> if called with that code point as its argument.  So the value
1292 C<< all_casefolds()->{ord("A")} >> is equivalent to C<casefold(ord("A"))>.
1293
1294 =cut
1295
# Return the result of _dclone() on the complete case folding table, building
# the table first if it is still empty.  Takes no arguments.
sub all_casefolds () {
    if (! %CASEFOLD) {
        _casefold();
    }

    return _dclone(\%CASEFOLD);
}
1300
1301 =head2 B<casespec()>
1302
1303     use Unicode::UCD 'casespec';
1304
1305     my $casespec = casespec(0xFB00);
1306
1307 This returns the potentially locale-dependent case mappings of the L</code point
1308 argument>.  The mappings may be longer than a single code point (which the basic
1309 Unicode case mappings as returned by L</charinfo()> never are).
1310
1311 If there are no case mappings for the L</code point argument>, or if all three
1312 possible mappings (I<lower>, I<title> and I<upper>) result in single code
1313 points and are locale independent and unconditional, C<undef> is returned
1314 (which means that the case mappings, if any, for the code point are those
1315 returned by L</charinfo()>).
1316
1317 Otherwise, a reference to a hash giving the mappings (or a reference to a hash
1318 of such hashes, explained below) is returned with the following keys and their
1319 meanings:
1320
1321 The keys in the bottom layer hash with the meanings of their values are:
1322
1323 =over
1324
1325 =item B<code>
1326
1327 the input native L</code point argument> expressed in hexadecimal, with
1328 leading zeros
1329 added if necessary to make it contain at least four hexdigits
1330
1331 =item B<lower>
1332
1333 one or more codes (separated by spaces) that, taken in order, give the
1334 code points for the lower case of I<code>.
1335 Each has at least four hexdigits.
1336
1337 =item B<title>
1338
1339 one or more codes (separated by spaces) that, taken in order, give the
1340 code points for the title case of I<code>.
1341 Each has at least four hexdigits.
1342
1343 =item B<upper>
1344
1345 one or more codes (separated by spaces) that, taken in order, give the
1346 code points for the upper case of I<code>.
1347 Each has at least four hexdigits.
1348
1349 =item B<condition>
1350
1351 the conditions for the mappings to be valid.
1352 If C<undef>, the mappings are always valid.
1353 When defined, this field is a list of conditions,
1354 all of which must be true for the mappings to be valid.
1355 The list consists of one or more
1356 I<locales> (see below)
1357 and/or I<contexts> (explained in the next paragraph),
1358 separated by spaces.
1359 (Other than as used to separate elements, spaces are to be ignored.)
1360 Case distinctions in the condition list are not significant.
1361 Conditions preceded by "NON_" represent the negation of the condition.
1362
1363 A I<context> is one of those defined in the Unicode standard.
1364 For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1365 available at
1366 L<http://www.unicode.org/versions/Unicode5.1.0/>.
1367 These are for context-sensitive casing.
1368
1369 =back
1370
1371 The hash described above is returned for locale-independent casing, where
1372 at least one of the mappings has length longer than one.  If C<undef> is
1373 returned, the code point may have mappings, but if so, all are length one,
1374 and are returned by L</charinfo()>.
1375 Note that when this function does return a value, it will be for the complete
1376 set of mappings for a code point, even those whose length is one.
1377
1378 If there are additional casing rules that apply only in certain locales,
1379 an additional key for each will be defined in the returned hash.  Each such key
1380 will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1381 followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1382 and a variant code).  For the lists of all possible locales, see
1383 L<Locale::Country> and L<Locale::Language>.
1384 (In Unicode 6.0, the only locales returned by this function
1385 are C<lt>, C<tr>, and C<az>.)
1386
1387 Each locale key is a reference to a hash that has the form above, and gives
1388 the casing rules for that particular locale, which take precedence over the
1389 locale-independent ones when in that locale.
1390
1391 If the only casing for a code point is locale-dependent, then the returned
1392 hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1393 will contain only locale keys.
1394
1395 For more information about case mappings see
1396 L<http://www.unicode.org/unicode/reports/tr21/>
1397
1398 =cut
1399
# Cache of special casing records, keyed by (decimal) code point.  A value is
# either a single record (keys: code, lower, title, upper, condition) or a
# hash of such records keyed by locale.
my %CASESPEC;

# Fill %CASESPEC from SpecialCasing.txt, unless it already has entries.
# For Unicode versions before 2.1.8 the file is not read and the table is
# left empty.
sub _casespec {
    unless (%CASESPEC) {
        UnicodeVersion() unless defined $v_unicode_version;
        if ($v_unicode_version lt v2.1.8) {

            # Assign an empty list, not an anonymous hash: '= {}' would
            # store the stringified reference as a bogus key and emit a
            # "Reference found where even-sized list expected" warning.
            %CASESPEC = ();
        }
        elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
            local $_;
            local $/ = "\n";
            while (<$CASESPECFH>) {

                # Data lines are: code; lower; title; upper; [condition]
                # Each mapping is zero or more space-separated hex codes.
                if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {

                    my ($hexcode, $lower, $title, $upper, $condition) =
                        ($1, $2, $3, $4, $5);
                    if (! IS_ASCII_PLATFORM) { # Remap entry to native
                        foreach my $var_ref (\$hexcode,
                                             \$lower,
                                             \$title,
                                             \$upper)
                        {
                            next unless defined $$var_ref;
                            $$var_ref = join " ",
                                        map { sprintf("%04X",
                                              utf8::unicode_to_native(hex $_)) }
                                        split " ", $$var_ref;
                        }
                    }

                    my $code = hex($hexcode);

                    # In 2.1.8, there were duplicate entries; ignore all but
                    # the first one -- there were no conditions in the file
                    # anyway.
                    if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8)
                    {

                        # A second entry for the same code point.  If the
                        # first one was stored as a bare record and carries
                        # a condition, move it under the locale that starts
                        # its condition, so both entries can be kept.
                        if (exists $CASESPEC{$code}->{code}) {
                            my ($oldlower,
                                $oldtitle,
                                $oldupper,
                                $oldcondition) =
                                    @{$CASESPEC{$code}}{qw(lower
                                                           title
                                                           upper
                                                           condition)};
                            if (defined $oldcondition) {
                                my ($oldlocale) =
                                ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
                                delete $CASESPEC{$code};
                                $CASESPEC{$code}->{$oldlocale} =
                                { code      => $hexcode,
                                  lower     => $oldlower,
                                  title     => $oldtitle,
                                  upper     => $oldupper,
                                  condition => $oldcondition };
                            }
                        }

                        # File the new entry under the locale that starts
                        # its own condition.
                        my ($locale) =
                            ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
                        $CASESPEC{$code}->{$locale} =
                        { code      => $hexcode,
                          lower     => $lower,
                          title     => $title,
                          upper     => $upper,
                          condition => $condition };
                    } else {

                        # First entry seen for this code point: store the
                        # record directly.
                        $CASESPEC{$code} =
                        { code      => $hexcode,
                          lower     => $lower,
                          title     => $title,
                          upper     => $upper,
                          condition => $condition };
                    }
                }
            }
            close($CASESPECFH);
        }
    }
}
1480
# Return the special casing record (or hash of per-locale records) for the
# given code point argument, or undef if %CASESPEC has no entry for it.
#
# Croaks if _getcode() cannot turn the argument into a code point.  A found
# entry is passed through _dclone() before being returned.
sub casespec {
    my ($arg) = @_;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::casespec: unknown code '$arg'";

    if (! %CASESPEC) {
        _casespec();
    }

    my $entry = $CASESPEC{$code};
    return ref($entry) ? _dclone($entry) : $entry;
}
1491
1492 =head2 B<namedseq()>
1493
1494     use Unicode::UCD 'namedseq';
1495
1496     my $namedseq = namedseq("KATAKANA LETTER AINU P");
1497     my @namedseq = namedseq("KATAKANA LETTER AINU P");
1498     my %namedseq = namedseq();
1499
1500 If used with a single argument in a scalar context, returns the string
1501 consisting of the code points of the named sequence, or C<undef> if no
1502 named sequence by that name exists.  If used with a single argument in
1503 a list context, it returns the list of the ordinals of the code points.
1504
1505 If used with no
1506 arguments in a list context, it returns a hash with the names of all the
1507 named sequences as the keys and their sequences as strings as
1508 the values.  Otherwise, it returns C<undef> or an empty list depending
1509 on the context.
1510
1511 This function only operates on officially approved (not provisional) named
1512 sequences.
1513
1514 Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1515 sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1516 LETTER AINU P")> will return the same string this function does, but will also
1517 operate on character names that aren't named sequences, without you having to
1518 know which are which.  See L<charnames>.
1519
1520 =cut
1521
# Cache mapping a sequence name to the string made of its code points.
my %NAMEDSEQ;

# Fill %NAMEDSEQ from Name.pl, unless it already has entries or the file
# cannot be opened.
sub _namedseq {
    return if %NAMEDSEQ;
    return unless openunicode(\$NAMEDSEQFH, "Name.pl");

    local $_;
    local $/ = "\n";
    while (<$NAMEDSEQFH>) {

        # Only lines that begin with hex digits followed by a space are
        # used; each is "hex codes <TAB> name".
        next unless /^ [0-9A-F]+ \  /x;

        chomp;
        my ($sequence, $name) = split /\t/;
        $NAMEDSEQ{$name}
                    = join "", map { chr(hex($_)) } split(' ', $sequence);
    }
    close($NAMEDSEQFH);

    return;
}
1541
# Look up named sequences.  The result depends on the calling context and on
# the number of arguments:
#
#   list context,   no argument:   the whole name => string hash
#   list context,   one argument:  the ordinals of the sequence's code points
#   scalar context, one argument:  the sequence as a string
#   anything else:                 undef or the empty list
#
# Single lookups go through charnames::string_vianame() until some caller
# has asked for the whole hash; once %NAMEDSEQ has been read in, it is
# consulted instead, as that is faster.
sub namedseq {
    my $context = wantarray();

    # Void context: nothing to return.
    return unless defined $context;

    if (! $context) {

        # Scalar context handles only a single name.
        return unless @_ == 1;
        return %NAMEDSEQ ? $NAMEDSEQ{ $_[0] }
                         : charnames::string_vianame($_[0]);
    }

    # List context from here on.
    if (@_ == 0) {
        _namedseq() unless %NAMEDSEQ;
        return %NAMEDSEQ;
    }
    return unless @_ == 1;

    my $string = %NAMEDSEQ ? $NAMEDSEQ{ $_[0] }
                           : charnames::string_vianame($_[0]);
    return defined $string ? map { ord($_) } split('', $string) : ();
}
1571
# Cache mapping a (decimal) code point to its numeric value.
my %NUMERIC;

# Fill %NUMERIC from the table To/Nv.pl.  Each entry read from the table is
# a [start, end, value] triple.  For values written as a fraction, also
# records the decimal => "n/d" pair in %real_to_rational, the reverse hash
# used by charinfo.
sub _numeric {
    for my $entry (_read_table("To/Nv.pl")) {
        my ($start, $end, $value) = @{$entry};

        my @parts = split m{/}, $value;
        if (@parts == 2) {

            # A rational: store it as a decimal number.
            my $real = $parts[0] / $parts[1];
            $real_to_rational{$real} = $value;

            # Should only be a single element, but just in case, give the
            # same value to the whole range.
            @NUMERIC{$start .. $end} = ($real) x ($end - $start + 1);
        }
        else {

            # The table is in 'a' format: the value given is for the first
            # code point of the range, and each later one needs adjusting
            # by its offset from the start.
            $NUMERIC{$_} = $value + $_ - $start for $start .. $end;
        }
    }

    # Decided unsafe to use these that aren't officially part of the Unicode
    # standard.
    #use Math::Trig;
    #my $pi = acos(-1.0);
    #$NUMERIC{0x03C0} = $pi;

    # Euler's constant, not to be confused with Euler's number
    #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;

    # Euler's number
    #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;

    return;
}
1613
1614 =pod
1615
1616 =head2 B<num()>
1617
1618     use Unicode::UCD 'num';
1619
1620     my $val = num("123");
1621     my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
1622
1623 C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
1624 doesn't think the entire string has a completely valid, safe numeric value.
1625
1626 If the string is just one character in length, the Unicode numeric value
1627 is returned if it has one, or C<undef> otherwise.  Note that this need
1628 not be a whole number.  C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
1629 example, returns -0.5.
1630
1631 =cut
1632
1633 #A few characters to which Unicode doesn't officially
1634 #assign a numeric value are considered numeric by C<num>.
1635 #These are:
1636
1637 # EULER CONSTANT             0.5772...  (this is NOT Euler's number)
1638 # SCRIPT SMALL E             2.71828... (this IS Euler's number)
1639 # GREEK SMALL LETTER PI      3.14159...
1640
1641 =pod
1642
1643 If the string is more than one character, C<undef> is returned unless
1644 all its characters are decimal digits (that is, they would match C<\d+>),
1645 from the same script.  For example if you have an ASCII '0' and a Bengali
1646 '3', mixed together, they aren't considered a valid number, and C<undef>
1647 is returned.  A further restriction is that the digits all have to be of
1648 the same form.  A half-width digit mixed with a full-width one will
1649 return C<undef>.  The Arabic script has two sets of digits;  C<num> will
1650 return C<undef> unless all the digits in the string come from the same
1651 set.
1652
1653 C<num> errs on the side of safety, and there may be valid strings of
1654 decimal digits that it doesn't recognize.  Note that Unicode defines
1655 a number of "digit" characters that aren't "decimal digit" characters.
1656 "Decimal digits" have the property that they have a positional value, i.e.,
1657 there is a units position, a 10's position, a 100's, etc, AND they are
1658 arranged in Unicode in blocks of 10 contiguous code points.  The Chinese
1659 digits, for example, are not in such a contiguous block, and so Unicode
1660 doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1661 match them.  A single-character string containing one of these digits will
1662 have its decimal value returned by C<num>, but any longer string containing
1663 only these digits will return C<undef>.
1664
1665 Strings of multiple sub- and superscripts are not recognized as numbers.  You
1666 can use either of the compatibility decompositions in Unicode::Normalize to
1667 change these into digits, and then call C<num> on the result.
1668
1669 =cut
1670
1671 # To handle sub, superscripts, this could if called in list context,
1672 # consider those, and return the <decomposition> type in the second
1673 # array element.
1674
# num($string)
#
# Returns the numeric value of $string.
#
#   * Undefined or empty input: returns nothing (undef in scalar context, the
#     empty list in list context).
#   * One character: returns that character's entry in %NUMERIC, which is
#     undef if the character has no numeric value.
#   * Several characters: returns nothing unless every character matches \d
#     and all of them lie in the same run of ten consecutive code points that
#     starts at the character meaning zero; otherwise returns the decimal
#     value the digits spell out.
#
# %NUMERIC is loaded on first use by _numeric().
sub num {
    my $string = $_[0];

    # Nothing to evaluate.  Without this guard an empty string would reach the
    # arithmetic below with an undefined digit value (ord("") is 0, which has
    # no numeric value), producing "uninitialized" warnings on the way to
    # returning undef; an undefined argument would warn even earlier.
    return unless defined($string) && length($string);

    _numeric() unless %NUMERIC;

    my $length = length($string);
    return $NUMERIC{ord($string)} if $length == 1;
    return if $string =~ /\D/;
    my $first_ord = ord(substr($string, 0, 1));
    my $value = $NUMERIC{$first_ord};

    # To be a valid decimal number, it should be in a block of 10 consecutive
    # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
    # value is its offset in that block from the character that means zero.
    my $zero_ord = $first_ord - $value;

    # Unicode 6.0 instituted the rule that only digits in a consecutive
    # block of 10 would be considered decimal digits.  If this is an earlier
    # release, we verify that this first character is a member of such a
    # block.  That is, that the block of characters surrounding this one
    # consists of all \d characters whose numeric values are the expected
    # ones.
    UnicodeVersion() unless defined $v_unicode_version;
    if ($v_unicode_version lt v6.0.0) {
        for my $i (0 .. 9) {
            my $ord = $zero_ord + $i;
            return unless chr($ord) =~ /\d/;
            my $numeric = $NUMERIC{$ord};
            return unless defined $numeric;
            return unless $numeric == $i;
        }
    }

    for my $i (1 .. $length - 1) {

        # Here we know either by verifying, or by fact of the first character
        # being a \d in Unicode 6.0 or later, that any character between the
        # character that means 0, and 9 positions above it must be \d, and
        # must have its value correspond to its offset from the zero.  Any
        # characters outside these 10 do not form a legal number for this
        # function.
        my $ord = ord(substr($string, $i, 1));
        my $digit = $ord - $zero_ord;
        return unless $digit >= 0 && $digit <= 9;
        $value = $value * 10 + $digit;
    }

    return $value;
}
1724
1725 =pod
1726
1727 =head2 B<prop_aliases()>
1728
1729     use Unicode::UCD 'prop_aliases';
1730
1731     my ($short_name, $full_name, @other_names) = prop_aliases("space");
1732     my $same_full_name = prop_aliases("Space");     # Scalar context
1733     my ($same_short_name) = prop_aliases("Space");  # gets 0th element
1734     print "The full name is $full_name\n";
1735     print "The short name is $short_name\n";
1736     print "The other aliases are: ", join(", ", @other_names), "\n";
1737
1738     prints:
1739     The full name is White_Space
1740     The short name is WSpace
1741     The other aliases are: Space
1742
1743 Most Unicode properties have several synonymous names.  Typically, there is at
1744 least a short name, convenient to type, and a long name that more fully
1745 describes the property, and hence is more easily understood.
1746
1747 If you know one name for a Unicode property, you can use C<prop_aliases> to find
1748 either the long name (when called in scalar context), or a list of all of the
1749 names, somewhat ordered so that the short name is in the 0th element, the long
1750 name in the next element, and any other synonyms are in the remaining
1751 elements, in no particular order.
1752
1753 The long name is returned in a form nicely capitalized, suitable for printing.
1754
1755 The input parameter name is loosely matched, which means that white space,
1756 hyphens, and underscores are ignored (except for the trailing underscore in
1757 the old-form grandfathered-in C<"L_">, which is better written as C<"LC">, and
1758 both of which mean C<General_Category=Cased Letter>).
1759
1760 If the name is unknown, C<undef> is returned (or an empty list in list
1761 context).  Note that Perl typically recognizes property names in regular
1762 expressions with an optional C<"Is_"> (with or without the underscore)
1763 prefixed to them, such as C<\p{isgc=punct}>.  This function does not recognize
1764 those in the input, returning C<undef>.  Nor are they included in the output
1765 as possible synonyms.
1766
1767 C<prop_aliases> does know about the Perl extensions to Unicode properties,
1768 such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
1769 properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>.  The
1770 final example demonstrates that the C<"Is_"> prefix is recognized for these
1771 extensions; it is needed to resolve ambiguities.  For example,
1772 C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
1773 C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>.  This is
1774 because C<islc> is a Perl extension which is short for
1775 C<General_Category=Cased Letter>.  The lists returned for the Perl extensions
1776 will not include the C<"Is_"> prefix (whether or not the input had it) unless
1777 needed to resolve ambiguities, as shown in the C<"islc"> example, where the
1778 returned list had one element containing C<"Is_">, and the other without.
1779
1780 It is also possible for the reverse to happen:  C<prop_aliases('isc')> returns
1781 the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
1782 C<(C, Other)> (the latter being a Perl extension meaning
1783 C<General_Category=Other>).
1784 L<perluniprops/Properties accessible through Unicode::UCD> lists the available
1785 forms, including which ones are discouraged from use.
1786
1787 Those discouraged forms are accepted as input to C<prop_aliases>, but are not
1788 returned in the lists.  C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
1789 which are old synonyms for C<"Is_LC"> and should not be used in new code, are
1790 examples of this.  These both return C<(Is_LC, Cased_Letter)>.  Thus this
1791 function allows you to take a discouraged form, and find its acceptable
1792 alternatives.  The same goes with single-form Block property equivalences.
1793 Only the forms that begin with C<"In_"> are not discouraged; if you pass
1794 C<prop_aliases> a discouraged form, you will get back the equivalent ones that
1795 begin with C<"In_">.  It will otherwise look like a new-style block name (see
1796 L</Old-style versus new-style block names>).
1797
1798 C<prop_aliases> does not know about any user-defined properties, and will
1799 return C<undef> if called with one of those.  Likewise for Perl internal
1800 properties, with the exception of "Perl_Decimal_Digit" which it does know
1801 about (and which is documented below in L</prop_invmap()>).
1802
1803 =cut
1804
1805 # It may be that there are use cases where the discouraged forms should be
1806 # returned.  If that comes up, an optional boolean second parameter to the
1807 # function could be created, for example.
1808
# These are created by mktables for this routine and stored in unicore/UCD.pl
# where their structures are described.
our %string_property_loose_to_name;
our %ambiguous_names;
our %loose_perlprop_to_name;
our %prop_aliases;

# prop_aliases($name)
#
# Finds the synonyms of the property called $name.
#
# Parameter:
#   $name - any name of the property.  It is lower-cased and, unless that
#           form is a key of %utf8::stricter_to_file_of, loosely matched.
#
# Returns:
#   list context   - (short name, full name, any other aliases...)
#   scalar context - the full name
#   Nothing (undef / the empty list) if $name is undefined or unknown.
#
# Issues a carp() warning if a name that is listed in
# %utf8::loose_to_file_of cannot be resolved by any of the lookups below.
sub prop_aliases ($) {
    my $prop = $_[0];
    return unless defined $prop;

    require "unicore/UCD.pl";
    require "unicore/Heavy.pl";
    require "utf8_heavy.pl";

    # The property name may be loosely or strictly matched; we don't know yet.
    # But both types use lower-case.
    $prop = lc $prop;

    # It is loosely matched if its lower case isn't known to be strict.
    my $list_ref;
    if (! exists $utf8::stricter_to_file_of{$prop}) {
        my $loose = utf8::_loose_name($prop);

        # There is a hash that converts from any loose name to its standard
        # form, mapping all synonyms for a name to one name that can be used
        # as a key into another hash.  The whole concept is for memory
        # savings, as the second hash doesn't have to have all the
        # combinations.  Actually, there are two hashes that do the
        # conversion.  One is used in utf8_heavy.pl (stored in Heavy.pl) for
        # looking up properties matchable in regexes.  This function needs to
        # access string properties, which aren't available in regexes, so a
        # second conversion hash is made for them (stored in UCD.pl).  Look in
        # the string one now, as the rest can have an optional 'is' prefix,
        # which these don't.
        if (exists $string_property_loose_to_name{$loose}) {

            # Convert to its standard loose name.
            $prop = $string_property_loose_to_name{$loose};
        }
        else {
            my $retrying = 0;   # bool.  Has an initial 'is' been stripped?

            # A bare block is a loop that executes once, so 'redo LOOKUP'
            # below restarts the whole lookup after an 'is' prefix has been
            # removed from $loose.
          LOOKUP: {
                if (exists $utf8::loose_property_name_of{$loose}
                    && (! $retrying
                        || ! exists $ambiguous_names{$loose}))
                {
                    # Found an entry giving the standard form.  We don't get
                    # here (in the test above) when we've stripped off an
                    # 'is' and the result is an ambiguous name.  That is
                    # because these are official Unicode properties (though
                    # Perl can have an optional 'is' prefix meaning the
                    # official property), and all ambiguous cases involve a
                    # Perl single-form extension for the gc, script, or block
                    # properties, and the stripped 'is' means that they mean
                    # one of those, and not one of these
                    $prop = $utf8::loose_property_name_of{$loose};
                }
                elsif (exists $loose_perlprop_to_name{$loose}) {

                    # This hash is specifically for this function to list Perl
                    # extensions that aren't in the earlier hashes.  If there
                    # is only one element, the short and long names are
                    # identical.  Otherwise the form is already in the same
                    # form as %prop_aliases, which is handled at the end of
                    # the function.
                    $list_ref = $loose_perlprop_to_name{$loose};
                    if (@$list_ref == 1) {
                        $list_ref = [ $list_ref->[0], $list_ref->[0] ];
                    }
                }
                elsif (! exists $utf8::loose_to_file_of{$loose}) {

                    # loose_to_file_of is a complete list of loose names.  If
                    # not there, the input is unknown.
                    return;
                }
                elsif ($loose =~ / [:=] /x) {

                    # Here we found the name but not its aliases, so it has to
                    # exist.  Exclude property-value combinations.  (This
                    # shows up for something like ccc=vr which matches
                    # loosely, but is a synonym for ccc=9 which matches only
                    # strictly.)
                    return;
                }
                else {

                    # Here it has to exist, and isn't a property-value
                    # combination.  This means it must be one of the Perl
                    # single-form extensions.  First see if it is for a
                    # property-value combination in one of the following
                    # properties.
                    my @list;
                    foreach my $property ("gc", "script") {
                        @list = prop_value_aliases($property, $loose);
                        last if @list;
                    }
                    if (@list) {

                        # Here, it is one of those property-value combination
                        # single-form synonyms.  There are ambiguities with
                        # some of these.  Check against the list for these,
                        # and adjust if necessary.
                        foreach my $alias (@list) {
                            if (exists $ambiguous_names
                                       {utf8::_loose_name(lc $alias)})
                            {
                                # The ambiguity is resolved by toggling
                                # whether or not it has an 'is' prefix
                                $alias =~ s/^Is_// or $alias =~ s/^/Is_/;
                            }
                        }

                        # The full name is in element 1.  A plain
                        # 'return @list' would hand a scalar-context caller
                        # the number of elements rather than a name.
                        return wantarray ? @list : $list[1];
                    }

                    # Here, it wasn't one of the gc or script single-form
                    # extensions.  It could be a block property single-form
                    # extension.  An 'in' prefix definitely means that, and
                    # should be looked up without the prefix.  However,
                    # starting in Unicode 6.1, we have to special case
                    # 'indic...', as there is a property that begins with that
                    # name.   We shouldn't strip the 'in' from that.   I'm
                    # (khw) generalizing this to 'indic' instead of the single
                    # property, because I suspect that others of this class
                    # may come along in the future.  However, this could
                    # backfire and a block created whose name begins with
                    # 'dic...', and we would want to strip the 'in'.  At which
                    # point this would have to be tweaked.
                    my $began_with_in = $loose =~ s/^in(?!dic)//;
                    @list = prop_value_aliases("block", $loose);
                    if (@list) {
                        s/^/In_/ for @list;

                        # As above: give scalar-context callers the full name
                        # (element 1), not the element count.
                        return wantarray ? @list : $list[1];
                    }

                    # Here still haven't found it.  The last opportunity for
                    # it being valid is only if it began with 'is'.  We retry
                    # without the 'is', setting a flag to that effect so that
                    # we don't accept things that begin with 'isis...'
                    if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
                        $retrying = 1;
                        redo LOOKUP;
                    }

                    # Here, didn't find it.  Since it was in
                    # %loose_to_file_of, we should have been able to find it.
                    carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'.  Send bug report to perlbug\@perl.org";
                    return;
                }
            }
        }
    }

    if (! $list_ref) {
        # Here, we have set $prop to a standard form name of the input.  Look
        # it up in the structure created by mktables for this purpose, which
        # contains both strict and loosely matched properties.  Avoid
        # autovivifying.
        $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
        return unless $list_ref;
    }

    # The full name is in element 1.
    return $list_ref->[1] unless wantarray;

    # Hand back a copy so that callers cannot alter the shared table.
    return @{ _dclone($list_ref) };
}
1974
1975 =pod
1976
1977 =head2 B<prop_values()>
1978
1979     use Unicode::UCD 'prop_values';
1980
1981     print "AHex values are: ", join(", ", prop_values("AHex")),
1982                                "\n";
1983   prints:
1984     AHex values are: N, Y
1985
1986 Some Unicode properties have a restricted set of legal values.  For example,
1987 all binary properties are restricted to just C<true> or C<false>; and there
1988 are only a few dozen possible General Categories.  Use C<prop_values>
1989 to find out if a given property is one such, and if so, to get a list of the
1990 values:
1991
1992     print join ", ", prop_values("NFC_Quick_Check");
1993   prints:
1994     M, N, Y
1995
1996 If the property doesn't have such a restricted set, C<undef> is returned.
1997
1998 There are usually several synonyms for each possible value.  Use
1999 L</prop_value_aliases()> to access those.
2000
2001 Case, white space, hyphens, and underscores are ignored in the input property
2002 name (except for the trailing underscore in the old-form grandfathered-in
2003 general category property value C<"L_">, which is better written as C<"LC">).
2004
2005 If the property name is unknown, C<undef> is returned.  Note that Perl typically
2006 recognizes property names in regular expressions with an optional C<"Is_">
2007 (with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
2008 This function does not recognize those in the property parameter, returning
2009 C<undef>.
2010
2011 For the block property, new-style block names are returned (see
2012 L</Old-style versus new-style block names>).
2013
2014 C<prop_values> does not know about any user-defined properties, and
2015 will return C<undef> if called with one of those.
2016
2017 =cut
2018
# These are created by mktables for this module and stored in unicore/UCD.pl
# where their structures are described.
our %loose_to_standard_value;
our %prop_value_aliases;

# prop_values($name)
#
# Lists the values that the property called $name may take.
#
# Parameter:
#   $name - any name accepted by prop_aliases() for the property.
#
# Returns one name per value (element 0 of each value's alias list in
# %prop_value_aliases), ordered case-insensitively by the hash's value keys.
# Returns undef if $name is undefined, is not recognized by prop_aliases(),
# or has no entry in %prop_value_aliases.
#
# Note that the failure cases use an explicit 'return undef', so a
# list-context caller receives a one-element list holding undef.
sub prop_values ($) {
    my ($name) = @_;
    return undef if ! defined $name;

    require "unicore/UCD.pl";
    require "utf8_heavy.pl";

    # Element 0 of what prop_aliases() returns is the synonym from which the
    # key used by the other hashes is derived.
    my ($short_name) = prop_aliases($name);
    return undef unless $short_name;
    my $key = utf8::_loose_name(lc $short_name);

    # Here is a legal property; test with 'exists' so that nothing gets
    # autovivified for one that has no table of values.
    return undef if ! exists $prop_value_aliases{$key};

    my $aliases_of = $prop_value_aliases{$key};
    return map  { $aliases_of->{$_}[0] }
           sort { lc $a cmp lc $b }
           keys %$aliases_of;
}
2047
2048 =pod
2049
2050 =head2 B<prop_value_aliases()>
2051
2052     use Unicode::UCD 'prop_value_aliases';
2053
2054     my ($short_name, $full_name, @other_names)
2055                                    = prop_value_aliases("Gc", "Punct");
2056     my $same_full_name = prop_value_aliases("Gc", "P");   # Scalar cntxt
2057     my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
2058                                                            # element
2059     print "The full name is $full_name\n";
2060     print "The short name is $short_name\n";
2061     print "The other aliases are: ", join(", ", @other_names), "\n";
2062
2063   prints:
2064     The full name is Punctuation
2065     The short name is P
2066     The other aliases are: Punct
2067
2068 Some Unicode properties have a restricted set of legal values.  For example,
2069 all binary properties are restricted to just C<true> or C<false>; and there
2070 are only a few dozen possible General Categories.
2071
2072 You can use L</prop_values()> to find out if a given property is one which has
2073 a restricted set of values, and if so, what those values are.  But usually
2074 each value actually has several synonyms.  For example, in Unicode binary
2075 properties, I<truth> can be represented by any of the strings "Y", "Yes", "T",
2076 or "True"; and the General Category "Punctuation" by that string, or "Punct",
2077 or simply "P".
2078
2079 Like property names, there is typically at least a short name for each such
2080 property-value, and a long name.  If you know any name of the property-value
2081 (which you can get by L</prop_values()>), you can use C<prop_value_aliases()>
2082 to get the long name (when called in scalar context), or a list of all the
2083 names, with the short name in the 0th element, the long name in the next
2084 element, and any other synonyms in the remaining elements, in no particular
2085 order, except that any all-numeric synonyms will be last.
2086
2087 The long name is returned in a form nicely capitalized, suitable for printing.
2088
2089 Case, white space, hyphens, and underscores are ignored in the input parameters
2090 (except for the trailing underscore in the old-form grandfathered-in general
2091 category property value C<"L_">, which is better written as C<"LC">).
2092
2093 If either name is unknown, C<undef> is returned.  Note that Perl typically
2094 recognizes property names in regular expressions with an optional C<"Is_">
2095 (with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
2096 This function does not recognize those in the property parameter, returning
2097 C<undef>.
2098
2099 If called with a property that doesn't have synonyms for its values, it
2100 returns the input value, possibly normalized with capitalization and
2101 underscores, but not necessarily checking that the input value is valid.
2102
2103 For the block property, new-style block names are returned (see
2104 L</Old-style versus new-style block names>).
2105
2106 To find the synonyms for single-forms, such as C<\p{Any}>, use
2107 L</prop_aliases()> instead.
2108
2109 C<prop_value_aliases> does not know about any user-defined properties, and
2110 will return C<undef> if called with one of those.
2111
2112 =cut
2113
# prop_value_aliases($property, $value)
#
# Finds the synonyms of one value of one property.
#
# Parameters:
#   $property - any name accepted by prop_aliases() for the property
#   $value    - any name of the property value; matched case-insensitively
#
# Returns:
#   list context   - (short name, full name, any other aliases...); when the
#                    table holds a single name for the value, that name is
#                    given twice
#   scalar context - the full name
#   Nothing (undef / the empty list) if either argument is undefined, or the
#   property or the value is not recognized.
#   For a property that is known to %prop_aliases but has no entry in
#   %prop_value_aliases, $value is returned exactly as it was passed in.
sub prop_value_aliases ($$) {
    my ($property, $given_value) = @_;
    return if ! defined $property || ! defined $given_value;

    require "unicore/UCD.pl";
    require "utf8_heavy.pl";

    # Element 0 of what prop_aliases() returns is the synonym from which the
    # key used by the other hashes is derived.
    my ($short_name) = prop_aliases($property);
    return unless $short_name;
    my $key = utf8::_loose_name(lc $short_name);

    # The hash below (created by mktables for this purpose) only knows about
    # the properties that have a very finite number of potential values, that
    # is, not ones whose value could be anything, like most (if not all)
    # string properties.  Those don't have synonyms anyway; for example, there
    # is no synonym for ('Uppercase_Mapping', 'A').
    if (! exists $prop_value_aliases{$key}) {

        # The property is legal, so when it is in the prop_aliases hash the
        # input value is assumed to be valid, as documented, and is simply
        # handed back.
        return $given_value if exists $Unicode::UCD::prop_aliases{$key};

        # Otherwise it must be a Perl extension.  All Perl extensions are
        # binary, hence enumerated, which means that the unknown input value
        # is illegal.
        return;
    }

    # The value name may be loosely or strictly matched; we don't know yet.
    # But both types use lower-case.
    my $lc_value    = lc $given_value;
    my $loose_value = utf8::_loose_name($lc_value);
    my $loose_pair  = "$key=$loose_value";

    # If the name isn't found under loose matching, it certainly won't be
    # found under strict.
    return unless exists $loose_to_standard_value{$loose_pair};

    # Similarly, if the combination doesn't exist under loose matching, it
    # won't exist under strict.
    my $standard_value = $loose_to_standard_value{$loose_pair};
    my $names_of       = $prop_value_aliases{$key};
    return unless exists $names_of->{$standard_value};

    # Here we did find a combination under loose matching rules.  But it could
    # be that it is a strict property match that shouldn't have matched.
    # %prop_value_aliases is set up so that the strict matches will appear as
    # if they were in loose form.  Thus, if the non-loose version is legal,
    # we're ok and can skip the further check.
    #
    # We're also ok and skip the further check if the value loosely matches.
    # mktables has verified that no strict name under loose rules maps to an
    # existing loose name.  This code relies on the very limited circumstances
    # that strict names can be here.  Strict name matching happens under two
    # conditions:
    # 1) When the name begins with an underscore.  But this function doesn't
    #    accept those, and %prop_value_aliases doesn't have them.
    # 2) When the values are numeric, in which case we need to look further,
    #    but their squeezed-out loose values will be in %stricter_to_file_of.
    if (   ! exists $utf8::stricter_to_file_of{"$key=$lc_value"}
        &&   exists $utf8::stricter_to_file_of{$loose_pair})
    {
        # The only thing that's legal loosely under strict is an underscore
        # between a pair of digits.  Repeat until none is left, since one /g
        # pass cannot remove both underscores that share a middle digit.
        1 while $lc_value =~ s/(\d)_(\d)/$1$2/g;
        return unless exists $utf8::stricter_to_file_of{"$key=$lc_value"};
    }

    # Here, we know that the combination exists.  Return it.  With two or
    # more names the full name is in element 1; a lone name serves as both
    # the short and the full name.
    my $names         = $names_of->{$standard_value};
    my $has_full_name = @$names > 1;

    if (! wantarray) {
        return $has_full_name ? $names->[1] : $names->[0];
    }

    return $has_full_name
           ? @{ _dclone($names) }
           : ( $names->[0], $names->[0] );
}
2200
# All 1 bits is the largest possible UV.  (~0 is the bitwise complement of
# zero, that is, an integer with every bit set.)  The prop_invlist()
# documentation points users at this variable as the upper limit to use for a
# range that extends to infinity.
$Unicode::UCD::MAX_CP = ~0;
2203
2204 =pod
2205
2206 =head2 B<prop_invlist()>
2207
2208 C<prop_invlist> returns an inversion list (described below) that defines all the
2209 code points for the binary Unicode property (or "property=value" pair) given
2210 by the input parameter string:
2211
2212  use feature 'say';
2213  use Unicode::UCD 'prop_invlist';
2214  say join ", ", prop_invlist("Any");
2215
2216  prints:
2217  0, 1114112
2218
2219 If the input is unknown, C<undef> is returned in scalar context; an empty list
2220 in list context.  If the input is known, the number of elements in
2221 the list is returned if called in scalar context.
2222
2223 L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
2224 the list of properties that this function accepts, as well as all the possible
2225 forms for them (including with the optional "Is_" prefixes).  (Except this
2226 function doesn't accept any Perl-internal properties, some of which are listed
2227 there.) This function uses the same loose or tighter matching rules for
2228 resolving the input property's name as is done for regular expressions.  These
2229 are also specified in L<perluniprops|perluniprops/Properties accessible
2230 through \p{} and \P{}>.  Examples of using the "property=value" form are:
2231
2232  say join ", ", prop_invlist("Script=Shavian");
2233
2234  prints:
2235  66640, 66688
2236
2237  say join ", ", prop_invlist("ASCII_Hex_Digit=No");
2238
2239  prints:
2240  0, 48, 58, 65, 71, 97, 103
2241
2242  say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
2243
2244  prints:
2245  48, 58, 65, 71, 97, 103
2246
2247 Inversion lists are a compact way of specifying Unicode property-value
2248 definitions.  The 0th item in the list is the lowest code point that has the
2249 property-value.  The next item (item [1]) is the lowest code point beyond that
2250 one that does NOT have the property-value.  And the next item beyond that
2251 ([2]) is the lowest code point beyond that one that does have the
2252 property-value, and so on.  Put another way, each element in the list gives
2253 the beginning of a range that has the property-value (for even numbered
2254 elements), or doesn't have the property-value (for odd numbered elements).
2255 The name for this data structure stems from the fact that each element in the
2256 list toggles (or inverts) whether the corresponding range is or isn't on the
2257 list.
2258
2259 In the final example above, the first ASCII Hex digit is code point 48, the
2260 character "0", and all code points from it through 57 (a "9") are ASCII hex
2261 digits.  Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
2262 are, as are 97 ("a") through 102 ("f").  103 starts a range of code points
2263 that aren't ASCII hex digits.  That range extends to infinity, which on your
2264 computer can be found in the variable C<$Unicode::UCD::MAX_CP>.  (This
2265 variable is as close to infinity as Perl can get on your platform, and may be
2266 too high for some operations to work; you may wish to use a smaller number for
2267 your purposes.)
2268
2269 Note that the inversion lists returned by this function can possibly include
2270 non-Unicode code points, that is anything above 0x10FFFF.  Unicode properties
2271 are not defined on such code points.  You might wish to change the output to
2272 not include these.  Simply add 0x110000 at the end of the non-empty returned
2273 list if it isn't already that value; and pop that value if it is; like:
2274
2275  my @list = prop_invlist("foo");
2276  if (@list) {
2277      if ($list[-1] == 0x110000) {
2278          pop @list;  # Defeat the turning on for above Unicode
2279      }
2280      else {
2281          push @list, 0x110000; # Turn off for above Unicode
2282      }
2283  }
2284
2285 It is a simple matter to expand out an inversion list to a full list of all
2286 code points that have the property-value:
2287
2288  my @invlist = prop_invlist($property_name);
2289  die "empty" unless @invlist;
2290  my @full_list;
2291  for (my $i = 0; $i < @invlist; $i += 2) {
2292     my $upper = ($i + 1) < @invlist
2293                 ? $invlist[$i+1] - 1      # In range
2294                 : $Unicode::UCD::MAX_CP;  # To infinity.  You may want
2295                                           # to stop much much earlier;
2296                                           # going this high may expose
2297                                           # perl deficiencies with very
2298                                           # large numbers.
2299     for my $j ($invlist[$i] .. $upper) {
2300         push @full_list, $j;
2301     }
2302  }
2303
2304 C<prop_invlist> does not know about any user-defined nor Perl internal-only
2305 properties, and will return C<undef> if called with one of those.
2306
2307 The L</search_invlist()> function is provided for finding a code point within
2308 an inversion list.
2309
2310 =cut
2311
2312 # User-defined properties could be handled with some changes to utf8_heavy.pl
2313 # and by implementing handling of EXTRAS here.  If done, consideration should
2314 # be given to the fact that the user subroutine could return different results
2315 # with each call; security issues need to be thought about.
2316
2317 # These are created by mktables for this routine and stored in unicore/UCD.pl
2318 # where their structures are described.
2319 our %loose_defaults;
2320 our $MAX_UNICODE_CODEPOINT;
2321
2322 sub prop_invlist ($;$) {
         # Build the inversion list for the property-value named by the first
         # argument and return it as a flat list of code points.  A bare
         # 'return' (empty list, or undef in scalar context) is used when the
         # name is undefined, when no usable swash is found, when the
         # property-value is not boolean or is user-defined, or when the name
         # starts with an underscore and the internal-use override (second
         # argument) was not given.
2323     my $prop = $_[0];
2324
2325     # Undocumented way to get at Perl internal properties; it may be changed
2326     # or removed without notice at any time.
2327     my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
2328
2329     return if ! defined $prop;
2330
2331     require "utf8_heavy.pl";
2332
2333     # Warnings for these are only for regexes, so not applicable to us
2334     no warnings 'deprecated';
2335
2336     # Get the swash definition of the property-value.
         # NOTE(review): the trailing 1 is presumably the bit width being
         # requested (the BITS check below expects 1) and the 0 a "not called
         # from tr///" flag -- confirm against utf8::SWASHNEW.
2337     my $swash = utf8::SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);
2338
2339     # Fail if not found, or isn't a boolean property-value, or is a
2340     # user-defined property, or is internal-only.
         # ('ref $swash eq ""' catches a true return that is not a reference.)
2341     return if ! $swash
2342               || ref $swash eq ""
2343               || $swash->{'BITS'} != 1
2344               || $swash->{'USER_DEFINED'}
2345               || (! $internal_ok && $prop =~ /^\s*_/);
2346
         # The conversion below reads only LIST and INVERT_IT.  A swash that
         # also carries EXTRAS or SPECIALS is not handled, so warn and return
         # nothing rather than hand back a list built from LIST alone.
2347     if ($swash->{'EXTRAS'}) {
2348         carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
2349         return;
2350     }
2351     if ($swash->{'SPECIALS'}) {
2352         carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
2353         return;
2354     }
2355
2356     my @invlist;
2357
2358     if ($swash->{'LIST'} =~ /^V/) {
2359
2360         # A 'V' as the first character marks the input as already an inversion
2361         # list, in which case, all we need to do is put the remaining lines
2362         # into our array.
             # The substitution strips trailing white space and '#' comments
             # from every line; /r returns a copy, so $swash->{'LIST'} itself
             # is left unmodified.  The shift then discards the first element,
             # which is the line that begins with 'V'.
             # NOTE(review): the entries are kept as the strings read from
             # LIST (no hex() conversion, unlike the other branch), so they
             # are presumably already decimal -- confirm.
2363         @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
2364         shift @invlist;
2365     }
2366     else {
2367         # The input lines look like:
2368         # 0041\t005A   # [26]
2369         # 005F
2370
2371         # Split into lines, stripped of trailing comments
2372         foreach my $range (split "\n",
2373                               $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
2374         {
2375             # And find the beginning and end of the range on the line
2376             my ($hex_begin, $hex_end) = split "\t", $range;
2377             my $begin = hex $hex_begin;
2378
2379             # If the new range merely extends the old, we remove the marker
2380             # created the last time through the loop for the old's end, which
2381             # causes the new one's end to be used instead.
2382             if (@invlist && $begin == $invlist[-1]) {
2383                 pop @invlist;
2384             }
2385             else {
2386                 # Add the beginning of the range
2387                 push @invlist, $begin;
2388             }
2389
2390             if (defined $hex_end) { # The next item starts with the code point 1
2391                                     # beyond the end of the range.
                     # hex() warns under the 'portable' category for values
                     # above 0xFFFFFFFF, which range ends here may exceed.
2392                 no warnings 'portable';
2393                 my $end = hex $hex_end;
                     # A range reaching the platform maximum gets no "off"
                     # marker: the list simply ends with this range still
                     # open, and no further lines are processed.
2394                 last if $end == $Unicode::UCD::MAX_CP;
2395                 push @invlist, $end + 1;
2396             }
2397             else {  # No end of range, is a single code point.
2398                 push @invlist, $begin + 1;
2399             }
2400         }
2401     }
2402
2403     # Could need to be inverted: add or subtract a 0 at the beginning of the
2404     # list.
         # Toggling a leading 0 complements the set, because every element of
         # an inversion list flips whether the following range is in the set.
2405     if ($swash->{'INVERT_IT'}) {
2406         if (@invlist && $invlist[0] == 0) {
2407             shift @invlist;
2408         }
2409         else {
2410             unshift @invlist, 0;
2411         }
2412     }
2413
2414     return @invlist;
2415 }
2416
2417 =pod
2418
2419 =head2 B<prop_invmap()>
2420
2421  use Unicode::UCD 'prop_invmap';
2422  my ($list_ref, $map_ref, $format, $default)
2423                                       = prop_invmap("General Category");
2424
2425 C<prop_invmap> is used to get the complete mapping definition for a property,
2426 in the form of an inversion map.  An inversion map consists of two parallel
2427 arrays.  One is an ordered list of code points that mark range beginnings, and
2428 the other gives the value (or mapping) that all code points in the
2429 corresponding range have.
2430
2431 C<prop_invmap> is called with the name of the desired property.  The name is
2432 loosely matched, meaning that differences in case, white-space, hyphens, and
2433 underscores are not meaningful (except for the trailing underscore in the
2434 old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
2435 or even better, C<"Gc=LC">).
2436
2437 Many Unicode properties have more than one name (or alias).  C<prop_invmap>
2438 understands all of these, including Perl extensions to them.  Ambiguities are
2439 resolved as described above for L</prop_aliases()>.  The Perl internal
2440 property "Perl_Decimal_Digit", described below, is also accepted.  An empty
2441 list is returned if the property name is unknown.
2442 See L<perluniprops/Properties accessible through Unicode::UCD> for the
2443 properties acceptable as inputs to this function.
2444
2445 It is a fatal error to call this function except in list context.
2446
2447 In addition to the two arrays that form the inversion map, C<prop_invmap>
2448 returns two other values; one is a scalar that gives some details as to the
2449 format of the entries of the map array; the other is a default value, useful
2450 in maps whose format name begins with the letter C<"a">, as described
2451 L<below in its subsection|/a>; and for specialized purposes, such as
2452 converting to another data structure, described at the end of this main
2453 section.
2454
2455 This means that C<prop_invmap> returns a 4 element list.  For example,
2456
2457  my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
2458                                                  = prop_invmap("Block");
2459
2460 In this call, the two arrays will be populated as shown below (for Unicode
2461 6.0):
2462
2463  Index  @blocks_ranges  @blocks_maps
2464    0        0x0000      Basic Latin
2465    1        0x0080      Latin-1 Supplement
2466    2        0x0100      Latin Extended-A
2467    3        0x0180      Latin Extended-B
2468    4        0x0250      IPA Extensions
2469    5        0x02B0      Spacing Modifier Letters
2470    6        0x0300      Combining Diacritical Marks
2471    7        0x0370      Greek and Coptic
2472    8        0x0400      Cyrillic
2473   ...
2474  233        0x2B820     No_Block
2475  234        0x2F800     CJK Compatibility Ideographs Supplement
2476  235        0x2FA20     No_Block
2477  236        0xE0000     Tags
2478  237        0xE0080     No_Block
2479  238        0xE0100     Variation Selectors Supplement
2480  239        0xE01F0     No_Block
2481  240        0xF0000     Supplementary Private Use Area-A
2482  241        0x100000    Supplementary Private Use Area-B
2483  242        0x110000    No_Block
2484
2485 The first line (with Index [0]) means that the value for code point 0 is "Basic
2486 Latin".  The entry "0x0080" in the @blocks_ranges column in the second line
2487 means that the value from the first line, "Basic Latin", extends to all code
2488 points in the range from 0 up to but not including 0x0080, that is, through
2489 127.  In other words, the code points from 0 to 127 are all in the "Basic
2490 Latin" block.  Similarly, all code points in the range from 0x0080 up to (but
2491 not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
2492 (Notice that the return is the old-style block names; see L</Old-style versus
2493 new-style block names>).
2494
2495 The final line (with Index [242]) means that all code points above the legal
2496 Unicode maximum code point have the value "No_Block", which is the term
2497 Unicode uses for a non-existing block.
2498
2499 The arrays completely specify the mappings for all possible code points.
2500 The final element in an inversion map returned by this function will always be
2501 for the range that consists of all the code points that aren't legal Unicode,
2502 but that are expressible on the platform.  (That is, it starts with code point
2503 0x110000, the first code point above the legal Unicode maximum, and extends to
2504 infinity.) The value for that range will be the same that any typical
2505 unassigned code point has for the specified property.  (Certain unassigned
2506 code points are not "typical"; for example the non-character code points, or
2507 those in blocks that are to be written right-to-left.  The above-Unicode
2508 range's value is not based on these atypical code points.)  It could be argued
2509 that, instead of treating these as unassigned Unicode code points, the value
2510 for this range should be C<undef>.  If you wish, you can change the returned
2511 arrays accordingly.
2512
2513 The maps for almost all properties are simple scalars that should be
2514 interpreted as-is.
2515 These values are those given in the Unicode-supplied data files, which may be
2516 inconsistent as to capitalization and as to which synonym for a property-value
2517 is given.  The results may be normalized by using the L</prop_value_aliases()>
2518 function.
2519
2520 There are exceptions to the simple scalar maps.  Some properties have some
2521 elements in their map list that are themselves lists of scalars; and some
2522 special strings are returned that are not to be interpreted as-is.  Element
2523 [2] (placed into C<$format> in the example above) of the returned four element
2524 list tells you if the map has any of these special elements or not, as follows:
2525
2526 =over
2527
2528 =item B<C<s>>
2529
2530 means all the elements of the map array are simple scalars, with no special
2531 elements.  Almost all properties are like this, like the C<block> example
2532 above.
2533
2534 =item B<C<sl>>
2535
2536 means that some of the map array elements have the form given by C<"s">, and
2537 the rest are lists of scalars.  For example, here is a portion of the output
2538 of calling C<prop_invmap()> with the "Script Extensions" property:
2539
2540  @scripts_ranges  @scripts_maps
2541       ...
2542       0x0953      Devanagari
2543       0x0964      [ Bengali, Devanagari, Gurmukhi, Oriya ]
2544       0x0966      Devanagari
2545       0x0970      Common
2546
2547 Here, the code points 0x964 and 0x965 are both used in Bengali,
2548 Devanagari, Gurmukhi, and Oriya, but no other scripts.
2549
2550 The Name_Alias property is also of this form.  But each scalar consists of two
2551 components:  1) the name, and 2) the type of alias this is.  They are
2552 separated by a colon and a space.  In Unicode 6.1, there are several alias types:
2553
2554 =over
2555
2556 =item C<correction>
2557
2558 indicates that the name is a corrected form for the
2559 original name (which remains valid) for the same code point.
2560
2561 =item C<control>
2562
2563 adds a new name for a control character.
2564
2565 =item C<alternate>
2566
2567 is an alternate name for a character
2568
2569 =item C<figment>
2570
2571 is a name for a character that has been documented but was never in any
2572 actual standard.
2573
2574 =item C<abbreviation>
2575
2576 is a common abbreviation for a character
2577
2578 =back
2579
2580 The lists are ordered (roughly) so the most preferred names come before less
2581 preferred ones.
2582
2583 For example,
2584
2585  @aliases_ranges        @alias_maps
2586     ...
2587     0x009E        [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
2588     0x009F        [ 'APPLICATION PROGRAM COMMAND: control',
2589                     'APC: abbreviation'
2590                   ]
2591     0x00A0        'NBSP: abbreviation'
2592     0x00A1        ""
2593     0x00AD        'SHY: abbreviation'
2594     0x00AE        ""
2595     0x01A2        'LATIN CAPITAL LETTER GHA: correction'
2596     0x01A3        'LATIN SMALL LETTER GHA: correction'
2597     0x01A4        ""
2598     ...
2599
2600 A map to the empty string means that there is no alias defined for the code
2601 point.
2602
2603 =item B<C<a>>
2604
2605 is like C<"s"> in that all the map array elements are scalars, but here they are
2606 restricted to all being integers, and some have to be adjusted (hence the name
2607 C<"a">) to get the correct result.  For example, in:
2608
2609  my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
2610                           = prop_invmap("Simple_Uppercase_Mapping");
2611
2612 the returned arrays look like this:
2613
2614  @$uppers_ranges_ref    @$uppers_maps_ref   Note
2615        0                      0
2616       97                     65          'a' maps to 'A', b => B ...
2617      123                      0
2618      181                    924          MICRO SIGN => Greek Cap MU
2619      182                      0
2620      ...
2621
2622 and C<$default> is 0.
2623
2624 Let's start with the second line.  It says that the uppercase of code point 97
2625 is 65; or C<uc("a")> eq "A".  But the line is for the entire range of code
2626 points 97 through 122.  To get the mapping for any code point in this range,
2627 you take the offset it has from the beginning code point of the range, and add
2628 that to the mapping for that first code point.  So, the mapping for 122 ("z")
2629 is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
2630 yielding 90 ("Z").  Likewise for everything in between.
2631
2632 Requiring this simple adjustment allows the returned arrays to be
2633 significantly smaller than otherwise, up to a factor of 10, speeding up
2634 searching through them.
2635
2636 Ranges that map to C<$default>, C<"0">, behave somewhat differently.  For
2637 these, each code point maps to itself.  So, in the first line in the example,
2638 S<C<ord(uc(chr(0)))>> is 0, S<C<ord(uc(chr(1)))>> is 1, ...
2639 S<C<ord(uc(chr(96)))>> is 96.
2640
2641 =item B<C<al>>
2642
2643 means that some of the map array elements have the form given by C<"a">, and
2644 the rest are ordered lists of code points.
2645 For example, in:
2646
2647  my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
2648                                  = prop_invmap("Uppercase_Mapping");
2649
2650 the returned arrays look like this:
2651
2652  @$uppers_ranges_ref    @$uppers_maps_ref
2653        0                      0
2654       97                     65
2655      123                      0
2656      181                    924
2657      182                      0
2658      ...
2659     0x0149              [ 0x02BC 0x004E ]
2660     0x014A                    0
2661     0x014B                  330
2662      ...
2663
2664 This is the full Uppercase_Mapping property (as opposed to the
2665 Simple_Uppercase_Mapping given in the example for format C<"a">).  The only
2666 difference between the two in the ranges shown is that the code point at
2667 0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
2668 characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
2669 CAPITAL LETTER N).
2670
2671 No adjustments are needed to entries that are references to arrays; each such
2672 entry will have exactly one element in its range, so the offset is always 0.
2673
2674 The fourth (index [3]) element (C<$default>) in the list returned for this
2675 format is 0.
2676
2677 =item B<C<ae>>
2678
2679 This is like C<"a">, but some elements are the empty string, and should not be
2680 adjusted.
2681 The one internal Perl property accessible by C<prop_invmap> is of this type:
2682 "Perl_Decimal_Digit" returns an inversion map which gives the numeric values
2683 that are represented by the Unicode decimal digit characters.  Characters that
2684 don't represent decimal digits map to the empty string, like so:
2685
2686  @digits    @values
2687  0x0000       ""
2688  0x0030        0
2689  0x003A       ""
2690  0x0660        0
2691  0x066A       ""
2692  0x06F0        0
2693  0x06FA       ""
2694  0x07C0        0
2695  0x07CA       ""
2696  0x0966        0
2697  ...
2698
2699 This means that the code points from 0 to 0x2F do not represent decimal digits;
2700 the code point 0x30 (DIGIT ZERO) represents 0;  code point 0x31, (DIGIT ONE),
2701 represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
2702 ... code points 0x3A through 0x65F do not represent decimal digits; 0x660
2703 (ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
2704 represents 0+1-0 = 1 ...
2705
2706 The fourth (index [3]) element (C<$default>) in the list returned for this
2707 format is the empty string.
2708
2709 =item B<C<ale>>
2710
2711 is a combination of the C<"al"> type and the C<"ae"> type.  Some of
2712 the map array elements have the forms given by C<"al">, and
2713 the rest are the empty string.  The property C<NFKC_Casefold> has this form.
2714 An example slice is:
2715
2716  @$ranges_ref  @$maps_ref         Note
2717     ...
2718    0x00AA       97                FEMININE ORDINAL INDICATOR => 'a'
2719    0x00AB        0
2720    0x00AD                         SOFT HYPHEN => ""
2721    0x00AE        0
2722    0x00AF     [ 0x0020, 0x0304 ]  MACRON => SPACE . COMBINING MACRON
2723    0x00B0        0
2724    ...
2725
2726 The fourth (index [3]) element (C<$default>) in the list returned for this
2727 format is 0.
2728
2729 =item B<C<ar>>
2730
2731 means that all the elements of the map array are either rational numbers or
2732 the string C<"NaN">, meaning "Not a Number".  A rational number is either an
2733 integer, or two integers separated by a solidus (C<"/">).  The second integer
2734 represents the denominator of the division implied by the solidus, and is
2735 actually always positive, so it is guaranteed not to be 0 and to not be
2736 signed.  When the element is a plain integer (without the
2737 solidus), it may need to be adjusted to get the correct value by adding the
2738 offset, just as other C<"a"> properties.  No adjustment is needed for
2739 fractions, as the range is guaranteed to have just a single element, and so
2740 the offset is always 0.
2741
2742 If you want to convert the returned map to entirely scalar numbers, you
2743 can use something like this:
2744
2745  my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
2746  if ($format && $format eq "ar") {
2747      map { $_ = eval $_ if $_ ne 'NaN' } @$invmap_ref;
2748  }
2749
2750 Here's some entries from the output of the property "Nv", which has format
2751 C<"ar">.
2752
2753  @numerics_ranges  @numerics_maps       Note
2754         0x00           "NaN"
2755         0x30             0           DIGIT 0 .. DIGIT 9
2756         0x3A           "NaN"
2757         0xB2             2           SUPERSCRIPTs 2 and 3
2758         0xB4           "NaN"
2759         0xB9             1           SUPERSCRIPT 1
2760         0xBA           "NaN"
2761         0xBC            1/4          VULGAR FRACTION 1/4
2762         0xBD            1/2          VULGAR FRACTION 1/2
2763         0xBE            3/4          VULGAR FRACTION 3/4
2764         0xBF           "NaN"
2765         0x660            0           ARABIC-INDIC DIGIT ZERO .. NINE
2766         0x66A          "NaN"
2767
2768 The fourth (index [3]) element (C<$default>) in the list returned for this
2769 format is C<"NaN">.
2770
2771 =item B<C<n>>
2772
2773 means the Name property.  All the elements of the map array are simple
2774 scalars, but some of them contain special strings that require more work to
2775 get the actual name.
2776
2777 Entries such as:
2778
2779  CJK UNIFIED IDEOGRAPH-<code point>
2780
2781 mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
2782 with the code point (expressed in hexadecimal) appended to it, like "CJK
2783 UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
2784 pointE<gt>>>).
2785
2786 Also, entries like
2787
2788  <hangul syllable>
2789
2790 mean that the name is algorithmically calculated.  This is easily done by
2791 the function L<charnames/charnames::viacode(code)>.
2792
2793 Note that for control characters (C<Gc=cc>), Unicode's data files have the
2794 string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
2795 string.  This function returns that real name, the empty string.  (There are
2796 names for these characters, but they are considered aliases, not the Name
2797 property name, and are contained in the C<Name_Alias> property.)
2798
2799 =item B<C<ad>>
2800
2801 means the Decomposition_Mapping property.  This property is like C<"al">
2802 properties, except that one of the scalar elements is of the form:
2803
2804  <hangul syllable>
2805
2806 This signifies that this entry should be replaced by the decompositions for
2807 all the code points whose decomposition is algorithmically calculated.  (All
2808 of them are currently in one range and no others outside the range are likely
2809 to ever be added to Unicode; the C<"n"> format
2810 has this same entry.)  These can be generated via the function
2811 L<Unicode::Normalize::NFD()|Unicode::Normalize>.
2812
2813 Note that the mapping is the one that is specified in the Unicode data files,
2814 and to get the final decomposition, it may need to be applied recursively.
2815
2816 The fourth (index [3]) element (C<$default>) in the list returned for this
2817 format is 0.
2818
2819 =back
2820
2821 Note that a format begins with the letter "a" if and only if the property it
2822 is for requires adjustments by adding the offsets in multi-element ranges.
2823 For all these properties, an entry should be adjusted only if the map is a
2824 scalar which is an integer.  That is, it must match the regular expression:
2825
2826     / ^ -? \d+ $ /xa
2827
2828 Further, the first element in a range never needs adjustment, as the
2829 adjustment would be just adding 0.
2830
2831 A binary search such as that provided by L</search_invlist()>, can be used to
2832 quickly find a code point in the inversion list, and hence its corresponding
2833 mapping.
2834
2835 The final, fourth element (index [3], assigned to C<$default> in the "block"
2836 example) in the four element list returned by this function is used with the
2837 C<"a"> format types; it may also be useful for applications
2838 that wish to convert the returned inversion map data structure into some
2839 other, such as a hash.  It gives the mapping that most code points map to
2840 under the property.  If you establish the convention that any code point not
2841 explicitly listed in your data structure maps to this value, you can
2842 potentially make your data structure much smaller.  As you construct your data
2843 structure from the one returned by this function, simply ignore those ranges
2844 that map to this value.  For example, to
2845 convert to the data structure searchable by L</charinrange()>, you can follow
2846 this recipe for properties that don't require adjustments:
2847
2848  my ($list_ref, $map_ref, $format, $default) = prop_invmap($property);
2849  my @range_list;
2850
2851  # Look at each element in the list, but the -2 is needed because we
2852  # look at $i+1 in the loop, and the final element is guaranteed to map
2853  # to $default by prop_invmap(), so we would skip it anyway.
2854  for my $i (0 .. @$list_ref - 2) {
2855     next if $map_ref->[$i] eq $default;
2856     push @range_list, [ $list_ref->[$i],
2857                         $list_ref->[$i+1] - 1,
2858                         $map_ref->[$i]
2859                       ];
2860  }
2861
2862  print charinrange(\@range_list, $code_point), "\n";
2863
2864 With this, C<charinrange()> will return C<undef> if its input code point maps
2865 to C<$default>.  You can avoid this by omitting the C<next> statement, and adding
2866 a line after the loop to handle the final element of the inversion map.
2867
2868 Similarly, this recipe can be used for properties that do require adjustments:
2869
2870  for my $i (0 .. @$list_ref - 2) {
2871     next if $map_ref->[$i] eq $default;
2872
2873     # prop_invmap() guarantees that if the mapping is to an array, the
2874     # range has just one element, so no need to worry about adjustments.
2875     if (ref $map_ref->[$i]) {
2876         push @range_list,
2877                    [ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ];
2878     }
2879     else {  # Otherwise each element is actually mapped to a separate
2880             # value, so the range has to be split into single code point
2881             # ranges.
2882
2883         my $adjustment = 0;
2884
2885         # For each code point that gets mapped to something...
2886         for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) {
2887
2888             # ... add a range consisting of just it mapping to the
2889             # original plus the adjustment, which is incremented for the
2890             # next time through the loop, as the offset increases by 1
2891             # for each element in the range
2892             push @range_list,
2893                              [ $j, $j, $map_ref->[$i] + $adjustment++ ];
2894         }
2895     }
2896  }
2897
2898 Note that the inversion maps returned for the C<Case_Folding> and
2899 C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
2900 Use L</casefold()> for these.
2901
2902 C<prop_invmap> does not know about any user-defined properties, and will
2903 return C<undef> if called with one of those.
2904
2905 The returned values for the Perl extension properties, such as C<Any> and
2906 C<Greek> are somewhat misleading.  The values are either C<"Y"> or C<"N">.
2907 All Unicode properties are bipartite, so you can actually use the C<"Y"> or
2908 C<"N"> in a Perl regular expression for these, like C<qr/\p{ID_Start=Y}/> or
2909 C<qr/\p{Upper=N}/>.  But the Perl extensions aren't specified this way, only
2910 like C<qr/\p{Any}/>, I<etc>.  You can't actually use the C<"Y"> and C<"N"> in
2911 them.
2912
2913 =cut
2914
2915 # User-defined properties could be handled with some changes to utf8_heavy.pl;
2916 # if done, consideration should be given to the fact that the user subroutine
2917 # could return different results with each call, which could lead to some
2918 # security issues.
2919
2920 # One could store things in memory so they don't have to be recalculated, but
2921 # it is unlikely this will be called often, and some properties would take up
2922 # significant memory.
2923
2924 # These are created by mktables for this routine and stored in unicore/UCD.pl
2925 # where their structures are described.
2926 our @algorithmic_named_code_points;
2927 our $HANGUL_BEGIN;
2928 our $HANGUL_COUNT;
2929
2930 sub prop_invmap ($;$) {
2931
2932     croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;
2933
2934     my $prop = $_[0];
2935     return unless defined $prop;
2936
2937     # Undocumented way to get at Perl internal properties; it may be changed
2938     # or removed without notice at any time.  It currently also changes the
2939     # output to use the format specified in the file rather than the one we
2940     # normally compute and return
2941     my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
2942
2943     # Fail internal properties
2944     return if $prop =~ /^_/ && ! $internal_ok;
2945
2946     # The values returned by this function.
2947     my (@invlist, @invmap, $format, $missing);
2948
2949     # The swash has two components we look at, the base list, and a hash,
2950     # named 'SPECIALS', containing any additional members whose mappings don't
2951     # fit into the base list scheme of things.  These generally 'override'
2952     # any value in the base list for the same code point.
2953     my $overrides;
2954
2955     require "utf8_heavy.pl";
2956     require "unicore/UCD.pl";
2957
2958 RETRY:
2959
2960     # If there are multiple entries for a single code point
2961     my $has_multiples = 0;
2962
2963     # Try to get the map swash for the property.  They have 'To' prepended to
2964     # the property name, and 32 means we will accept 32 bit return values.
2965     # The 0 means we aren't calling this from tr///.
2966     my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
2967
2968     # If didn't find it, could be because needs a proxy.  And if was the
2969     # 'Block' or 'Name' property, use a proxy even if did find it.  Finding it
2970     # in these cases would be the result of the installation changing mktables
2971     # to output the Block or Name tables.  The Block table gives block names
2972     # in the new-style, and this routine is supposed to return old-style block
2973     # names.  The Name table is valid, but we need to execute the special code
2974     # below to add in the algorithmic-defined name entries.
2975     # And NFKCCF needs conversion, so handle that here too.
2976     if (ref $swash eq ""
2977         || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
2978     {
2979
2980         # Get the short name of the input property, in standard form
2981         my ($second_try) = prop_aliases($prop);
2982         return unless $second_try;
2983         $second_try = utf8::_loose_name(lc $second_try);
2984
2985         if ($second_try eq "in") {
2986
2987             # This property is identical to age for inversion map purposes
2988             $prop = "age";
2989             goto RETRY;
2990         }
2991         elsif ($second_try =~ / ^ s ( cf | fc | [ltu] c ) $ /x) {
2992
2993             # These properties use just the LIST part of the full mapping,
2994             # which includes the simple maps that are otherwise overridden by
2995             # the SPECIALS.  So all we need do is to not look at the SPECIALS;
2996             # set $overrides to indicate that
2997             $overrides = -1;
2998
2999             # The full name is the simple name stripped of its initial 's'
3000             $prop = $1;
3001
3002             # .. except for this case
3003             $prop = 'cf' if $prop eq 'fc';
3004
3005             goto RETRY;
3006         }
3007         elsif ($second_try eq "blk") {
3008
3009             # We use the old block names.  Just create a fake swash from its
3010             # data.
3011             _charblocks();
3012             my %blocks;
3013             $blocks{'LIST'} = "";
3014             $blocks{'TYPE'} = "ToBlk";
3015             $utf8::SwashInfo{ToBlk}{'missing'} = "No_Block";
3016             $utf8::SwashInfo{ToBlk}{'format'} = "s";
3017
3018             foreach my $block (@BLOCKS) {
3019                 $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
3020                                            $block->[0],
3021                                            $block->[1],
3022                                            $block->[2];
3023             }
3024             $swash = \%blocks;
3025         }
3026         elsif ($second_try eq "na") {
3027
3028             # Use the combo file that has all the Name-type properties in it,
3029             # extracting just the ones that are for the actual 'Name'
3030             # property.  And create a fake swash from it.
3031             my %names;
3032             $names{'LIST'} = "";
3033             my $original = do "unicore/Name.pl";
3034             my $algorithm_names = \@algorithmic_named_code_points;
3035
3036             # We need to remove the names from it that are aliases.  For that
3037             # we need to also read in that table.  Create a hash with the keys
3038             # being the code points, and the values being a list of the
3039             # aliases for the code point key.
3040             my ($aliases_code_points, $aliases_maps, undef, undef) =
3041                                                 &prop_invmap('Name_Alias');
3042             my %aliases;
3043             for (my $i = 0; $i < @$aliases_code_points; $i++) {
3044                 my $code_point = $aliases_code_points->[$i];
3045                 $aliases{$code_point} = $aliases_maps->[$i];
3046
3047                 # If not already a list, make it into one, so that later we
3048                 # can treat things uniformly
3049                 if (! ref $aliases{$code_point}) {
3050                     $aliases{$code_point} = [ $aliases{$code_point} ];
3051                 }
3052
3053                 # Remove the alias type from the entry, retaining just the
3054                 # name.
3055                 map { s/:.*// } @{$aliases{$code_point}};
3056             }
3057
3058             my $i = 0;
3059             foreach my $line (split "\n", $original) {
3060                 my ($hex_code_point, $name) = split "\t", $line;
3061
3062                 # Weeds out all comments, blank lines, and named sequences
3063                 next if $hex_code_point =~ /[^[:xdigit:]]/a;
3064
3065                 my $code_point = hex $hex_code_point;
3066
3067                 # The name of all controls is the default: the empty string.
3068                 # The set of controls is immutable
3069                 next if chr($code_point) =~ /[[:cntrl:]]/u;
3070
3071                 # If this is a name_alias, it isn't a name
3072                 next if grep { $_ eq $name } @{$aliases{$code_point}};
3073
3074                 # If we are beyond where one of the special lines needs to
3075                 # be inserted ...
3076                 while ($i < @$algorithm_names
3077                     && $code_point > $algorithm_names->[$i]->{'low'})
3078                 {
3079
3080                     # ... then insert it, ahead of what we were about to
3081                     # output
3082                     $names{'LIST'} .= sprintf "%x\t%x\t%s\n",
3083                                             $algorithm_names->[$i]->{'low'},
3084                                             $algorithm_names->[$i]->{'high'},
3085                                             $algorithm_names->[$i]->{'name'};
3086
3087                     # Done with this range.
3088                     $i++;
3089
3090                     # We loop until all special lines that precede the next
3091                     # regular one are output.
3092                 }
3093
3094                 # Here, is a normal name.
3095                 $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
3096             } # End of loop through all the names
3097
3098             $names{'TYPE'} = "ToNa";
3099             $utf8::SwashInfo{ToNa}{'missing'} = "";
3100             $utf8::SwashInfo{ToNa}{'format'} = "n";
3101             $swash = \%names;
3102         }
3103         elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {
3104
3105             # The file is a combination of dt and dm properties.  Create a
3106             # fake swash from the portion that we want.
3107             my $original = do "unicore/Decomposition.pl";
3108             my %decomps;
3109
3110             if ($second_try eq 'dt') {
3111                 $decomps{'TYPE'} = "ToDt";
3112                 $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
3113                 $utf8::SwashInfo{'ToDt'}{'format'} = "s";
3114             }   # 'dm' is handled below, with 'nfkccf'
3115
3116             $decomps{'LIST'} = "";
3117
3118             # This property has one special range not in the file: for the
3119             # hangul syllables.  But not in Unicode version 1.
3120             UnicodeVersion() unless defined $v_unicode_version;
3121             my $done_hangul = ($v_unicode_version lt v2.0.0)
3122                               ? 1
3123                               : 0;    # Have we done the hangul range ?
3124             foreach my $line (split "\n", $original) {
3125                 my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
3126                 my $code_point = hex $hex_lower;
3127                 my $value;
3128                 my $redo = 0;
3129
3130                 # The type, enclosed in <...>, precedes the mapping separated
3131                 # by blanks
3132                 if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
3133                     $value = ($second_try eq 'dt') ? $1 : $2
3134                 }
3135                 else {  # If there is no type specified, it's canonical
3136                     $value = ($second_try eq 'dt')
3137                              ? "Canonical" :
3138                              $type_and_map;
3139                 }
3140
3141                 # Insert the hangul range at the appropriate spot.
3142                 if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
3143                     $done_hangul = 1;
3144                     $decomps{'LIST'} .=
3145                                 sprintf "%x\t%x\t%s\n",
3146                                         $HANGUL_BEGIN,
3147                                         $HANGUL_BEGIN + $HANGUL_COUNT - 1,
3148                                         ($second_try eq 'dt')
3149                                         ? "Canonical"
3150                                         : "<hangul syllable>";
3151                 }
3152
3153                 if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) {
3154                     $line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value);
3155                     $hex_upper = "";
3156                     $redo = 1;
3157                 }
3158
3159                 # And append this to our constructed LIST.
3160                 $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
3161
3162                 redo if $redo;
3163             }
3164             $swash = \%decomps;
3165         }
3166         elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
3167             return;
3168         }
3169
3170         if ($second_try eq 'nfkccf' || $second_try eq 'dm') {
3171
3172             # The 'nfkccf' property is stored in the old format for backwards
3173             # compatibility for any applications that have read its file
3174             # directly before prop_invmap() existed.
3175             # And the code above has extracted the 'dm' property from its file
3176             # yielding the same format.  So here we convert them to adjusted
3177             # format for compatibility with the other properties similar to
3178             # them.
3179             my %revised_swash;
3180
3181             # We construct a new converted list.
3182             my $list = "";
3183
3184             my @ranges = split "\n", $swash->{'LIST'};
3185             for (my $i = 0; $i < @ranges; $i++) {
3186                 my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
3187
3188                 # The dm property has maps that are space separated sequences
3189                 # of code points, as well as the special entry "<hangul
3190                 # syllable>", which also contains a blank.
3191                 my @map = split " ", $map;
3192                 if (@map > 1) {
3193
3194                     # If it's just the special entry, append as-is.
3195                     if ($map eq '<hangul syllable>') {
3196                         $list .= "$ranges[$i]\n";
3197                     }
3198                     else {
3199
3200                         # These should all be single-element ranges.
3201                         croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin;
3202
3203                         # Convert them to decimal, as that's what's expected.
3204                         $list .= "$hex_begin\t\t"
3205                             . join(" ", map { hex } @map)
3206                             . "\n";
3207                     }
3208                     next;
3209                 }
3210
3211                 # Here, the mapping doesn't have a blank, is for a single code
3212                 # point.
3213                 my $begin = hex $hex_begin;
3214                 my $end = (defined $hex_end && $hex_end ne "")
3215                         ? hex $hex_end
3216                         : $begin;
3217
3218                 # Again, the output is to be in decimal.
3219                 my $decimal_map = hex $map;
3220
3221                 # We know that multi-element ranges with the same mapping
3222                 # should not be adjusted, as after the adjustment
3223                 # multi-element ranges are for consecutive increasing code
3224                 # points.  Further, the final element in the list won't be
3225                 # adjusted, as there is nothing after it to include in the
3226                 # adjustment
3227                 if ($begin != $end || $i == @ranges -1) {
3228
3229                     # So just convert these to single-element ranges
3230                     foreach my $code_point ($begin .. $end) {
3231                         $list .= sprintf("%04X\t\t%d\n",
3232                                         $code_point, $decimal_map);
3233                     }
3234                 }
3235                 else {
3236
3237                     # Here, we have a candidate for adjusting.  What we do is
3238                     # look through the subsequent adjacent elements in the
3239                     # input.  If the map to the next one differs by 1 from the
3240                     # one before, then we combine into a larger range with the
3241                     # initial map.  Loop doing this until we find one that
3242                     # can't be combined.
3243
3244                     my $offset = 0;     # How far away are we from the initial
3245                                         # map
3246                     my $squished = 0;   # ? Did we squish at least two
3247                                         # elements together into one range
3248                     for ( ; $i < @ranges; $i++) {
3249                         my ($next_hex_begin, $next_hex_end, $next_map)
3250                                                 = split "\t", $ranges[$i+1];
3251
3252                         # In the case of 'dm', the map may be a sequence of
3253                         # multiple code points, which are never combined with
3254                         # another range
3255                         last if $next_map =~ / /;
3256
3257                         $offset++;
3258                         my $next_decimal_map = hex $next_map;
3259
3260                         # If the next map is not next in sequence, it
3261                         # shouldn't be combined.
3262                         last if $next_decimal_map != $decimal_map + $offset;
3263
3264                         my $next_begin = hex $next_hex_begin;
3265
3266                         # Likewise, if the next element isn't adjacent to the
3267                         # previous one, it shouldn't be combined.
3268                         last if $next_begin != $begin + $offset;
3269
3270                         my $next_end = (defined $next_hex_end
3271                                         && $next_hex_end ne "")
3272                                             ? hex $next_hex_end
3273                                             : $next_begin;
3274
3275                         # And finally, if the next element is a multi-element
3276                         # range, it shouldn't be combined.
3277                         last if $next_end != $next_begin;
3278
3279                         # Here, we will combine.  Loop to see if we should
3280                         # combine the next element too.
3281                         $squished = 1;
3282                     }
3283
3284                     if ($squished) {
3285
3286                         # Here, 'i' is the element number of the last element to
3287                         # be combined, and the range is single-element, or we
3288                         # wouldn't be combining.  Get its code point.
3289                         my ($hex_end, undef, undef) = split "\t", $ranges[$i];
3290                         $list .= "$hex_begin\t$hex_end\t$decimal_map\n";
3291                     } else {
3292
3293                         # Here, no combining done.  Just append the initial
3294                         # (and current) values.
3295                         $list .= "$hex_begin\t\t$decimal_map\n";
3296                     }
3297                 }
3298             } # End of loop constructing the converted list
3299
3300             # Finish up the data structure for our converted swash
3301             my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
3302             $revised_swash{'LIST'} = $list;
3303             $revised_swash{'TYPE'} = $type;
3304             $revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
3305             $swash = \%revised_swash;
3306
3307             $utf8::SwashInfo{$type}{'missing'} = 0;
3308             $utf8::SwashInfo{$type}{'format'} = 'a';
3309         }
3310     }
3311
3312     if ($swash->{'EXTRAS'}) {
3313         carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
3314         return;
3315     }
3316
3317     # Here, have a valid swash return.  Examine it.
3318     my $returned_prop = $swash->{'TYPE'};
3319
3320     # All properties but binary ones should have 'missing' and 'format'
3321     # entries
3322     $missing = $utf8::SwashInfo{$returned_prop}{'missing'};
3323     $missing = 'N' unless defined $missing;
3324
3325     $format = $utf8::SwashInfo{$returned_prop}{'format'};
3326     $format = 'b' unless defined $format;
3327
3328     my $requires_adjustment = $format =~ /^a/;
3329
3330     if ($swash->{'LIST'} =~ /^V/) {
3331         @invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
3332         shift @invlist;
3333         foreach my $i (0 .. @invlist - 1) {
3334             $invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
3335         }
3336
3337         # The map includes lines for all code points; add one for the range
3338         # from 0 to the first Y.
3339         if ($invlist[0] != 0) {
3340             unshift @invlist, 0;
3341             unshift @invmap, 'N';
3342         }
3343     }
3344     else {
3345         # The LIST input lines look like:
3346         # ...
3347         # 0374\t\tCommon
3348         # 0375\t0377\tGreek   # [3]
3349         # 037A\t037D\tGreek   # [4]
3350         # 037E\t\tCommon
3351         # 0384\t\tGreek
3352         # ...
3353         #
3354         # Convert them to like
3355         # 0374 => Common
3356         # 0375 => Greek
3357         # 0378 => $missing
3358         # 037A => Greek
3359         # 037E => Common
3360         # 037F => $missing
3361         # 0384 => Greek
3362         #
3363         # For binary properties, the final non-comment column is absent, and
3364         # assumed to be 'Y'.
3365
3366         foreach my $range (split "\n", $swash->{'LIST'}) {
3367             $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
3368
3369             # Find the beginning and end of the range on the line
3370             my ($hex_begin, $hex_end, $map) = split "\t", $range;
3371             my $begin = hex $hex_begin;
3372             no warnings 'portable';
3373             my $end = (defined $hex_end && $hex_end ne "")
3374                     ? hex $hex_end
3375                     : $begin;
3376
3377             # Each time through the loop (after the first):
3378             # $invlist[-2] contains the beginning of the previous range processed
3379             # $invlist[-1] contains the end+1 of the previous range processed
3380             # $invmap[-2] contains the value of the previous range processed
3381             # $invmap[-1] contains the default value for missing ranges
3382             #                                                       ($missing)
3383             #
3384             # Thus, things are set up for the typical case of a new
3385             # non-adjacent range of non-missings to be added.  But, if the new
3386             # range is adjacent, it needs to replace the [-1] element; and if
3387             # the new range is a multiple value of the previous one, it needs
3388             # to be added to the [-2] map element.
3389
3390             # The first time through, everything will be empty.  If the
3391             # property doesn't have a range that begins at 0, add one that
3392             # maps to $missing
3393             if (! @invlist) {
3394                 if ($begin != 0) {
3395                     push @invlist, 0;
3396                     push @invmap, $missing;
3397                 }
3398             }
3399             elsif (@invlist > 1 && $invlist[-2] == $begin) {
3400
3401                 # Here we handle the case where the input has multiple entries
3402                 # for each code point.  mktables should have made sure that
3403                 # each such range contains only one code point.  At this
3404                 # point, $invlist[-1] is the $missing that was added at the
3405                 # end of the last loop iteration, and [-2] is the last real
3406                 # input code point, and that code point is the same as the one
3407                 # we are adding now, making the new one a multiple entry.  Add
3408                 # it to the existing entry, either by pushing it to the
3409                 # existing list of multiple entries, or converting the single
3410                 # current entry into a list with both on it.  This is all we
3411                 # need do for this iteration.
3412
3413                 if ($end != $begin) {
3414                     croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
3415                 }
3416                 if (! ref $invmap[-2]) {
3417                     $invmap[-2] = [ $invmap[-2], $map ];
3418                 }
3419                 else {
3420                     push @{$invmap[-2]}, $map;
3421                 }
3422                 $has_multiples = 1;
3423                 next;
3424             }
3425             elsif ($invlist[-1] == $begin) {
3426
3427                 # If the input isn't in the most compact form, so that there
3428                 # are two adjacent ranges that map to the same thing, they
3429                 # should be combined (EXCEPT where the arrays require
3430                 # adjustments, in which case everything is already set up
3431                 # correctly).  This happens in our constructed dt mapping.
3432                 # Element [-2] is the map for the latest range so far
3433                 # processed.  Just set the beginning point of the map to
3434                 # $missing (in invlist[-1]) to 1 beyond where this range ends.
3435                 # For example, in
3436                 # 12\t13\tXYZ
3437                 # 14\t17\tXYZ
3438                 # we have set it up so that it looks like
3439                 # 12 => XYZ
3440                 # 14 => $missing
3441                 #
3442                 # We now see that it should be
3443                 # 12 => XYZ
3444                 # 18 => $missing
3445                 if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
3446                                     ? $invmap[-2] eq $map
3447                                     : $invmap[-2] eq 'Y'))
3448                 {
3449                     $invlist[-1] = $end + 1;
3450                     next;
3451                 }
3452
3453                 # Here, the range started in the previous iteration that maps
3454                 # to $missing starts at the same code point as this range.
3455                 # That means there is no gap for that range to fill, so we
3456                 # just pop it off the parallel arrays.
3457                 pop @invlist;
3458                 pop @invmap;
3459             }
3460
3461             # Add the range beginning, and the range's map.
3462             push @invlist, $begin;
3463             if ($returned_prop eq 'ToDm') {
3464
3465                 # The decomposition maps are either a line like <hangul
3466                 # syllable> which are to be taken as is; or a sequence of code
3467                 # points in hex and separated by blanks.  Convert them to
3468                 # decimal, and if there is more than one, use an anonymous
3469                 # array as the map.
3470                 if ($map =~ /^ < /x) {
3471                     push @invmap, $map;
3472                 }
3473                 else {
3474                     my @map = split " ", $map;
3475                     if (@map == 1) {
3476                         push @invmap, $map[0];
3477                     }
3478                     else {
3479                         push @invmap, \@map;
3480                     }
3481                 }
3482             }
3483             else {
3484
3485                 # Otherwise, convert hex formatted list entries to decimal;
3486                 # add a 'Y' map for the missing value in binary properties, or
3487                 # otherwise, use the input map unchanged.
3488                 $map = ($format eq 'x' || $format eq 'ax')
3489                     ? hex $map
3490                     : $format eq 'b'
3491                     ? 'Y'
3492                     : $map;
3493                 push @invmap, $map;
3494             }
3495
3496             # We just started a range.  It ends with $end.  The gap between it
3497             # and the next element in the list must be filled with a range
3498             # that maps to the default value.  If there is no gap, the next
3499             # iteration will pop this, unless there is no next iteration, and
3500             # we have filled all of the Unicode code space, so check for that
3501             # and skip.
3502             if ($end < $Unicode::UCD::MAX_CP) {
3503                 push @invlist, $end + 1;
3504                 push @invmap, $missing;
3505             }
3506         }
3507     }
3508
3509     # If the property is empty, make all code points use the value for missing
3510     # ones.
3511     if (! @invlist) {
3512         push @invlist, 0;
3513         push @invmap, $missing;
3514     }
3515
3516     # The final element is always for just the above-Unicode code points.  If
3517     # not already there, add it.  It merely splits the current final range
3518     # that extends to infinity into two elements, each with the same map.
3519     # (This is to conform with the API that says the final element is for
3520     # $MAX_UNICODE_CODEPOINT + 1 .. INFINITY.)
3521     if ($invlist[-1] != $MAX_UNICODE_CODEPOINT + 1) {
3522         push @invmap, $invmap[-1];
3523         push @invlist, $MAX_UNICODE_CODEPOINT + 1;
3524     }
3525
3526     # The second component of the map consists of those values that require
3527     # non-standard specification, stored in SPECIALS.  These override any
3528     # duplicate code points in LIST.  If we are using a proxy, we may have
3529     # already set $overrides based on the proxy.
3530     $overrides = $swash->{'SPECIALS'} unless defined $overrides;
3531     if ($overrides) {
3532
3533         # A negative $overrides implies that the SPECIALS should be ignored,
3534         # and a simple 'a' list is the value.
3535         if ($overrides < 0) {
3536             $format = 'a';
3537         }
3538         else {
3539
3540             # Currently, all overrides are for properties that normally map to
3541             # single code points, but now some will map to lists of code
3542             # points (but there is an exception case handled below).
3543             $format = 'al';
3544
3545             # Look through the overrides.
3546             foreach my $cp_maybe_utf8 (keys %$overrides) {
3547                 my $cp;
3548                 my @map;
3549
3550                 # If the overrides came from SPECIALS, the code point keys are
3551                 # packed UTF-8.
3552                 if ($overrides == $swash->{'SPECIALS'}) {
3553                     $cp = $cp_maybe_utf8;
3554                     if (! utf8::decode($cp)) {
3555                         croak __PACKAGE__, "::prop_invmap: Malformed UTF-8: ",
3556                               map { sprintf("\\x{%02X}", unpack("C", $_)) }
3557                                                                 split "", $cp;
3558                     }
3559
3560                     $cp = unpack("W", $cp);
3561                     @map = unpack "W*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
3562
3563                     # The empty string will show up unpacked as an empty
3564                     # array.
3565                     $format = 'ale' if @map == 0;
3566                 }
3567                 else {
3568
3569                     # But if we generated the overrides, we didn't bother to
3570                     # pack them, and we, so far, do this only for properties
3571                     # that are 'a' ones.
3572                     $cp = $cp_maybe_utf8;
3573                     @map = hex $overrides->{$cp};
3574                     $format = 'a';
3575                 }
3576
3577                 # Find the range that the override applies to.
3578                 my $i = search_invlist(\@invlist, $cp);
3579                 if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
3580                     croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
3581                 }
3582
3583                 # And what that range currently maps to
3584                 my $cur_map = $invmap[$i];
3585
3586                 # If there is a gap between the next range and the code point
3587                 # we are overriding, we have to add elements to both arrays to
3588                 # fill that gap, using the map that applies to it, which is
3589                 # $cur_map, since it is part of the current range.
3590                 if ($invlist[$i + 1] > $cp + 1) {
3591                     #use feature 'say';
3592                     #say "Before splice:";
3593                     #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
3594                     #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
3595                     #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
3596                     #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
3597                     #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
3598
3599                     splice @invlist, $i + 1, 0, $cp + 1;
3600                     splice @invmap, $i + 1, 0, $cur_map;
3601
3602                     #say "After splice:";
3603                     #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
3604                     #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
3605                     #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
3606                     #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
3607                     #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
3608                 }
3609
3610                 # If the remaining portion of the range is multiple code
3611                 # points (ending with the one we are replacing, guaranteed by
3612                 # the earlier splice), we must split it into two.
3613                 if ($invlist[$i] < $cp) {
3614                     $i++;   # Compensate for the new element
3615
3616                     #use feature 'say';
3617                     #say "Before splice:";
3618                     #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
3619                     #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
3620                     #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
3621                     #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
3622                     #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
3623
3624                     splice @invlist, $i, 0, $cp;
3625                     splice @invmap, $i, 0, 'dummy';
3626
3627                     #say "After splice:";
3628                     #say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
3629                     #say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
3630                     #say 'i  =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
3631                     #say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
3632                     #say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
3633                 }
3634
3635                 # Here, the range we are overriding contains a single code
3636                 # point.  The result could be the empty string, a single
3637                 # value, or a list.  If the last case, we use an anonymous
3638                 # array.
3639                 $invmap[$i] = (scalar @map == 0)
3640                                ? ""
3641                                : (scalar @map > 1)
3642                                   ? \@map
3643                                   : $map[0];
3644             }
3645         }
3646     }
3647     elsif ($format eq 'x') {
3648
3649         # All hex-valued properties are really to code points, and have been
3650         # converted to decimal.
3651         $format = 's';
3652     }
3653     elsif ($returned_prop eq 'ToDm') {
3654         $format = 'ad';
3655     }
3656     elsif ($format eq 'sw') { # blank-separated elements to form a list.
3657         map { $_ = [ split " ", $_  ] if $_ =~ / / } @invmap;
3658         $format = 'sl';
3659     }
3660     elsif ($returned_prop eq 'ToNameAlias') {
3661
3662         # This property currently doesn't have any lists, but theoretically
3663         # could
3664         $format = 'sl';
3665     }
3666     elsif ($returned_prop eq 'ToPerlDecimalDigit') {
3667         $format = 'ae';
3668     }
3669     elsif ($returned_prop eq 'ToNv') {
3670
3671         # The one property that has this format is stored as a delta, so needs
3672         # to indicate that need to add code point to it.
3673         $format = 'ar';
3674     }
3675     elsif ($format ne 'n' && $format ne 'a') {
3676
3677         # All others are simple scalars
3678         $format = 's';
3679     }
3680     if ($has_multiples &&  $format !~ /l/) {
3681         croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
3682     }
3683
3684     return (\@invlist, \@invmap, $format, $missing);
3685 }
3686
3687 sub search_invlist {
3688
3689 =pod
3690
3691 =head2 B<search_invlist()>
3692
3693  use Unicode::UCD qw(prop_invmap prop_invlist);
3694  use Unicode::UCD 'search_invlist';
3695
3696  my @invlist = prop_invlist($property_name);
3697  print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2)
3698                      ? " isn't"
3699                      : " is",
3700      " in $property_name\n";
3701
3702  my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block");
3703  my $index = search_invlist($blocks_ranges_ref, $code_point);
3704  print "$code_point is in block ", $blocks_map_ref->[$index], "\n";
3705
3706 C<search_invlist> is used to search an inversion list returned by
3707 C<prop_invlist> or C<prop_invmap> for a particular L</code point argument>.
3708 C<undef> is returned if the code point is not found in the inversion list
3709 (that is, it is not a legal L</code point argument>, the list is empty, or it
3710 is below the list's first element).  A warning is raised in the first case.
3711
3712 Otherwise, it returns the index into the list of the range that contains the
3713 code point; that is, it finds C<i> such that
3714
3715     list[i] <= code_point < list[i+1].
3716
3717 As explained in L</prop_invlist()>, whether a code point is in the list or not
3718 depends on if the index is even (in) or odd (not in).  And as explained in
3719 L</prop_invmap()>, the index is used with the returned parallel array to find
3720 the mapping.
3721
3722 =cut
3723
3724     my ($list_ref, $input_code_point) = @_;
3725
3726     # An argument that _getcode() can't turn into a code point draws a
3727     # warning and an empty return.
3728     my $code_point = _getcode($input_code_point);
3729     unless (defined $code_point) {
3730         carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'";
3731         return;
3732     }
3733
3734     my $last_index = @$list_ref - 1;
3735
3736     # There is nothing to find in an empty list, nor when the code point
3737     # lies below the start of the first range.
3738     return if $last_index < 0 || $code_point < $list_ref->[0];
3739
3740     # Anything at or beyond the start of the final range belongs to that
3741     # range.  Disposing of this case here also guarantees that element
3742     # [$mid+1] exists whenever the loop below examines it.
3743     return $last_index if $code_point >= $list_ref->[$last_index];
3744
3745     use integer;        # the midpoints below need integer division
3746
3747     # Binary search.  At the top of every iteration this holds:
3748     #   $list_ref->[$low] <= $code_point < $list_ref->[$high]
3749     my ($low, $high) = (0, $last_index);
3750     my $mid = $last_index / 2;
3751
3752     while (1) {
3753
3754         if ($code_point < $list_ref->[$mid]) {
3755
3756             # The wanted range starts somewhere below $mid.
3757             $high = $mid;
3758         }
3759         elsif ($code_point < $list_ref->[$mid+1]) {
3760
3761             # Both ends of $mid's range bracket the code point: found it.
3762             return $mid;
3763         }
3764         else {
3765
3766             # The wanted range starts somewhere above $mid.
3767             $low = $mid;
3768         }
3769
3770         # Bisect what remains of the search domain.
3771         my $next = ($low + $high) / 2;
3772
3773         # Once the midpoint stops moving, another pass could learn nothing
3774         # new, so settle for the current position.
3775         return $mid if $next == $mid;
3776
3777         $mid = $next;
3778     }
3779 }
3780
3781 =head2 Unicode::UCD::UnicodeVersion
3782
3783 This returns the version of the Unicode Character Database, in other words, the
3784 version of the Unicode standard the database implements.  The version is a
3785 string of numbers delimited by dots (C<'.'>).
3786
3787 =cut
3788
3789 my $UNICODEVERSION;
3790
3791 sub UnicodeVersion {    # returns the dotted version string, e.g. "6.2.0"
3792     unless (defined $UNICODEVERSION) {    # first call: read, check, cache
3793         openunicode(\$VERSIONFH, "version");
3794         chomp(my $version = do { local $/ = "\n"; <$VERSIONFH> });
3795         close($VERSIONFH);
3796         # A failed match stores undef, so a bad version is never cached.
3797         croak __PACKAGE__, "::VERSION: strange version '$version'"
3798             unless ($UNICODEVERSION) = $version =~ /^(\d+(?:\.\d+)+)$/;
3799     }
3800     $v_unicode_version = pack "C*", split /\./, $UNICODEVERSION;
3801     return $UNICODEVERSION;
3802 }
3803
3804 =head2 B<Blocks versus Scripts>
3805
3806 The difference between a block and a script is that scripts are closer
3807 to the linguistic notion of a set of code points required to represent
3808 languages, while a block is more of an artifact of the Unicode code point
3809 numbering and separation into blocks of consecutive code points (so far the
3810 size of a block is some multiple of 16, like 128 or 256).
3811
3812 For example the Latin B<script> is spread over several B<blocks>, such
3813 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
3814 C<Latin Extended-B>.  On the other hand, the Latin script does not
3815 contain all the characters of the C<Basic Latin> block (also known as
3816 ASCII): it includes only the letters, and not, for example, the digits
3817 nor the punctuation.
3818
3819 For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
3820
3821 For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
3822
3823 =head2 B<Matching Scripts and Blocks>
3824
3825 Scripts are matched with the regular-expression construct
3826 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
3827 while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
3828 any of the 256 code points in the Tibetan block).
3829
3830 =head2 Old-style versus new-style block names
3831
3832 Unicode publishes the names of blocks in two different styles, though the two
3833 are equivalent under Unicode's loose matching rules.
3834
3835 The original style uses blanks and hyphens in the block names (except for
3836 C<No_Block>), like so:
3837
3838  Miscellaneous Mathematical Symbols-B
3839
3840 The newer style replaces these with underscores, like this:
3841
3842  Miscellaneous_Mathematical_Symbols_B
3843
3844 This newer style is consistent with the values of other Unicode properties.
3845 To preserve backward compatibility, all the functions in Unicode::UCD that
3846 return block names (except one) return the old-style ones.  That one function,
3847 L</prop_value_aliases()>, can be used to convert from old-style to new-style:
3848
3849  my $new_style = prop_value_aliases("block", $old_style);
3850
3851 Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>,
3852 meaning C<Block=Cyrillic>.  These have always been written in the new style.
3853
3854 To convert from new-style to old-style, follow this recipe:
3855
3856  $old_style = charblock((prop_invlist("block=$new_style"))[0]);
3857
3858 (which finds the range of code points in the block using C<prop_invlist>,
3859 gets the lower end of the range (0th element) and then looks up the old name
3860 for its block using C<charblock>).
3861
3862 Note that starting in Unicode 6.1, many of the block names have shorter
3863 synonyms.  These are always given in the new style.
3864
3865 =head1 AUTHOR
3866
3867 Jarkko Hietaniemi.  Now maintained by perl5 porters.
3868
3869 =cut
3870
3871 1;