lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5 use charnames ();
   6
   7 our $VERSION = '0.32';
   8
   9 use Storable qw(dclone);
  10
  11 require Exporter;
  12
  13 our @ISA = qw(Exporter);
  14
  15 our @EXPORT_OK = qw(charinfo
  16                     charblock charscript
  17                     charblocks charscripts
  18                     charinrange
  19                     general_categories bidi_types
  20                     compexcl
  21                     casefold casespec
  22                     namedseq
  23                     num
  24                 );
  25
  26 use Carp;
  27
  28 =head1 NAME
  29
  30 Unicode::UCD - Unicode character database
  31
  32 =head1 SYNOPSIS
  33
  34     use Unicode::UCD 'charinfo';
  35     my $charinfo   = charinfo($codepoint);
  36
  37     use Unicode::UCD 'casefold';
  38     my $casefold = casefold(0xFB00);
  39
  40     use Unicode::UCD 'casespec';
  41     my $casespec = casespec(0xFB00);
  42
  43     use Unicode::UCD 'charblock';
  44     my $charblock  = charblock($codepoint);
  45
  46     use Unicode::UCD 'charscript';
  47     my $charscript = charscript($codepoint);
  48
  49     use Unicode::UCD 'charblocks';
  50     my $charblocks = charblocks();
  51
  52     use Unicode::UCD 'charscripts';
  53     my $charscripts = charscripts();
  54
  55     use Unicode::UCD qw(charscript charinrange);
  56     my $range = charscript($script);
  57     print "looks like $script\n" if charinrange($range, $codepoint);
  58
  59     use Unicode::UCD qw(general_categories bidi_types);
  60     my $categories = general_categories();
  61     my $types = bidi_types();
  62
  63     use Unicode::UCD 'compexcl';
  64     my $compexcl = compexcl($codepoint);
  65
  66     use Unicode::UCD 'namedseq';
  67     my $namedseq = namedseq($named_sequence_name);
  68
  69     my $unicode_version = Unicode::UCD::UnicodeVersion();
  70
  71     my $convert_to_numeric =
  72                 Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
  73
  74 =head1 DESCRIPTION
  75
  76 The Unicode::UCD module offers a series of functions that
  77 provide a simple interface to the Unicode
  78 Character Database.
  79
  80 =head2 code point argument
  81
  82 Some of the functions are called with a I<code point argument>, which is either
  83 a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
  84 followed by hexadecimals designating a Unicode code point.  In other words, if
  85 you want a code point to be interpreted as a hexadecimal number, you must
  86 prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
  87 interpreted as a decimal code point.  Also note that Unicode is B<not> limited
  88 to 16 bits (the number of Unicode code points is open-ended, in theory
  89 unlimited): you may have more than 4 hexdigits.
  90 =cut
  91
  92 my $UNICODEFH;
  93 my $BLOCKSFH;
  94 my $VERSIONFH;
  95 my $COMPEXCLFH;
  96 my $CASEFOLDFH;
  97 my $CASESPECFH;
  98 my $NAMEDSEQFH;
  99
 100 sub openunicode {
 101     my ($rfh, @path) = @_;
 102     my $f;
 103     unless (defined $$rfh) {
 104         for my $d (@INC) {
 105             use File::Spec;
 106             $f = File::Spec->catfile($d, "unicore", @path);
 107             last if open($$rfh, $f);
 108             undef $f;
 109         }
 110         croak __PACKAGE__, ": failed to find ",
 111               File::Spec->catfile(@path), " in @INC"
 112             unless defined $f;
 113     }
 114     return $f;
 115 }
 116
 117 =head2 B<charinfo()>
 118
 119     use Unicode::UCD 'charinfo';
 120
 121     my $charinfo = charinfo(0x41);
 122
 123 This returns information about the input L</code point argument>
 124 as a reference to a hash of fields as defined by the Unicode
 125 standard.  If the L</code point argument> is not assigned in the standard
 126 (i.e., has the general category C<Cn> meaning C<Unassigned>)
 127 or is a non-character (meaning it is guaranteed to never be assigned in
 128 the standard),
 129 B<undef> is returned.
 130
 131 Fields that aren't applicable to the particular code point argument exist in the
 132 returned hash, and are empty.
 133
 134 The keys in the hash with the meanings of their values are:
 135
 136 =over
 137
 138 =item B<code>
 139
 140 the input L</code point argument> expressed in hexadecimal, with leading zeros
 141 added if necessary to make it contain at least four hexdigits
 142
 143 =item B<name>
 144
 145 name of I<code>, all IN UPPER CASE.
 146 Some control-type code points do not have names.
 147 This field will be empty for C<Surrogate> and C<Private Use> code points,
 148 and for the others without a name,
 149 it will contain a description enclosed in angle brackets, like
 150 C<E<lt>controlE<gt>>.
 151
 152
 153 =item B<category>
 154
 155 The short name of the general category of I<code>.
 156 This will match one of the keys in the hash returned by L</general_categories()>.
 157
 158 =item B<combining>
 159
 160 the combining class number for I<code> used in the Canonical Ordering Algorithm.
 161 For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
 162 available at
 163 L<http://www.unicode.org/versions/Unicode5.1.0/>
 164
 165 =item B<bidi>
 166
 167 bidirectional type of I<code>.
 168 This will match one of the keys in the hash returned by L</bidi_types()>.
 169
 170 =item B<decomposition>
 171
 172 is empty if I<code> has no decomposition; or is one or more codes
 173 (separated by spaces) that taken in order represent a decomposition for
 174 I<code>.  Each has at least four hexdigits.
 175 The codes may be preceded by a word enclosed in angle brackets then a space,
  176 like C<E<lt>compatE<gt> >, giving the type of decomposition.
 177
 178 This decomposition may be an intermediate one whose components are also
 179 decomposable.  Use L<Unicode::Normalize> to get the final decomposition.
 180
 181 =item B<decimal>
 182
 183 if I<code> is a decimal digit this is its integer numeric value
 184
 185 =item B<digit>
 186
 187 if I<code> represents some other digit-like number, this is its integer
 188 numeric value
 189
 190 =item B<numeric>
 191
 192 if I<code> represents a whole or rational number, this is its numeric value.
 193 Rational values are expressed as a string like C<1/4>.
 194
 195 =item B<mirrored>
 196
 197 C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
 198
 199 =item B<unicode10>
 200
 201 name of I<code> in the Unicode 1.0 standard if one
 202 existed for this code point and is different from the current name
 203
 204 =item B<comment>
 205
 206 As of Unicode 6.0, this is always empty.
 207
 208 =item B<upper>
 209
 210 is empty if there is no single code point uppercase mapping for I<code>
  211 (its uppercase mapping is itself);
 212 otherwise it is that mapping expressed as at least four hexdigits.
 213 (L</casespec()> should be used in addition to B<charinfo()>
 214 for case mappings when the calling program can cope with multiple code point
 215 mappings.)
 216
 217 =item B<lower>
 218
 219 is empty if there is no single code point lowercase mapping for I<code>
  220 (its lowercase mapping is itself);
 221 otherwise it is that mapping expressed as at least four hexdigits.
 222 (L</casespec()> should be used in addition to B<charinfo()>
 223 for case mappings when the calling program can cope with multiple code point
 224 mappings.)
 225
 226 =item B<title>
 227
 228 is empty if there is no single code point titlecase mapping for I<code>
  229 (its titlecase mapping is itself);
 230 otherwise it is that mapping expressed as at least four hexdigits.
 231 (L</casespec()> should be used in addition to B<charinfo()>
 232 for case mappings when the calling program can cope with multiple code point
 233 mappings.)
 234
 235 =item B<block>
 236
 237 block I<code> belongs to (used in C<\p{Blk=...}>).
 238 See L</Blocks versus Scripts>.
 239
 240
 241 =item B<script>
 242
 243 script I<code> belongs to.
 244 See L</Blocks versus Scripts>.
 245
 246 =back
 247
 248 Note that you cannot do (de)composition and casing based solely on the
 249 I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
 250 you will need also the L</compexcl()>, and L</casespec()> functions.
 251
 252 =cut
 253
 254 # NB: This function is nearly duplicated in charnames.pm
 255 sub _getcode {
 256     my $arg = shift;
 257
 258     if ($arg =~ /^[1-9]\d*$/) {
 259         return $arg;
 260     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 261         return hex($1);
 262     }
 263
 264     return;
 265 }
 266
 267 # Lingua::KO::Hangul::Util not part of the standard distribution
 268 # but it will be used if available.
 269
 270 eval { require Lingua::KO::Hangul::Util };
 271 my $hasHangulUtil = ! $@;
 272 if ($hasHangulUtil) {
 273     Lingua::KO::Hangul::Util->import();
 274 }
 275
 276 sub hangul_decomp { # internal: called from charinfo
 277     if ($hasHangulUtil) {
 278         my @tmp = decomposeHangul(shift);
 279         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 280         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 281     }
 282     return;
 283 }
 284
 285 sub hangul_charname { # internal: called from charinfo
 286     return sprintf("HANGUL SYLLABLE-%04X", shift);
 287 }
 288
 289 sub han_charname { # internal: called from charinfo
 290     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 291 }
 292
 293 # Overwritten by data in file
 294 my %first_last = (
 295    'CJK Ideograph Extension A' => [ 0x3400,   0x4DB5   ],
 296    'CJK Ideograph'             => [ 0x4E00,   0x9FA5   ],
 297    'CJK Ideograph Extension B' => [ 0x20000,  0x2A6D6  ],
 298 );
 299
 300 get_charinfo_ranges();
 301
 302 sub get_charinfo_ranges {
 303    my @blocks = keys %first_last;
 304
 305    my $fh;
 306    openunicode( \$fh, 'UnicodeData.txt' );
 307    if( defined $fh ){
 308       while( my $line = <$fh> ){
 309          next unless $line =~ /(?:First|Last)/;
 310          if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
 311             my ($number,$block,$type);
 312             ($number,$block) = split /;/, $line;
 313             $block =~ s/<|>//g;
 314             ($block,$type) = split /, /, $block;
 315             my $index = $type eq 'First' ? 0 : 1;
 316             $first_last{ $block }->[$index] = hex $number;
 317          }
 318       }
 319    }
 320 }
 321
 322 my @CharinfoRanges = (
 323 # block name
 324 # [ first, last, coderef to name, coderef to decompose ],
 325 # CJK Ideographs Extension A
 326   [ @{ $first_last{'CJK Ideograph Extension A'} },        \&han_charname,   undef  ],
 327 # CJK Ideographs
 328   [ @{ $first_last{'CJK Ideograph'} },                    \&han_charname,   undef  ],
 329 # Hangul Syllables
 330   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 331 # Non-Private Use High Surrogates
 332   [ 0xD800,   0xDB7F,   undef,   undef  ],
 333 # Private Use High Surrogates
 334   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 335 # Low Surrogates
 336   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 337 # The Private Use Area
 338   [ 0xE000,   0xF8FF,   undef,   undef  ],
 339 # CJK Ideographs Extension B
 340   [ @{ $first_last{'CJK Ideograph Extension B'} },        \&han_charname,   undef  ],
 341 # Plane 15 Private Use Area
 342   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 343 # Plane 16 Private Use Area
 344   [ 0x100000, 0x10FFFD, undef,   undef  ],
 345 );
 346
 347 sub charinfo {
 348     my $arg  = shift;
 349     my $code = _getcode($arg);
 350     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 351         unless defined $code;
 352     my $hexk = sprintf("%06X", $code);
 353     my($rcode,$rname,$rdec);
 354     foreach my $range (@CharinfoRanges){
 355       if ($range->[0] <= $code && $code <= $range->[1]) {
 356         $rcode = $hexk;
 357         $rcode =~ s/^0+//;
 358         $rcode =  sprintf("%04X", hex($rcode));
 359         $rname = $range->[2] ? $range->[2]->($code) : '';
 360         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 361         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 362         last;
 363       }
 364     }
 365     openunicode(\$UNICODEFH, "UnicodeData.txt");
 366     if (defined $UNICODEFH) {
 367         use Search::Dict 1.02;
 368         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 369             my $line = <$UNICODEFH>;
 370             return unless defined $line;
 371             chomp $line;
 372             my %prop;
 373             @prop{qw(
 374                      code name category
 375                      combining bidi decomposition
 376                      decimal digit numeric
 377                      mirrored unicode10 comment
 378                      upper lower title
 379                     )} = split(/;/, $line, -1);
 380             $hexk =~ s/^0+//;
 381             $hexk =  sprintf("%04X", hex($hexk));
 382             if ($prop{code} eq $hexk) {
 383                 $prop{block}  = charblock($code);
 384                 $prop{script} = charscript($code);
 385                 if(defined $rname){
 386                     $prop{code} = $rcode;
 387                     $prop{name} = $rname;
 388                     $prop{decomposition} = $rdec;
 389                 }
 390                 return \%prop;
 391             }
 392         }
 393     }
 394     return;
 395 }
 396
 397 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 398     my ($table, $lo, $hi, $code) = @_;
 399
 400     return if $lo > $hi;
 401
 402     my $mid = int(($lo+$hi) / 2);
 403
 404     if ($table->[$mid]->[0] < $code) {
 405         if ($table->[$mid]->[1] >= $code) {
 406             return $table->[$mid]->[2];
 407         } else {
 408             _search($table, $mid + 1, $hi, $code);
 409         }
 410     } elsif ($table->[$mid]->[0] > $code) {
 411         _search($table, $lo, $mid - 1, $code);
 412     } else {
 413         return $table->[$mid]->[2];
 414     }
 415 }
 416
 417 sub _read_table {
 418
 419     # Returns the contents of the mktables generated table file located at $1
 420     # in the form of an array of arrays.  Each outer array denotes a range
 421     # with [0] the start point of that range; [1] the end point; and [2] the
 422     # value that every code point in the range has.
 423     #
 424     # This has the side effect of setting
 425     # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
 426     #                                       table; and
 427     # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
 428     #                                        not listed in the table.
 429     # where $property is the Unicode property name, preceded by 'To' for map
 430     # properties., e.g., 'ToSc'.
 431     #
 432     # Table entries look like one of:
 433     # 0000      0040    Common  # [65]
 434     # 00AA              Latin
 435
 436     my $table = shift;
 437     my @return;
 438     local $_;
 439
 440     for (split /^/m, do $table) {
 441         my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
 442                                         \s* ( \# .* )?  # Optional comment
 443                                         $ /x;
 444         $end = $start if $end eq "";
 445         push @return, [ hex $start, hex $end, $value ];
 446     }
 447     return @return;
 448 }
 449
 450 sub charinrange {
 451     my ($range, $arg) = @_;
 452     my $code = _getcode($arg);
 453     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 454         unless defined $code;
 455     _search($range, 0, $#$range, $code);
 456 }
 457
 458 =head2 B<charblock()>
 459
 460     use Unicode::UCD 'charblock';
 461
 462     my $charblock = charblock(0x41);
 463     my $charblock = charblock(1234);
 464     my $charblock = charblock(0x263a);
 465     my $charblock = charblock("U+263a");
 466
 467     my $range     = charblock('Armenian');
 468
 469 With a L</code point argument> charblock() returns the I<block> the code point
 470 belongs to, e.g.  C<Basic Latin>.
 471 If the code point is unassigned, this returns the block it would belong to if
 472 it were assigned (which it may in future versions of the Unicode Standard).
 473
 474 See also L</Blocks versus Scripts>.
 475
 476 If supplied with an argument that can't be a code point, charblock() tries
 477 to do the opposite and interpret the argument as a code point block. The
 478 return value is a I<range>: an anonymous list of lists that contain
 479 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
 480 a code point is in a range using the L</charinrange()> function. If the
 481 argument is not a known code point block, B<undef> is returned.
 482
 483 =cut
 484
 485 my @BLOCKS;
 486 my %BLOCKS;
 487
 488 sub _charblocks {
 489
 490     # Can't read from the mktables table because it loses the hyphens in the
 491     # original.
 492     unless (@BLOCKS) {
 493         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 494             local $_;
 495             while (<$BLOCKSFH>) {
 496                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 497                     my ($lo, $hi) = (hex($1), hex($2));
 498                     my $subrange = [ $lo, $hi, $3 ];
 499                     push @BLOCKS, $subrange;
 500                     push @{$BLOCKS{$3}}, $subrange;
 501                 }
 502             }
 503             close($BLOCKSFH);
 504         }
 505     }
 506 }
 507
 508 sub charblock {
 509     my $arg = shift;
 510
 511     _charblocks() unless @BLOCKS;
 512
 513     my $code = _getcode($arg);
 514
 515     if (defined $code) {
 516         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 517     } else {
 518         if (exists $BLOCKS{$arg}) {
 519             return dclone $BLOCKS{$arg};
 520         } else {
 521             return;
 522         }
 523     }
 524 }
 525
 526 =head2 B<charscript()>
 527
 528     use Unicode::UCD 'charscript';
 529
 530     my $charscript = charscript(0x41);
 531     my $charscript = charscript(1234);
 532     my $charscript = charscript("U+263a");
 533
 534     my $range      = charscript('Thai');
 535
 536 With a L</code point argument> charscript() returns the I<script> the
 537 code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
  538 If the code point is unassigned, it returns B<undef>.
 539
 540 If supplied with an argument that can't be a code point, charscript() tries
 541 to do the opposite and interpret the argument as a code point script. The
 542 return value is a I<range>: an anonymous list of lists that contain
 543 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 544 code point is in a range using the L</charinrange()> function. If the
 545 argument is not a known code point script, B<undef> is returned.
 546
 547 See also L</Blocks versus Scripts>.
 548
 549 =cut
 550
 551 my @SCRIPTS;
 552 my %SCRIPTS;
 553
 554 sub _charscripts {
 555     @SCRIPTS =_read_table("unicore/To/Sc.pl") unless @SCRIPTS;
 556     foreach my $entry (@SCRIPTS) {
 557         push @{$SCRIPTS{$entry->[2]}}, $entry;
 558     }
 559 }
 560
 561 sub charscript {
 562     my $arg = shift;
 563
 564     _charscripts() unless @SCRIPTS;
 565
 566     my $code = _getcode($arg);
 567
 568     if (defined $code) {
 569         my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 570         return $result if defined $result;
 571         #return $utf8::SwashInfo{'ToSc'}{'missing'};
 572     } elsif (exists $SCRIPTS{$arg}) {
 573         return dclone $SCRIPTS{$arg};
 574     }
 575
 576     return;
 577 }
 578
 579 =head2 B<charblocks()>
 580
 581     use Unicode::UCD 'charblocks';
 582
 583     my $charblocks = charblocks();
 584
 585 charblocks() returns a reference to a hash with the known block names
 586 as the keys, and the code point ranges (see L</charblock()>) as the values.
 587
 588 See also L</Blocks versus Scripts>.
 589
 590 =cut
 591
 592 sub charblocks {
 593     _charblocks() unless %BLOCKS;
 594     return dclone \%BLOCKS;
 595 }
 596
 597 =head2 B<charscripts()>
 598
 599     use Unicode::UCD 'charscripts';
 600
 601     my $charscripts = charscripts();
 602
 603 charscripts() returns a reference to a hash with the known script
 604 names as the keys, and the code point ranges (see L</charscript()>) as
 605 the values.
 606
 607 See also L</Blocks versus Scripts>.
 608
 609 =cut
 610
 611 sub charscripts {
 612     _charscripts() unless %SCRIPTS;
 613     return dclone \%SCRIPTS;
 614 }
 615
 616 =head2 B<charinrange()>
 617
 618 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 619 can also test whether a code point is in the I<range> as returned by
 620 L</charblock()> and L</charscript()> or as the values of the hash returned
 621 by L</charblocks()> and L</charscripts()> by using charinrange():
 622
 623     use Unicode::UCD qw(charscript charinrange);
 624
 625     $range = charscript('Hiragana');
 626     print "looks like hiragana\n" if charinrange($range, $codepoint);
 627
 628 =cut
 629
 630 my %GENERAL_CATEGORIES =
 631  (
 632     'L'  =>         'Letter',
 633     'LC' =>         'CasedLetter',
 634     'Lu' =>         'UppercaseLetter',
 635     'Ll' =>         'LowercaseLetter',
 636     'Lt' =>         'TitlecaseLetter',
 637     'Lm' =>         'ModifierLetter',
 638     'Lo' =>         'OtherLetter',
 639     'M'  =>         'Mark',
 640     'Mn' =>         'NonspacingMark',
 641     'Mc' =>         'SpacingMark',
 642     'Me' =>         'EnclosingMark',
 643     'N'  =>         'Number',
 644     'Nd' =>         'DecimalNumber',
 645     'Nl' =>         'LetterNumber',
 646     'No' =>         'OtherNumber',
 647     'P'  =>         'Punctuation',
 648     'Pc' =>         'ConnectorPunctuation',
 649     'Pd' =>         'DashPunctuation',
 650     'Ps' =>         'OpenPunctuation',
 651     'Pe' =>         'ClosePunctuation',
 652     'Pi' =>         'InitialPunctuation',
 653     'Pf' =>         'FinalPunctuation',
 654     'Po' =>         'OtherPunctuation',
 655     'S'  =>         'Symbol',
 656     'Sm' =>         'MathSymbol',
 657     'Sc' =>         'CurrencySymbol',
 658     'Sk' =>         'ModifierSymbol',
 659     'So' =>         'OtherSymbol',
 660     'Z'  =>         'Separator',
 661     'Zs' =>         'SpaceSeparator',
 662     'Zl' =>         'LineSeparator',
 663     'Zp' =>         'ParagraphSeparator',
 664     'C'  =>         'Other',
 665     'Cc' =>         'Control',
 666     'Cf' =>         'Format',
 667     'Cs' =>         'Surrogate',
 668     'Co' =>         'PrivateUse',
 669     'Cn' =>         'Unassigned',
 670  );
 671
 672 sub general_categories {
 673     return dclone \%GENERAL_CATEGORIES;
 674 }
 675
 676 =head2 B<general_categories()>
 677
 678     use Unicode::UCD 'general_categories';
 679
 680     my $categories = general_categories();
 681
 682 This returns a reference to a hash which has short
 683 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
 684 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
 685 C<Symbol>) as values.  The hash is reversible in case you need to go
 686 from the long names to the short names.  The general category is the
 687 one returned from
 688 L</charinfo()> under the C<category> key.
 689
 690 =cut
 691
 692 my %BIDI_TYPES =
 693  (
 694    'L'   => 'Left-to-Right',
 695    'LRE' => 'Left-to-Right Embedding',
 696    'LRO' => 'Left-to-Right Override',
 697    'R'   => 'Right-to-Left',
 698    'AL'  => 'Right-to-Left Arabic',
 699    'RLE' => 'Right-to-Left Embedding',
 700    'RLO' => 'Right-to-Left Override',
 701    'PDF' => 'Pop Directional Format',
 702    'EN'  => 'European Number',
 703    'ES'  => 'European Number Separator',
 704    'ET'  => 'European Number Terminator',
 705    'AN'  => 'Arabic Number',
 706    'CS'  => 'Common Number Separator',
 707    'NSM' => 'Non-Spacing Mark',
 708    'BN'  => 'Boundary Neutral',
 709    'B'   => 'Paragraph Separator',
 710    'S'   => 'Segment Separator',
 711    'WS'  => 'Whitespace',
 712    'ON'  => 'Other Neutrals',
 713  );
 714
 715 =head2 B<bidi_types()>
 716
 717     use Unicode::UCD 'bidi_types';
 718
 719     my $categories = bidi_types();
 720
 721 This returns a reference to a hash which has the short
 722 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
 723 names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
 724 hash is reversible in case you need to go from the long names to the
 725 short names.  The bidi type is the one returned from
 726 L</charinfo()>
 727 under the C<bidi> key.  For the exact meaning of the various bidi classes
 728 the Unicode TR9 is recommended reading:
 729 L<http://www.unicode.org/reports/tr9/>
 730 (as of Unicode 5.0.0)
 731
 732 =cut
 733
 734 sub bidi_types {
 735     return dclone \%BIDI_TYPES;
 736 }
 737
 738 =head2 B<compexcl()>
 739
 740     use Unicode::UCD 'compexcl';
 741
 742     my $compexcl = compexcl(0x09dc);
 743
 744 This routine is included for backwards compatibility, but as of Perl 5.12, for
 745 most purposes it is probably more convenient to use one of the following
 746 instead:
 747
  748     my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex}/;
  749     my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion}/;
 750
 751 or even
 752
  753     my $compexcl = chr(0x09dc) =~ /\p{CE}/;
  754     my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion}/;
 755
 756 The first two forms return B<true> if the L</code point argument> should not
 757 be produced by composition normalization.  The final two forms
 758 additionally require that this fact not otherwise be determinable from
 759 the Unicode data base for them to return B<true>.
 760
 761 This routine behaves identically to the final two forms.  That is,
 762 it does not return B<true> if the code point has a decomposition
 763 consisting of another single code point, nor if its decomposition starts
 764 with a code point whose combining class is non-zero.  Code points that meet
 765 either of these conditions should also not be produced by composition
 766 normalization, which is probably why you should use the
 767 C<Full_Composition_Exclusion> property instead, as shown above.
 768
 769 The routine returns B<false> otherwise.
 770
 771 =cut
 772
 773 sub compexcl {
 774     my $arg  = shift;
 775     my $code = _getcode($arg);
 776     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 777         unless defined $code;
 778
 779     no warnings "utf8";     # So works on surrogates and non-Unicode code points
 780     return chr($code) =~ /\p{Composition_Exclusion}/;
 781 }
 782
 783 =head2 B<casefold()>
 784
 785     use Unicode::UCD 'casefold';
 786
 787     my $casefold = casefold(0xDF);
 788     if (defined $casefold) {
 789         my @full_fold_hex = split / /, $casefold->{'full'};
 790         my $full_fold_string =
 791                     join "", map {chr(hex($_))} @full_fold_hex;
 792         my @turkic_fold_hex =
 793                         split / /, ($casefold->{'turkic'} ne "")
 794                                         ? $casefold->{'turkic'}
 795                                         : $casefold->{'full'};
 796         my $turkic_fold_string =
 797                         join "", map {chr(hex($_))} @turkic_fold_hex;
 798     }
 799     if (defined $casefold && $casefold->{'simple'} ne "") {
 800         my $simple_fold_hex = $casefold->{'simple'};
 801         my $simple_fold_string = chr(hex($simple_fold_hex));
 802     }
 803
 804 This returns the (almost) locale-independent case folding of the
 805 character specified by the L</code point argument>.
 806
 807 If there is no case folding for that code point, B<undef> is returned.
 808
 809 If there is a case folding for that code point, a reference to a hash
 810 with the following fields is returned:
 811
 812 =over
 813
 814 =item B<code>
 815
 816 the input L</code point argument> expressed in hexadecimal, with leading zeros
 817 added if necessary to make it contain at least four hexdigits
 818
 819 =item B<full>
 820
 821 one or more codes (separated by spaces) that taken in order give the
 822 code points for the case folding for I<code>.
 823 Each has at least four hexdigits.
 824
 825 =item B<simple>
 826
 827 is empty, or is exactly one code with at least four hexdigits which can be used
 828 as an alternative case folding when the calling program cannot cope with the
 829 fold being a sequence of multiple code points.  If I<full> is just one code
 830 point, then I<simple> equals I<full>.  If there is no single code point folding
 831 defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
 832 inferior, but still better-than-nothing alternative folding to I<full>.
 833
 834 =item B<mapping>
 835
 836 is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
 837 otherwise.  It can be considered to be the simplest possible folding for
 838 I<code>.  It is defined primarily for backwards compatibility.
 839
 840 =item B<status>
 841
 842 is C<C> (for C<common>) if the best possible fold is a single code point
 843 (I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
 844 folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
  845 there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).  Note
 846 that this
 847 describes the contents of I<mapping>.  It is defined primarily for backwards
 848 compatibility.
 849
 850 On versions 3.1 and earlier of Unicode, I<status> can also be
 851 C<I> which is the same as C<C> but is a special case for dotted uppercase I and
 852 dotless lowercase i:
 853
 854 =over
 855
 856 =item B<*>
 857
 858 If you use this C<I> mapping, the result is case-insensitive,
 859 but dotless and dotted I's are not distinguished
 860
 861 =item B<*>
 862
 863 If you exclude this C<I> mapping, the result is not fully case-insensitive, but
 864 dotless and dotted I's are distinguished
 865
 866 =back
 867
 868 =item B<turkic>
 869
 870 contains any special folding for Turkic languages.  For versions of Unicode
 871 starting with 3.2, this field is empty unless I<code> has a different folding
 872 in Turkic languages, in which case it is one or more codes (separated by
 873 spaces) that taken in order give the code points for the case folding for
 874 I<code> in those languages.
 875 Each code has at least four hexdigits.
 876 Note that this folding does not maintain canonical equivalence without
 877 additional processing.
 878
 879 For versions of Unicode 3.1 and earlier, this field is empty unless there is a
 880 special folding for Turkic languages, in which case I<status> is C<I>, and
 881 I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
 882
 883 =back
 884
 885 Programs that want complete generality and the best folding results should use
 886 the folding contained in the I<full> field.  But note that the fold for some
 887 code points will be a sequence of multiple code points.
 888
 889 Programs that can't cope with the fold mapping being multiple code points can
 890 use the folding contained in the I<simple> field, with the loss of some
 891 generality.  In Unicode 5.1, about 7% of the defined foldings have no single
 892 code point folding.
 893
 894 The I<mapping> and I<status> fields are provided for backwards compatibility for
 895 existing programs.  They contain the same values as in previous versions of
 896 this function.
 897
 898 Locale is not completely independent.  The I<turkic> field contains results to
 899 use when the locale is a Turkic language.
 900
 901 For more information about case mappings see
 902 L<http://www.unicode.org/unicode/reports/tr21>
 903
 904 =cut
 905
 906 my %CASEFOLD;
 907
 908 sub _casefold {
 909     unless (%CASEFOLD) {
 910         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 911             local $_;
 912             while (<$CASEFOLDFH>) {
 913                 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 914                     my $code = hex($1);
 915                     $CASEFOLD{$code}{'code'} = $1;
 916                     $CASEFOLD{$code}{'turkic'} = "" unless
 917                                             defined $CASEFOLD{$code}{'turkic'};
 918                     if ($2 eq 'C' || $2 eq 'I') {       # 'I' is only on 3.1 and
 919                                                         # earlier Unicodes
 920                                                         # Both entries there (I
 921                                                         # only checked 3.1) are
 922                                                         # the same as C, and
 923                                                         # there are no other
 924                                                         # entries for those
 925                                                         # codepoints, so treat
 926                                                         # as if C, but override
 927                                                         # the turkic one for
 928                                                         # 'I'.
 929                         $CASEFOLD{$code}{'status'} = $2;
 930                         $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
 931                         $CASEFOLD{$code}{'mapping'} = $3;
 932                         $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
 933                     } elsif ($2 eq 'F') {
 934                         $CASEFOLD{$code}{'full'} = $3;
 935                         unless (defined $CASEFOLD{$code}{'simple'}) {
 936                                 $CASEFOLD{$code}{'simple'} = "";
 937                                 $CASEFOLD{$code}{'mapping'} = $3;
 938                                 $CASEFOLD{$code}{'status'} = $2;
 939                         }
 940                     } elsif ($2 eq 'S') {
 941
 942
 943                         # There can't be a simple without a full, and simple
 944                         # overrides all but full
 945
 946                         $CASEFOLD{$code}{'simple'} = $3;
 947                         $CASEFOLD{$code}{'mapping'} = $3;
 948                         $CASEFOLD{$code}{'status'} = $2;
 949                     } elsif ($2 eq 'T') {
 950                         $CASEFOLD{$code}{'turkic'} = $3;
 951                     } # else can't happen because only [CIFST] are possible
 952                 }
 953             }
 954             close($CASEFOLDFH);
 955         }
 956     }
 957 }
 958
 959 sub casefold {
 960     my $arg  = shift;
 961     my $code = _getcode($arg);
 962     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 963         unless defined $code;
 964
 965     _casefold() unless %CASEFOLD;
 966
 967     return $CASEFOLD{$code};
 968 }
 969
 970 =head2 B<casespec()>
 971
 972     use Unicode::UCD 'casespec';
 973
 974     my $casespec = casespec(0xFB00);
 975
 976 This returns the potentially locale-dependent case mappings of the L</code point
 977 argument>.  The mappings may be longer than a single code point (which the basic
 978 Unicode case mappings as returned by L</charinfo()> never are).
 979
 980 If there are no case mappings for the L</code point argument>, or if all three
 981 possible mappings (I<lower>, I<title> and I<upper>) result in single code
 982 points and are locale independent and unconditional, B<undef> is returned
 983 (which means that the case mappings, if any, for the code point are those
 984 returned by L</charinfo()>).
 985
Otherwise, a reference to a hash giving the mappings (or a reference to a hash
of such hashes, explained below) is returned.

The keys in the bottom layer hash, with the meanings of their values, are:
 991
 992 =over
 993
 994 =item B<code>
 995
 996 the input L</code point argument> expressed in hexadecimal, with leading zeros
 997 added if necessary to make it contain at least four hexdigits
 998
 999 =item B<lower>
1000
1001 one or more codes (separated by spaces) that taken in order give the
1002 code points for the lower case of I<code>.
1003 Each has at least four hexdigits.
1004
1005 =item B<title>
1006
1007 one or more codes (separated by spaces) that taken in order give the
1008 code points for the title case of I<code>.
1009 Each has at least four hexdigits.
1010
1011 =item B<upper>
1012
1013 one or more codes (separated by spaces) that taken in order give the
1014 code points for the upper case of I<code>.
1015 Each has at least four hexdigits.
1016
1017 =item B<condition>
1018
1019 the conditions for the mappings to be valid.
1020 If B<undef>, the mappings are always valid.
1021 When defined, this field is a list of conditions,
1022 all of which must be true for the mappings to be valid.
1023 The list consists of one or more
1024 I<locales> (see below)
1025 and/or I<contexts> (explained in the next paragraph),
1026 separated by spaces.
1027 (Other than as used to separate elements, spaces are to be ignored.)
1028 Case distinctions in the condition list are not significant.
1029 Conditions preceded by "NON_" represent the negation of the condition.
1030
1031 A I<context> is one of those defined in the Unicode standard.
1032 For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1033 available at
1034 L<http://www.unicode.org/versions/Unicode5.1.0/>.
1035 These are for context-sensitive casing.
1036
1037 =back
1038
1039 The hash described above is returned for locale-independent casing, where
1040 at least one of the mappings has length longer than one.  If B<undef> is
1041 returned, the code point may have mappings, but if so, all are length one,
1042 and are returned by L</charinfo()>.
1043 Note that when this function does return a value, it will be for the complete
1044 set of mappings for a code point, even those whose length is one.
1045
1046 If there are additional casing rules that apply only in certain locales,
1047 an additional key for each will be defined in the returned hash.  Each such key
1048 will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1049 followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
and a variant code).  For the lists of all possible locales, see
L<Locale::Country> and L<Locale::Language>.
1052 (In Unicode 6.0, the only locales returned by this function
1053 are C<lt>, C<tr>, and C<az>.)
1054
1055 Each locale key is a reference to a hash that has the form above, and gives
1056 the casing rules for that particular locale, which take precedence over the
1057 locale-independent ones when in that locale.
1058
1059 If the only casing for a code point is locale-dependent, then the returned
1060 hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1061 will contain only locale keys.
1062
1063 For more information about case mappings see
1064 L<http://www.unicode.org/unicode/reports/tr21/>
1065
1066 =cut
1067
# Cache of the parsed SpecialCasing.txt data, keyed by numeric code point.
# A value is a hash holding the base keys 'code', 'lower', 'title', 'upper'
# and 'condition', and/or keys that are locale names whose values are hashes
# of that same base form.  Empty until _casespec() has filled it.
my %CASESPEC;

# _casespec()
#
# Fills %CASESPEC from SpecialCasing.txt.  Does nothing if the cache is
# already populated or if openunicode() reports that the file could not be
# opened.  Takes no arguments; the return value is not meaningful.
sub _casespec {
    return if %CASESPEC;
    return unless openunicode(\$CASESPECFH, "SpecialCasing.txt");

    local $_;

    # The file is parsed one line at a time, so don't let a caller's setting
    # of the input record separator (slurp mode, paragraph mode, ...) change
    # what <> considers to be a line.
    local $/ = "\n";

    while (<$CASESPECFH>) {
        next unless /^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/;

        my ($hexcode, $lower, $title, $upper, $condition) =
            ($1, $2, $3, $4, $5);
        my $code = hex($hexcode);

        if (exists $CASESPEC{$code}) {

            # This code point has been seen before.  If what is stored is
            # still a lone base entry that itself was conditional, push it
            # down a level, under the locale that starts its condition, so
            # that the top level ends up holding only locale keys.
            if (exists $CASESPEC{$code}->{code}) {
                my ($oldlower,
                    $oldtitle,
                    $oldupper,
                    $oldcondition) =
                        @{$CASESPEC{$code}}{qw(lower
                                               title
                                               upper
                                               condition)};
                if (defined $oldcondition) {

                    # NOTE(review): if the earlier condition doesn't start
                    # with a lower case locale name (i.e., it is purely a
                    # context), $oldlocale is undef here -- confirm that no
                    # supported SpecialCasing.txt has such a pair of lines.
                    my ($oldlocale) =
                        ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
                    delete $CASESPEC{$code};
                    $CASESPEC{$code}->{$oldlocale} =
                        { code      => $hexcode,
                          lower     => $oldlower,
                          title     => $oldtitle,
                          upper     => $oldupper,
                          condition => $oldcondition };
                }
            }

            # The new line is filed under the locale that starts its
            # condition.
            my ($locale) =
                ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
            $CASESPEC{$code}->{$locale} =
                { code      => $hexcode,
                  lower     => $lower,
                  title     => $title,
                  upper     => $upper,
                  condition => $condition };
        } else {

            # First line for this code point: store it as the base entry.
            $CASESPEC{$code} =
                { code      => $hexcode,
                  lower     => $lower,
                  title     => $title,
                  upper     => $upper,
                  condition => $condition };
        }
    }
    close($CASESPECFH);

    return;
}
1123
# casespec($arg)
#
# Looks up the special casing data for the code point given by $arg.  A found
# entry is returned as a deep copy, so the caller cannot alter the cache;
# when there is no entry, the (undefined) lookup result is returned as is.
# Croaks if _getcode() cannot turn $arg into a code point.
sub casespec {
    my ($arg) = @_;

    my $code = _getcode($arg);
    if (! defined $code) {
        croak __PACKAGE__, "::casespec: unknown code '$arg'";
    }

    # Read in the data on first use.
    if (! %CASESPEC) {
        _casespec();
    }

    my $spec = $CASESPEC{$code};
    return $spec unless ref $spec;
    return dclone $spec;
}
1134
1135 =head2 B<namedseq()>
1136
1137     use Unicode::UCD 'namedseq';
1138
1139     my $namedseq = namedseq("KATAKANA LETTER AINU P");
1140     my @namedseq = namedseq("KATAKANA LETTER AINU P");
1141     my %namedseq = namedseq();
1142
1143 If used with a single argument in a scalar context, returns the string
1144 consisting of the code points of the named sequence, or B<undef> if no
1145 named sequence by that name exists.  If used with a single argument in
1146 a list context, it returns the list of the ordinals of the code points.  If used
1147 with no
1148 arguments in a list context, returns a hash with the names of the
1149 named sequences as the keys and the named sequences as strings as
1150 the values.  Otherwise, it returns B<undef> or an empty list depending
1151 on the context.
1152
1153 This function only operates on officially approved (not provisional) named
1154 sequences.
1155
1156 Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1157 sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1158 LETTER AINU P")> will return the same string this function does, but will also
1159 operate on character names that aren't named sequences, without you having to
1160 know which are which.  See L<charnames>.
1161
1162 =cut
1163
# Cache of the named sequences read from Name.pl: sequence name => string
# made of the sequence's characters.  Empty until _namedseq() has filled it.
my %NAMEDSEQ;

# _namedseq()
#
# Fills %NAMEDSEQ from Name.pl.  Does nothing if the cache is already
# populated or if openunicode() reports that the file could not be opened.
# Takes no arguments; the return value is not meaningful.
sub _namedseq {
    return if %NAMEDSEQ;
    return unless openunicode(\$NAMEDSEQFH, "Name.pl");

    local $_;

    # Both the line-at-a-time read and the chomp below depend on the input
    # record separator, so don't let a caller's setting of it leak in here.
    local $/ = "\n";

    while (<$NAMEDSEQFH>) {

        # Only lines that begin with a hex number followed by a blank are
        # used, i.e. ones whose first field holds more than one code point.
        next unless /^ [0-9A-F]+ \  /x;

        chomp;
        my ($sequence, $name) = split /\t/;
        $NAMEDSEQ{$name} =
            join("", map { chr(hex($_)) } split(' ', $sequence));
    }
    close($NAMEDSEQFH);

    return;
}
1182
sub namedseq {

    # A single name is normally resolved through
    # charnames::string_vianame().  Once a caller has asked for the complete
    # hash, though, the table has been read in, and from then on lookups are
    # served from it instead, as that is faster.

    my $context = wantarray();

    # Void context: there is nothing to hand back.
    return unless defined $context;

    if (! $context) {

        # Scalar context: only the single-name form is meaningful.
        return unless @_ == 1;
        return %NAMEDSEQ
               ? $NAMEDSEQ{ $_[0] }
               : charnames::string_vianame($_[0]);
    }

    # List context from here on.
    if (@_ == 0) {

        # No name given: the caller wants the whole table.
        _namedseq() unless %NAMEDSEQ;
        return %NAMEDSEQ;
    }

    return unless @_ == 1;

    my $string = %NAMEDSEQ
                 ? $NAMEDSEQ{ $_[0] }
                 : charnames::string_vianame($_[0]);
    return unless defined $string;

    # A single name in list context yields the ordinals of the characters.
    return map { ord($_) } split('', $string);
}
1212
# Maps a code point's ordinal to its numeric value.  Empty until _numeric()
# has filled it.
my %NUMERIC;

# _numeric()
#
# Fills %NUMERIC from the table read from unicore/To/Nv.pl.  Croaks if the
# Unicode version in use is earlier than 6.0.  Returns nothing.
sub _numeric {

    # Unicode 6.0 instituted the rule that only digits in a consecutive
    # block of 10 would be considered decimal digits.  Before that, the only
    # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
    # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
    # ONE.  The code could be modified to handle that, but not bothering, as
    # in TUS 6.0, U+19DA was changed to Nt=Di.
    my $packed_version = pack "C*", split /\./, UnicodeVersion();
    croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
        if $packed_version lt 6.0.0;

    my @ranges = _read_table("unicore/To/Nv.pl");
    foreach my $range (@ranges) {
        my ($first, $last, $value) = @$range;

        # A value of the form "numerator/denominator" is stored as the
        # result of doing the division.
        my @fraction = split /\//, $value;
        $value = $fraction[0] / $fraction[1] if @fraction == 2;

        # Every code point in the range gets the same value.
        $NUMERIC{$_} = $value for $first .. $last;
    }

    # Decided unsafe to use these that aren't officially part of the Unicode
    # standard.
    #use Math::Trig;
    #my $pi = acos(-1.0);
    #$NUMERIC{0x03C0} = $pi;

    # Euler's constant, not to be confused with Euler's number
    #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;

    # Euler's number
    #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;

    return;
}
1255
1256 =pod
1257
1258 =head2 num
1259
1260 C<num> returns the numeric value of the input Unicode string; or C<undef> if it
1261 doesn't think the entire string has a completely valid, safe numeric value.
1262
1263 If the string is just one character in length, the Unicode numeric value
1264 is returned if it has one, or C<undef> otherwise.  Note that this need
not be a whole number.  C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
example, returns -0.5.
1267
1268 =cut
1269
1270 #A few characters to which Unicode doesn't officially
1271 #assign a numeric value are considered numeric by C<num>.
1272 #These are:
1273
1274 # EULER CONSTANT             0.5772...  (this is NOT Euler's number)
1275 # SCRIPT SMALL E             2.71828... (this IS Euler's number)
1276 # GREEK SMALL LETTER PI      3.14159...
1277
1278 =pod
1279
1280 If the string is more than one character, C<undef> is returned unless
1281 all its characters are decimal digits (that is they would match C<\d+>),
1282 from the same script.  For example if you have an ASCII '0' and a Bengali
1283 '3', mixed together, they aren't considered a valid number, and C<undef>
1284 is returned.  A further restriction is that the digits all have to be of
1285 the same form.  A half-width digit mixed with a full-width one will
1286 return C<undef>.  The Arabic script has two sets of digits;  C<num> will
1287 return C<undef> unless all the digits in the string come from the same
1288 set.
1289
1290 C<num> errs on the side of safety, and there may be valid strings of
1291 decimal digits that it doesn't recognize.  Note that Unicode defines
1292 a number of "digit" characters that aren't "decimal digit" characters.
1293 "Decimal digits" have the property that they have a positional value, i.e.,
1294 there is a units position, a 10's position, a 100's, etc, AND they are
1295 arranged in Unicode in blocks of 10 contiguous code points.  The Chinese
1296 digits, for example, are not in such a contiguous block, and so Unicode
1297 doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1298 match them.  A single-character string containing one of these digits will
1299 have its decimal value returned by C<num>, but any longer string containing
1300 only these digits will return C<undef>.
1301
1302 Strings of multiple sub- and superscripts are not recognized as numbers.  You
1303 can use either of the compatibility decompositions in Unicode::Normalize to
1304 change these into digits, and then call C<num> on the result.
1305
1306 =cut
1307
1308 # To handle sub, superscripts, this could if called in list context,
1309 # consider those, and return the <decomposition> type in the second
1310 # array element.
1311
# num($string)
#
# Returns the numeric value of $string, or nothing (undef in scalar context)
# when the string is undefined, empty, or not accepted as a number:
#   * a one-character string yields whatever %NUMERIC holds for that
#     character (undef if it has no entry);
#   * a longer string must consist solely of characters matching \d, all
#     lying in the same run of ten consecutive code points as the first one.
sub num {
    my $string = $_[0];

    _numeric() unless %NUMERIC;

    # An undefined or empty string has no value.  Without this check such
    # input would fall through to the multi-digit code below and do
    # arithmetic on undefined values.
    return unless defined $string;
    my $length = length($string);
    return if $length == 0;

    return $NUMERIC{ord($string)} if $length == 1;
    return if $string =~ /\D/;

    my $first_ord = ord(substr($string, 0, 1));
    my $value = $NUMERIC{$first_ord};

    # Err on the side of safety: with no known value for the leading digit
    # the offsets computed below would be meaningless.
    return unless defined $value;

    # The leading digit's value is its offset from the character that stands
    # for zero in its run of ten digits, which locates that zero.
    my $zero_ord = $first_ord - $value;

    for my $i (1 .. $length - 1) {
        my $ord = ord(substr($string, $i, 1));

        # Each further digit has to come from that same run of ten.
        my $digit = $ord - $zero_ord;
        return unless $digit >= 0 && $digit <= 9;
        $value = $value * 10 + $digit;
    }
    return $value;
}
1332
1333
1334
1335 =head2 Unicode::UCD::UnicodeVersion
1336
1337 This returns the version of the Unicode Character Database, in other words, the
1338 version of the Unicode standard the database implements.  The version is a
1339 string of numbers delimited by dots (C<'.'>).
1340
1341 =cut
1342
# The validated version string; undefined until UnicodeVersion() has
# successfully read it.
my $UNICODEVERSION;

# UnicodeVersion()
#
# Returns the version string read from the "version" file, reading it on the
# first call and caching it thereafter.  Croaks if the first line of that
# file is not a string of numbers delimited by dots.
sub UnicodeVersion {
    return $UNICODEVERSION if defined $UNICODEVERSION;

    # NOTE(review): the result of openunicode() is not checked here;
    # presumably it croaks by itself when the file can't be found -- confirm.
    openunicode(\$VERSIONFH, "version");

    # Both the read and the chomp depend on the input record separator, so
    # don't let a caller's setting of it leak in here.
    local $/ = "\n";

    my $version = <$VERSIONFH>;
    close($VERSIONFH);

    # An empty file is reported as a strange (empty) version.
    $version = "" unless defined $version;
    chomp($version);

    croak __PACKAGE__, "::VERSION: strange version '$version'"
        unless $version =~ /^\d+(?:\.\d+)+$/;

    # Cache the value only once it is known to be well formed, so that a bad
    # file makes every call croak, not merely the first one.
    $UNICODEVERSION = $version;

    return $UNICODEVERSION;
}
1355
1356 =head2 B<Blocks versus Scripts>
1357
The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of code points required to represent
languages, while a block is more of an artifact of the Unicode code point
numbering and separation into blocks of (mostly) 256 code points.
1362
1363 For example the Latin B<script> is spread over several B<blocks>, such
1364 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1365 C<Latin Extended-B>.  On the other hand, the Latin script does not
1366 contain all the characters of the C<Basic Latin> block (also known as
1367 ASCII): it includes only the letters, and not, for example, the digits
1368 or the punctuation.
1369
1370 For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1371
1372 For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1373
1374 =head2 B<Matching Scripts and Blocks>
1375
1376 Scripts are matched with the regular-expression construct
1377 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1378 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1379 any of the 256 code points in the Tibetan block).
1380
1381
1382 =head2 Implementation Note
1383
1384 The first use of charinfo() opens a read-only filehandle to the Unicode
1385 Character Database (the database is included in the Perl distribution).
1386 The filehandle is then kept open for further queries.  In other words,
1387 if you are wondering where one of your filehandles went, that's where.
1388
1389 =head1 BUGS
1390
1391 Does not yet support EBCDIC platforms.
1392
1393 =head1 AUTHOR
1394
1395 Jarkko Hietaniemi
1396
1397 =cut
1398
1399 1;