lib/Unicode/UCD.pm

   1 package Unicode::UCD;
   2
   3 use strict;
   4 use warnings;
   5 use charnames ();
   6
   7 our $VERSION = '0.30';
   8
   9 use Storable qw(dclone);
  10
  11 require Exporter;
  12
  13 our @ISA = qw(Exporter);
  14
  15 our @EXPORT_OK = qw(charinfo
  16                     charblock charscript
  17                     charblocks charscripts
  18                     charinrange
  19                     general_categories bidi_types
  20                     compexcl
  21                     casefold casespec
  22                     namedseq);
  23
  24 use Carp;
  25
  26 =head1 NAME
  27
  28 Unicode::UCD - Unicode character database
  29
  30 =head1 SYNOPSIS
  31
  32     use Unicode::UCD 'charinfo';
  33     my $charinfo   = charinfo($codepoint);
  34
  35     use Unicode::UCD 'casefold';
  36     my $casefold = casefold(0xFB00);
  37
  38     use Unicode::UCD 'casespec';
  39     my $casespec = casespec(0xFB00);
  40
  41     use Unicode::UCD 'charblock';
  42     my $charblock  = charblock($codepoint);
  43
  44     use Unicode::UCD 'charscript';
  45     my $charscript = charscript($codepoint);
  46
  47     use Unicode::UCD 'charblocks';
  48     my $charblocks = charblocks();
  49
  50     use Unicode::UCD 'charscripts';
  51     my $charscripts = charscripts();
  52
  53     use Unicode::UCD qw(charscript charinrange);
  54     my $range = charscript($script);
  55     print "looks like $script\n" if charinrange($range, $codepoint);
  56
  57     use Unicode::UCD qw(general_categories bidi_types);
  58     my $categories = general_categories();
  59     my $types = bidi_types();
  60
  61     use Unicode::UCD 'compexcl';
  62     my $compexcl = compexcl($codepoint);
  63
  64     use Unicode::UCD 'namedseq';
  65     my $namedseq = namedseq($named_sequence_name);
  66
  67     my $unicode_version = Unicode::UCD::UnicodeVersion();
  68
  69 =head1 DESCRIPTION
  70
  71 The Unicode::UCD module offers a series of functions that
  72 provide a simple interface to the Unicode
  73 Character Database.
  74
  75 =head2 code point argument
  76
  77 Some of the functions are called with a I<code point argument>, which is either
  78 a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
  79 followed by hexadecimals designating a Unicode code point.  In other words, if
  80 you want a code point to be interpreted as a hexadecimal number, you must
  81 prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
  82 interpreted as a decimal code point.  Also note that Unicode is B<not> limited
  83 to 16 bits (the number of Unicode code points is open-ended, in theory
  84 unlimited): you may have more than 4 hexdigits.
  85 =cut
  86
  87 my $UNICODEFH;
  88 my $BLOCKSFH;
  89 my $SCRIPTSFH;
  90 my $VERSIONFH;
  91 my $COMPEXCLFH;
  92 my $CASEFOLDFH;
  93 my $CASESPECFH;
  94 my $NAMEDSEQFH;
  95
  96 sub openunicode {
  97     my ($rfh, @path) = @_;
  98     my $f;
  99     unless (defined $$rfh) {
 100         for my $d (@INC) {
 101             use File::Spec;
 102             $f = File::Spec->catfile($d, "unicore", @path);
 103             last if open($$rfh, $f);
 104             undef $f;
 105         }
 106         croak __PACKAGE__, ": failed to find ",
 107               File::Spec->catfile(@path), " in @INC"
 108             unless defined $f;
 109     }
 110     return $f;
 111 }
 112
 113 =head2 B<charinfo()>
 114
 115     use Unicode::UCD 'charinfo';
 116
 117     my $charinfo = charinfo(0x41);
 118
 119 This returns information about the input L</code point argument>
 120 as a reference to a hash of fields as defined by the Unicode
 121 standard.  If the L</code point argument> is not assigned in the standard
 122 (i.e., has the general category C<Cn> meaning C<Unassigned>)
 123 or is a non-character (meaning it is guaranteed to never be assigned in
 124 the standard),
 125 B<undef> is returned.
 126
 127 Fields that aren't applicable to the particular code point argument exist in the
 128 returned hash, and are empty.
 129
 130 The keys in the hash with the meanings of their values are:
 131
 132 =over
 133
 134 =item B<code>
 135
 136 the input L</code point argument> expressed in hexadecimal, with leading zeros
 137 added if necessary to make it contain at least four hexdigits
 138
 139 =item B<name>
 140
 141 name of I<code>, all IN UPPER CASE.
 142 Some control-type code points do not have names.
 143 This field will be empty for C<Surrogate> and C<Private Use> code points,
 144 and for the others without a name,
 145 it will contain a description enclosed in angle brackets, like
 146 C<E<lt>controlE<gt>>.
 147
 148
 149 =item B<category>
 150
 151 The short name of the general category of I<code>.
 152 This will match one of the keys in the hash returned by L</general_categories()>.
 153
 154 =item B<combining>
 155
 156 the combining class number for I<code> used in the Canonical Ordering Algorithm.
 157 For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
 158 available at
 159 L<http://www.unicode.org/versions/Unicode5.1.0/>
 160
 161 =item B<bidi>
 162
 163 bidirectional type of I<code>.
 164 This will match one of the keys in the hash returned by L</bidi_types()>.
 165
 166 =item B<decomposition>
 167
 168 is empty if I<code> has no decomposition; or is one or more codes
 169 (separated by spaces) that taken in order represent a decomposition for
 170 I<code>.  Each has at least four hexdigits.
 171 The codes may be preceded by a word enclosed in angle brackets then a space,
 172 like C<E<lt>compatE<gt> >, giving the type of decomposition
 173
 174 =item B<decimal>
 175
 176 if I<code> is a decimal digit this is its integer numeric value
 177
 178 =item B<digit>
 179
 180 if I<code> represents a whole number, this is its integer numeric value
 181
 182 =item B<numeric>
 183
 184 if I<code> represents a whole or rational number, this is its numeric value.
 185 Rational values are expressed as a string like C<1/4>.
 186
 187 =item B<mirrored>
 188
 189 C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
 190
 191 =item B<unicode10>
 192
 193 name of I<code> in the Unicode 1.0 standard if one
 194 existed for this code point and is different from the current name
 195
 196 =item B<comment>
 197
 198 ISO 10646 comment field.
 199 It appears in parentheses in the ISO 10646 names list,
 200 or contains an asterisk to indicate there is
 201 a note for this code point in Annex P of that standard.
 202
 203 =item B<upper>
 204
 205 is empty if there is no single code point uppercase mapping for I<code>;
 206 otherwise it is that mapping expressed as at least four hexdigits.
 207 (L</casespec()> should be used in addition to B<charinfo()>
 208 for case mappings when the calling program can cope with multiple code point
 209 mappings.)
 210
 211 =item B<lower>
 212
 213 is empty if there is no single code point lowercase mapping for I<code>;
 214 otherwise it is that mapping expressed as at least four hexdigits.
 215 (L</casespec()> should be used in addition to B<charinfo()>
 216 for case mappings when the calling program can cope with multiple code point
 217 mappings.)
 218
 219 =item B<title>
 220
 221 is empty if there is no single code point titlecase mapping for I<code>;
 222 otherwise it is that mapping expressed as at least four hexdigits.
 223 (L</casespec()> should be used in addition to B<charinfo()>
 224 for case mappings when the calling program can cope with multiple code point
 225 mappings.)
 226
 227 =item B<block>
 228
 229 block I<code> belongs to (used in \p{In...}).
 230 See L</Blocks versus Scripts>.
 231
 232
 233 =item B<script>
 234
 235 script I<code> belongs to.
 236 See L</Blocks versus Scripts>.
 237
 238 =back
 239
 240 Note that you cannot do (de)composition and casing based solely on the
 241 I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
 242 you will need also the L</compexcl()>, and L</casespec()> functions.
 243
 244 =cut
 245
 246 # NB: This function is nearly duplicated in charnames.pm
 247 sub _getcode {
 248     my $arg = shift;
 249
 250     if ($arg =~ /^[1-9]\d*$/) {
 251         return $arg;
 252     } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
 253         return hex($1);
 254     }
 255
 256     return;
 257 }
 258
 259 # Lingua::KO::Hangul::Util not part of the standard distribution
 260 # but it will be used if available.
 261
 262 eval { require Lingua::KO::Hangul::Util };
 263 my $hasHangulUtil = ! $@;
 264 if ($hasHangulUtil) {
 265     Lingua::KO::Hangul::Util->import();
 266 }
 267
 268 sub hangul_decomp { # internal: called from charinfo
 269     if ($hasHangulUtil) {
 270         my @tmp = decomposeHangul(shift);
 271         return sprintf("%04X %04X",      @tmp) if @tmp == 2;
 272         return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
 273     }
 274     return;
 275 }
 276
 277 sub hangul_charname { # internal: called from charinfo
 278     return sprintf("HANGUL SYLLABLE-%04X", shift);
 279 }
 280
 281 sub han_charname { # internal: called from charinfo
 282     return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
 283 }
 284
 285 # Overwritten by data in file
 286 my %first_last = (
 287    'CJK Ideograph Extension A' => [ 0x3400,   0x4DB5   ],
 288    'CJK Ideograph'             => [ 0x4E00,   0x9FA5   ],
 289    'CJK Ideograph Extension B' => [ 0x20000,  0x2A6D6  ],
 290 );
 291
 292 get_charinfo_ranges();
 293
 294 sub get_charinfo_ranges {
 295    my @blocks = keys %first_last;
 296
 297    my $fh;
 298    openunicode( \$fh, 'UnicodeData.txt' );
 299    if( defined $fh ){
 300       while( my $line = <$fh> ){
 301          next unless $line =~ /(?:First|Last)/;
 302          if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
 303             my ($number,$block,$type);
 304             ($number,$block) = split /;/, $line;
 305             $block =~ s/<|>//g;
 306             ($block,$type) = split /, /, $block;
 307             my $index = $type eq 'First' ? 0 : 1;
 308             $first_last{ $block }->[$index] = hex $number;
 309          }
 310       }
 311    }
 312 }
 313
 314 my @CharinfoRanges = (
 315 # block name
 316 # [ first, last, coderef to name, coderef to decompose ],
 317 # CJK Ideographs Extension A
 318   [ @{ $first_last{'CJK Ideograph Extension A'} },        \&han_charname,   undef  ],
 319 # CJK Ideographs
 320   [ @{ $first_last{'CJK Ideograph'} },                    \&han_charname,   undef  ],
 321 # Hangul Syllables
 322   [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
 323 # Non-Private Use High Surrogates
 324   [ 0xD800,   0xDB7F,   undef,   undef  ],
 325 # Private Use High Surrogates
 326   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 327 # Low Surrogates
 328   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 329 # The Private Use Area
 330   [ 0xE000,   0xF8FF,   undef,   undef  ],
 331 # CJK Ideographs Extension B
 332   [ @{ $first_last{'CJK Ideograph Extension B'} },        \&han_charname,   undef  ],
 333 # Plane 15 Private Use Area
 334   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 335 # Plane 16 Private Use Area
 336   [ 0x100000, 0x10FFFD, undef,   undef  ],
 337 );
 338
 339 sub charinfo {
 340     my $arg  = shift;
 341     my $code = _getcode($arg);
 342     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 343         unless defined $code;
 344     my $hexk = sprintf("%06X", $code);
 345     my($rcode,$rname,$rdec);
 346     foreach my $range (@CharinfoRanges){
 347       if ($range->[0] <= $code && $code <= $range->[1]) {
 348         $rcode = $hexk;
 349         $rcode =~ s/^0+//;
 350         $rcode =  sprintf("%04X", hex($rcode));
 351         $rname = $range->[2] ? $range->[2]->($code) : '';
 352         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 353         $hexk  = sprintf("%06X", $range->[0]); # replace by the first
 354         last;
 355       }
 356     }
 357     openunicode(\$UNICODEFH, "UnicodeData.txt");
 358     if (defined $UNICODEFH) {
 359         use Search::Dict 1.02;
 360         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
 361             my $line = <$UNICODEFH>;
 362             return unless defined $line;
 363             chomp $line;
 364             my %prop;
 365             @prop{qw(
 366                      code name category
 367                      combining bidi decomposition
 368                      decimal digit numeric
 369                      mirrored unicode10 comment
 370                      upper lower title
 371                     )} = split(/;/, $line, -1);
 372             $hexk =~ s/^0+//;
 373             $hexk =  sprintf("%04X", hex($hexk));
 374             if ($prop{code} eq $hexk) {
 375                 $prop{block}  = charblock($code);
 376                 $prop{script} = charscript($code);
 377                 if(defined $rname){
 378                     $prop{code} = $rcode;
 379                     $prop{name} = $rname;
 380                     $prop{decomposition} = $rdec;
 381                 }
 382                 return \%prop;
 383             }
 384         }
 385     }
 386     return;
 387 }
 388
 389 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 390     my ($table, $lo, $hi, $code) = @_;
 391
 392     return if $lo > $hi;
 393
 394     my $mid = int(($lo+$hi) / 2);
 395
 396     if ($table->[$mid]->[0] < $code) {
 397         if ($table->[$mid]->[1] >= $code) {
 398             return $table->[$mid]->[2];
 399         } else {
 400             _search($table, $mid + 1, $hi, $code);
 401         }
 402     } elsif ($table->[$mid]->[0] > $code) {
 403         _search($table, $lo, $mid - 1, $code);
 404     } else {
 405         return $table->[$mid]->[2];
 406     }
 407 }
 408
 409 sub charinrange {
 410     my ($range, $arg) = @_;
 411     my $code = _getcode($arg);
 412     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 413         unless defined $code;
 414     _search($range, 0, $#$range, $code);
 415 }
 416
 417 =head2 B<charblock()>
 418
 419     use Unicode::UCD 'charblock';
 420
 421     my $charblock = charblock(0x41);
 422     my $charblock = charblock(1234);
 423     my $charblock = charblock(0x263a);
 424     my $charblock = charblock("U+263a");
 425
 426     my $range     = charblock('Armenian');
 427
 428 With a L</code point argument> charblock() returns the I<block> the code point
 429 belongs to, e.g.  C<Basic Latin>.
 430 If the code point is unassigned, this returns the block it would belong to if
 431 it were assigned (which it may in future versions of the Unicode Standard).
 432
 433 See also L</Blocks versus Scripts>.
 434
 435 If supplied with an argument that can't be a code point, charblock() tries
 436 to do the opposite and interpret the argument as a code point block. The
 437 return value is a I<range>: an anonymous list of lists that contain
 438 I<start-of-range>, I<end-of-range> code point pairs. You can test whether
 439 a code point is in a range using the L</charinrange()> function. If the
 440 argument is not a known code point block, B<undef> is returned.
 441
 442 =cut
 443
 444 my @BLOCKS;
 445 my %BLOCKS;
 446
 447 sub _charblocks {
 448     unless (@BLOCKS) {
 449         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 450             local $_;
 451             while (<$BLOCKSFH>) {
 452                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 453                     my ($lo, $hi) = (hex($1), hex($2));
 454                     my $subrange = [ $lo, $hi, $3 ];
 455                     push @BLOCKS, $subrange;
 456                     push @{$BLOCKS{$3}}, $subrange;
 457                 }
 458             }
 459             close($BLOCKSFH);
 460         }
 461     }
 462 }
 463
 464 sub charblock {
 465     my $arg = shift;
 466
 467     _charblocks() unless @BLOCKS;
 468
 469     my $code = _getcode($arg);
 470
 471     if (defined $code) {
 472         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 473     } else {
 474         if (exists $BLOCKS{$arg}) {
 475             return dclone $BLOCKS{$arg};
 476         } else {
 477             return;
 478         }
 479     }
 480 }
 481
 482 =head2 B<charscript()>
 483
 484     use Unicode::UCD 'charscript';
 485
 486     my $charscript = charscript(0x41);
 487     my $charscript = charscript(1234);
 488     my $charscript = charscript("U+263a");
 489
 490     my $range      = charscript('Thai');
 491
 492 With a L</code point argument> charscript() returns the I<script> the
 493 code point belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 494 If the code point is unassigned, it returns B<undef>
 495
 496 If supplied with an argument that can't be a code point, charscript() tries
 497 to do the opposite and interpret the argument as a code point script. The
 498 return value is a I<range>: an anonymous list of lists that contain
 499 I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
 500 code point is in a range using the L</charinrange()> function. If the
 501 argument is not a known code point script, B<undef> is returned.
 502
 503 See also L</Blocks versus Scripts>.
 504
 505 =cut
 506
 507 my @SCRIPTS;
 508 my %SCRIPTS;
 509
 510 sub _charscripts {
 511     unless (@SCRIPTS) {
 512         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 513             local $_;
 514             while (<$SCRIPTSFH>) {
 515                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 516                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 517                     my $script = lc($3);
 518                     $script =~ s/\b(\w)/uc($1)/ge;
 519                     my $subrange = [ $lo, $hi, $script ];
 520                     push @SCRIPTS, $subrange;
 521                     push @{$SCRIPTS{$script}}, $subrange;
 522                 }
 523             }
 524             close($SCRIPTSFH);
 525             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 526         }
 527     }
 528 }
 529
 530 sub charscript {
 531     my $arg = shift;
 532
 533     _charscripts() unless @SCRIPTS;
 534
 535     my $code = _getcode($arg);
 536
 537     if (defined $code) {
 538         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 539     } else {
 540         if (exists $SCRIPTS{$arg}) {
 541             return dclone $SCRIPTS{$arg};
 542         } else {
 543             return;
 544         }
 545     }
 546 }
 547
 548 =head2 B<charblocks()>
 549
 550     use Unicode::UCD 'charblocks';
 551
 552     my $charblocks = charblocks();
 553
 554 charblocks() returns a reference to a hash with the known block names
 555 as the keys, and the code point ranges (see L</charblock()>) as the values.
 556
 557 See also L</Blocks versus Scripts>.
 558
 559 =cut
 560
 561 sub charblocks {
 562     _charblocks() unless %BLOCKS;
 563     return dclone \%BLOCKS;
 564 }
 565
 566 =head2 B<charscripts()>
 567
 568     use Unicode::UCD 'charscripts';
 569
 570     my $charscripts = charscripts();
 571
 572 charscripts() returns a reference to a hash with the known script
 573 names as the keys, and the code point ranges (see L</charscript()>) as
 574 the values.
 575
 576 See also L</Blocks versus Scripts>.
 577
 578 =cut
 579
 580 sub charscripts {
 581     _charscripts() unless %SCRIPTS;
 582     return dclone \%SCRIPTS;
 583 }
 584
 585 =head2 B<charinrange()>
 586
 587 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 588 can also test whether a code point is in the I<range> as returned by
 589 L</charblock()> and L</charscript()> or as the values of the hash returned
 590 by L</charblocks()> and L</charscripts()> by using charinrange():
 591
 592     use Unicode::UCD qw(charscript charinrange);
 593
 594     $range = charscript('Hiragana');
 595     print "looks like hiragana\n" if charinrange($range, $codepoint);
 596
 597 =cut
 598
 599 my %GENERAL_CATEGORIES =
 600  (
 601     'L'  =>         'Letter',
 602     'LC' =>         'CasedLetter',
 603     'Lu' =>         'UppercaseLetter',
 604     'Ll' =>         'LowercaseLetter',
 605     'Lt' =>         'TitlecaseLetter',
 606     'Lm' =>         'ModifierLetter',
 607     'Lo' =>         'OtherLetter',
 608     'M'  =>         'Mark',
 609     'Mn' =>         'NonspacingMark',
 610     'Mc' =>         'SpacingMark',
 611     'Me' =>         'EnclosingMark',
 612     'N'  =>         'Number',
 613     'Nd' =>         'DecimalNumber',
 614     'Nl' =>         'LetterNumber',
 615     'No' =>         'OtherNumber',
 616     'P'  =>         'Punctuation',
 617     'Pc' =>         'ConnectorPunctuation',
 618     'Pd' =>         'DashPunctuation',
 619     'Ps' =>         'OpenPunctuation',
 620     'Pe' =>         'ClosePunctuation',
 621     'Pi' =>         'InitialPunctuation',
 622     'Pf' =>         'FinalPunctuation',
 623     'Po' =>         'OtherPunctuation',
 624     'S'  =>         'Symbol',
 625     'Sm' =>         'MathSymbol',
 626     'Sc' =>         'CurrencySymbol',
 627     'Sk' =>         'ModifierSymbol',
 628     'So' =>         'OtherSymbol',
 629     'Z'  =>         'Separator',
 630     'Zs' =>         'SpaceSeparator',
 631     'Zl' =>         'LineSeparator',
 632     'Zp' =>         'ParagraphSeparator',
 633     'C'  =>         'Other',
 634     'Cc' =>         'Control',
 635     'Cf' =>         'Format',
 636     'Cs' =>         'Surrogate',
 637     'Co' =>         'PrivateUse',
 638     'Cn' =>         'Unassigned',
 639  );
 640
 641 sub general_categories {
 642     return dclone \%GENERAL_CATEGORIES;
 643 }
 644
 645 =head2 B<general_categories()>
 646
 647     use Unicode::UCD 'general_categories';
 648
 649     my $categories = general_categories();
 650
 651 This returns a reference to a hash which has short
 652 general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
 653 names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
 654 C<Symbol>) as values.  The hash is reversible in case you need to go
 655 from the long names to the short names.  The general category is the
 656 one returned from
 657 L</charinfo()> under the C<category> key.
 658
 659 =cut
 660
 661 my %BIDI_TYPES =
 662  (
 663    'L'   => 'Left-to-Right',
 664    'LRE' => 'Left-to-Right Embedding',
 665    'LRO' => 'Left-to-Right Override',
 666    'R'   => 'Right-to-Left',
 667    'AL'  => 'Right-to-Left Arabic',
 668    'RLE' => 'Right-to-Left Embedding',
 669    'RLO' => 'Right-to-Left Override',
 670    'PDF' => 'Pop Directional Format',
 671    'EN'  => 'European Number',
 672    'ES'  => 'European Number Separator',
 673    'ET'  => 'European Number Terminator',
 674    'AN'  => 'Arabic Number',
 675    'CS'  => 'Common Number Separator',
 676    'NSM' => 'Non-Spacing Mark',
 677    'BN'  => 'Boundary Neutral',
 678    'B'   => 'Paragraph Separator',
 679    'S'   => 'Segment Separator',
 680    'WS'  => 'Whitespace',
 681    'ON'  => 'Other Neutrals',
 682  );
 683
 684 =head2 B<bidi_types()>
 685
 686     use Unicode::UCD 'bidi_types';
 687
 688     my $categories = bidi_types();
 689
 690 This returns a reference to a hash which has the short
 691 bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
 692 names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
 693 hash is reversible in case you need to go from the long names to the
 694 short names.  The bidi type is the one returned from
 695 L</charinfo()>
 696 under the C<bidi> key.  For the exact meaning of the various bidi classes
 697 the Unicode TR9 is recommended reading:
 698 L<http://www.unicode.org/reports/tr9/>
 699 (as of Unicode 5.0.0)
 700
 701 =cut
 702
 703 sub bidi_types {
 704     return dclone \%BIDI_TYPES;
 705 }
 706
 707 =head2 B<compexcl()>
 708
 709     use Unicode::UCD 'compexcl';
 710
 711     my $compexcl = compexcl(0x09dc);
 712
 713 This routine is included for backwards compatibility, but as of Perl 5.12, for
 714 most purposes it is probably more convenient to use one of the following
 715 instead:
 716
 717     my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
 718     my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
 719
 720 or even
 721
 722     my $compexcl = chr(0x09dc) =~ /\p{CE};
 723     my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
 724
 725 The first two forms return B<true> if the L</code point argument> should not
 726 be produced by composition normalization.  The final two forms
 727 additionally require that this fact not otherwise be determinable from
 728 the Unicode data base for them to return B<true>.
 729
 730 This routine behaves identically to the final two forms.  That is,
 731 it does not return B<true> if the code point has a decomposition
 732 consisting of another single code point, nor if its decomposition starts
 733 with a code point whose combining class is non-zero.  Code points that meet
 734 either of these conditions should also not be produced by composition
 735 normalization, which is probably why you should use the
 736 C<Full_Composition_Exclusion> property instead, as shown above.
 737
 738 The routine returns B<false> otherwise.
 739
 740 =cut
 741
 742 sub compexcl {
 743     my $arg  = shift;
 744     my $code = _getcode($arg);
 745     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 746         unless defined $code;
 747
 748     no warnings "utf8";     # So works on surrogates and non-Unicode code points
 749     return chr($code) =~ /\p{Composition_Exclusion}/;
 750 }
 751
 752 =head2 B<casefold()>
 753
 754     use Unicode::UCD 'casefold';
 755
 756     my $casefold = casefold(0xDF);
 757     if (defined $casefold) {
 758         my @full_fold_hex = split / /, $casefold->{'full'};
 759         my $full_fold_string =
 760                     join "", map {chr(hex($_))} @full_fold_hex;
 761         my @turkic_fold_hex =
 762                         split / /, ($casefold->{'turkic'} ne "")
 763                                         ? $casefold->{'turkic'}
 764                                         : $casefold->{'full'};
 765         my $turkic_fold_string =
 766                         join "", map {chr(hex($_))} @turkic_fold_hex;
 767     }
 768     if (defined $casefold && $casefold->{'simple'} ne "") {
 769         my $simple_fold_hex = $casefold->{'simple'};
 770         my $simple_fold_string = chr(hex($simple_fold_hex));
 771     }
 772
 773 This returns the (almost) locale-independent case folding of the
 774 character specified by the L</code point argument>.
 775
 776 If there is no case folding for that code point, B<undef> is returned.
 777
 778 If there is a case folding for that code point, a reference to a hash
 779 with the following fields is returned:
 780
 781 =over
 782
 783 =item B<code>
 784
 785 the input L</code point argument> expressed in hexadecimal, with leading zeros
 786 added if necessary to make it contain at least four hexdigits
 787
 788 =item B<full>
 789
 790 one or more codes (separated by spaces) that taken in order give the
 791 code points for the case folding for I<code>.
 792 Each has at least four hexdigits.
 793
 794 =item B<simple>
 795
 796 is empty, or is exactly one code with at least four hexdigits which can be used
 797 as an alternative case folding when the calling program cannot cope with the
 798 fold being a sequence of multiple code points.  If I<full> is just one code
 799 point, then I<simple> equals I<full>.  If there is no single code point folding
 800 defined for I<code>, then I<simple> is the empty string.  Otherwise, it is an
 801 inferior, but still better-than-nothing alternative folding to I<full>.
 802
 803 =item B<mapping>
 804
 805 is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
 806 otherwise.  It can be considered to be the simplest possible folding for
 807 I<code>.  It is defined primarily for backwards compatibility.
 808
 809 =item B<status>
 810
 811 is C<C> (for C<common>) if the best possible fold is a single code point
 812 (I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
 813 folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
 814 there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).  Note
 815 that this
 816 describes the contents of I<mapping>.  It is defined primarily for backwards
 817 compatibility.
 818
 819 On versions 3.1 and earlier of Unicode, I<status> can also be
 820 C<I> which is the same as C<C> but is a special case for dotted uppercase I and
 821 dotless lowercase i:
 822
 823 =over
 824
 825 =item B<*>
 826
 827 If you use this C<I> mapping, the result is case-insensitive,
 828 but dotless and dotted I's are not distinguished
 829
 830 =item B<*>
 831
 832 If you exclude this C<I> mapping, the result is not fully case-insensitive, but
 833 dotless and dotted I's are distinguished
 834
 835 =back
 836
 837 =item B<turkic>
 838
 839 contains any special folding for Turkic languages.  For versions of Unicode
 840 starting with 3.2, this field is empty unless I<code> has a different folding
 841 in Turkic languages, in which case it is one or more codes (separated by
 842 spaces) that taken in order give the code points for the case folding for
 843 I<code> in those languages.
 844 Each code has at least four hexdigits.
 845 Note that this folding does not maintain canonical equivalence without
 846 additional processing.
 847
 848 For versions of Unicode 3.1 and earlier, this field is empty unless there is a
 849 special folding for Turkic languages, in which case I<status> is C<I>, and
 850 I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
 851
 852 =back
 853
 854 Programs that want complete generality and the best folding results should use
 855 the folding contained in the I<full> field.  But note that the fold for some
 856 code points will be a sequence of multiple code points.
 857
 858 Programs that can't cope with the fold mapping being multiple code points can
 859 use the folding contained in the I<simple> field, with the loss of some
 860 generality.  In Unicode 5.1, about 7% of the defined foldings have no single
 861 code point folding.
 862
 863 The I<mapping> and I<status> fields are provided for backwards compatibility for
 864 existing programs.  They contain the same values as in previous versions of
 865 this function.
 866
 867 Locale is not completely independent.  The I<turkic> field contains results to
 868 use when the locale is a Turkic language.
 869
 870 For more information about case mappings see
 871 L<http://www.unicode.org/unicode/reports/tr21>
 872
 873 =cut
 874
 875 my %CASEFOLD;
 876
 877 sub _casefold {
 878     unless (%CASEFOLD) {
 879         if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
 880             local $_;
 881             while (<$CASEFOLDFH>) {
 882                 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 883                     my $code = hex($1);
 884                     $CASEFOLD{$code}{'code'} = $1;
 885                     $CASEFOLD{$code}{'turkic'} = "" unless
 886                                             defined $CASEFOLD{$code}{'turkic'};
 887                     if ($2 eq 'C' || $2 eq 'I') {       # 'I' is only on 3.1 and
 888                                                         # earlier Unicodes
 889                                                         # Both entries there (I
 890                                                         # only checked 3.1) are
 891                                                         # the same as C, and
 892                                                         # there are no other
 893                                                         # entries for those
 894                                                         # codepoints, so treat
 895                                                         # as if C, but override
 896                                                         # the turkic one for
 897                                                         # 'I'.
 898                         $CASEFOLD{$code}{'status'} = $2;
 899                         $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
 900                         $CASEFOLD{$code}{'mapping'} = $3;
 901                         $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
 902                     } elsif ($2 eq 'F') {
 903                         $CASEFOLD{$code}{'full'} = $3;
 904                         unless (defined $CASEFOLD{$code}{'simple'}) {
 905                                 $CASEFOLD{$code}{'simple'} = "";
 906                                 $CASEFOLD{$code}{'mapping'} = $3;
 907                                 $CASEFOLD{$code}{'status'} = $2;
 908                         }
 909                     } elsif ($2 eq 'S') {
 910
 911
 912                         # There can't be a simple without a full, and simple
 913                         # overrides all but full
 914
 915                         $CASEFOLD{$code}{'simple'} = $3;
 916                         $CASEFOLD{$code}{'mapping'} = $3;
 917                         $CASEFOLD{$code}{'status'} = $2;
 918                     } elsif ($2 eq 'T') {
 919                         $CASEFOLD{$code}{'turkic'} = $3;
 920                     } # else can't happen because only [CIFST] are possible
 921                 }
 922             }
 923             close($CASEFOLDFH);
 924         }
 925     }
 926 }
 927
 928 sub casefold {
 929     my $arg  = shift;
 930     my $code = _getcode($arg);
 931     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 932         unless defined $code;
 933
 934     _casefold() unless %CASEFOLD;
 935
 936     return $CASEFOLD{$code};
 937 }
 938
 939 =head2 B<casespec()>
 940
 941     use Unicode::UCD 'casespec';
 942
 943     my $casespec = casespec(0xFB00);
 944
 945 This returns the potentially locale-dependent case mappings of the L</code point
 946 argument>.  The mappings may be longer than a single code point (which the basic
 947 Unicode case mappings as returned by L</charinfo()> never are).
 948
 949 If there are no case mappings for the L</code point argument>, or if all three
 950 possible mappings (I<lower>, I<title> and I<upper>) result in single code
 951 points and are locale independent and unconditional, B<undef> is returned
 952 (which means that the case mappings, if any, for the code point are those
 953 returned by L</charinfo()>).
 954
 955 Otherwise, a reference to a hash giving the mappings (or a reference to a hash
 956 of such hashes, explained below) is returned with the following keys and their
 957 meanings:
 958
 959 The keys in the bottom layer hash with the meanings of their values are:
 960
 961 =over
 962
 963 =item B<code>
 964
 965 the input L</code point argument> expressed in hexadecimal, with leading zeros
 966 added if necessary to make it contain at least four hexdigits
 967
 968 =item B<lower>
 969
 970 one or more codes (separated by spaces) that taken in order give the
 971 code points for the lower case of I<code>.
 972 Each has at least four hexdigits.
 973
 974 =item B<title>
 975
 976 one or more codes (separated by spaces) that taken in order give the
 977 code points for the title case of I<code>.
 978 Each has at least four hexdigits.
 979
 980 =item B<upper>
 981
 982 one or more codes (separated by spaces) that taken in order give the
 983 code points for the upper case of I<code>.
 984 Each has at least four hexdigits.
 985
 986 =item B<condition>
 987
 988 the conditions for the mappings to be valid.
 989 If B<undef>, the mappings are always valid.
 990 When defined, this field is a list of conditions,
 991 all of which must be true for the mappings to be valid.
 992 The list consists of one or more
 993 I<locales> (see below)
 994 and/or I<contexts> (explained in the next paragraph),
 995 separated by spaces.
 996 (Other than as used to separate elements, spaces are to be ignored.)
 997 Case distinctions in the condition list are not significant.
 998 Conditions preceded by "NON_" represent the negation of the condition.
 999
1000 A I<context> is one of those defined in the Unicode standard.
1001 For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1002 available at
1003 L<http://www.unicode.org/versions/Unicode5.1.0/>.
1004 These are for context-sensitive casing.
1005
1006 =back
1007
1008 The hash described above is returned for locale-independent casing, where
1009 at least one of the mappings has length longer than one.  If B<undef> is
1010 returned, the code point may have mappings, but if so, all are length one,
1011 and are returned by L</charinfo()>.
1012 Note that when this function does return a value, it will be for the complete
1013 set of mappings for a code point, even those whose length is one.
1014
1015 If there are additional casing rules that apply only in certain locales,
1016 an additional key for each will be defined in the returned hash.  Each such key
1017 will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1018 followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1019 and a variant code).  You can find the lists of all possible locales, see
1020 L<Locale::Country> and L<Locale::Language>.
1021 (In Unicode 5.1, the only locales returned by this function
1022 are C<lt>, C<tr>, and C<az>.)
1023
1024 Each locale key is a reference to a hash that has the form above, and gives
1025 the casing rules for that particular locale, which take precedence over the
1026 locale-independent ones when in that locale.
1027
1028 If the only casing for a code point is locale-dependent, then the returned
1029 hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1030 will contain only locale keys.
1031
1032 For more information about case mappings see
1033 L<http://www.unicode.org/unicode/reports/tr21/>
1034
1035 =cut
1036
1037 my %CASESPEC;
1038
1039 sub _casespec {
1040     unless (%CASESPEC) {
1041         if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
1042             local $_;
1043             while (<$CASESPECFH>) {
1044                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
1045                     my ($hexcode, $lower, $title, $upper, $condition) =
1046                         ($1, $2, $3, $4, $5);
1047                     my $code = hex($hexcode);
1048                     if (exists $CASESPEC{$code}) {
1049                         if (exists $CASESPEC{$code}->{code}) {
1050                             my ($oldlower,
1051                                 $oldtitle,
1052                                 $oldupper,
1053                                 $oldcondition) =
1054                                     @{$CASESPEC{$code}}{qw(lower
1055                                                            title
1056                                                            upper
1057                                                            condition)};
1058                             if (defined $oldcondition) {
1059                                 my ($oldlocale) =
1060                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
1061                                 delete $CASESPEC{$code};
1062                                 $CASESPEC{$code}->{$oldlocale} =
1063                                 { code      => $hexcode,
1064                                   lower     => $oldlower,
1065                                   title     => $oldtitle,
1066                                   upper     => $oldupper,
1067                                   condition => $oldcondition };
1068                             }
1069                         }
1070                         my ($locale) =
1071                             ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1072                         $CASESPEC{$code}->{$locale} =
1073                         { code      => $hexcode,
1074                           lower     => $lower,
1075                           title     => $title,
1076                           upper     => $upper,
1077                           condition => $condition };
1078                     } else {
1079                         $CASESPEC{$code} =
1080                         { code      => $hexcode,
1081                           lower     => $lower,
1082                           title     => $title,
1083                           upper     => $upper,
1084                           condition => $condition };
1085                     }
1086                 }
1087             }
1088             close($CASESPECFH);
1089         }
1090     }
1091 }
1092
1093 sub casespec {
1094     my $arg  = shift;
1095     my $code = _getcode($arg);
1096     croak __PACKAGE__, "::casespec: unknown code '$arg'"
1097         unless defined $code;
1098
1099     _casespec() unless %CASESPEC;
1100
1101     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
1102 }
1103
1104 =head2 B<namedseq()>
1105
1106     use Unicode::UCD 'namedseq';
1107
1108     my $namedseq = namedseq("KATAKANA LETTER AINU P");
1109     my @namedseq = namedseq("KATAKANA LETTER AINU P");
1110     my %namedseq = namedseq();
1111
1112 If used with a single argument in a scalar context, returns the string
1113 consisting of the code points of the named sequence, or B<undef> if no
1114 named sequence by that name exists.  If used with a single argument in
1115 a list context, it returns the list of the ordinals of the code points.  If used
1116 with no
1117 arguments in a list context, returns a hash with the names of the
1118 named sequences as the keys and the named sequences as strings as
1119 the values.  Otherwise, it returns B<undef> or an empty list depending
1120 on the context.
1121
1122 This function only operates on officially approved (not provisional) named
1123 sequences.
1124
1125 Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1126 sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1127 LETTER AINU P")> will return the same string this function does, but will also
1128 operate on character names that aren't named sequences, without you having to
1129 know which are which.  See L<charnames>.
1130
1131 =cut
1132
1133 my %NAMEDSEQ;
1134
1135 sub _namedseq {
1136     unless (%NAMEDSEQ) {
1137         if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
1138             local $_;
1139             while (<$NAMEDSEQFH>) {
1140                 if (/^ [0-9A-F]+ \  /x) {
1141                     chomp;
1142                     my ($sequence, $name) = split /\t/;
1143                     my @s = map { chr(hex($_)) } split(' ', $sequence);
1144                     $NAMEDSEQ{$name} = join("", @s);
1145                 }
1146             }
1147             close($NAMEDSEQFH);
1148         }
1149     }
1150 }
1151
1152 sub namedseq {
1153
1154     # Use charnames::string_vianame() which now returns this information,
1155     # unless the caller wants the hash returned, in which case we read it in,
1156     # and thereafter use it instead of calling charnames, as it is faster.
1157
1158     my $wantarray = wantarray();
1159     if (defined $wantarray) {
1160         if ($wantarray) {
1161             if (@_ == 0) {
1162                 _namedseq() unless %NAMEDSEQ;
1163                 return %NAMEDSEQ;
1164             } elsif (@_ == 1) {
1165                 my $s;
1166                 if (%NAMEDSEQ) {
1167                     $s = $NAMEDSEQ{ $_[0] };
1168                 }
1169                 else {
1170                     $s = charnames::string_vianame($_[0]);
1171                 }
1172                 return defined $s ? map { ord($_) } split('', $s) : ();
1173             }
1174         } elsif (@_ == 1) {
1175             return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
1176             return charnames::string_vianame($_[0]);
1177         }
1178     }
1179     return;
1180 }
1181
1182 =head2 Unicode::UCD::UnicodeVersion
1183
1184 This returns the version of the Unicode Character Database, in other words, the
1185 version of the Unicode standard the database implements.  The version is a
1186 string of numbers delimited by dots (C<'.'>).
1187
1188 =cut
1189
1190 my $UNICODEVERSION;
1191
1192 sub UnicodeVersion {
1193     unless (defined $UNICODEVERSION) {
1194         openunicode(\$VERSIONFH, "version");
1195         chomp($UNICODEVERSION = <$VERSIONFH>);
1196         close($VERSIONFH);
1197         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1198             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1199     }
1200     return $UNICODEVERSION;
1201 }
1202
1203 =head2 B<Blocks versus Scripts>
1204
1205 The difference between a block and a script is that scripts are closer
1206 to the linguistic notion of a set of code points required to present
1207 languages, while block is more of an artifact of the Unicode code point
1208 numbering and separation into blocks of (mostly) 256 code points.
1209
1210 For example the Latin B<script> is spread over several B<blocks>, such
1211 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1212 C<Latin Extended-B>.  On the other hand, the Latin script does not
1213 contain all the characters of the C<Basic Latin> block (also known as
1214 ASCII): it includes only the letters, and not, for example, the digits
1215 or the punctuation.
1216
1217 For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1218
1219 For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1220
1221 =head2 B<Matching Scripts and Blocks>
1222
1223 Scripts are matched with the regular-expression construct
1224 C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1225 while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1226 any of the 256 code points in the Tibetan block).
1227
1228
1229 =head2 Implementation Note
1230
1231 The first use of charinfo() opens a read-only filehandle to the Unicode
1232 Character Database (the database is included in the Perl distribution).
1233 The filehandle is then kept open for further queries.  In other words,
1234 if you are wondering where one of your filehandles went, that's where.
1235
1236 =head1 BUGS
1237
1238 Does not yet support EBCDIC platforms.
1239
1240 =head1 AUTHOR
1241
1242 Jarkko Hietaniemi
1243
1244 =cut
1245
1246 1;