lib/UnicodeCD.pm

   1 package UnicodeCD;
   2
   3 use strict;
   4 use warnings;
   5
   6 our $VERSION = '0.2';
   7
   8 require Exporter;
   9
  10 our @ISA = qw(Exporter);
  11
  12 our @EXPORT_OK = qw(charinfo
  13                     charblock charscript
  14                     charblocks charscripts
  15                     charinrange
  16                     compexcl
  17                     casefold casespec);
  18
  19 use Carp;
  20
  21 =head1 NAME
  22
  23 UnicodeCD - Unicode character database
  24
  25 =head1 SYNOPSIS
  26
  27     use UnicodeCD 'charinfo';
  28     my $charinfo   = charinfo($codepoint);
  29
  30     use UnicodeCD 'charblock';
  31     my $charblock  = charblock($codepoint);
  32
  33     use UnicodeCD 'charscript';
  34     my $charscript = charblock($codepoint);
  35
  36     use UnicodeCD 'charblocks';
  37     my $charblocks = charblocks();
  38
  39     use UnicodeCD 'charscripts';
  40     my %charscripts = charscripts();
  41
  42     use UnicodeCD qw(charscript charinrange);
  43     my $range = charscript($script);
  44     print "looks like $script\n" if charinrange($range, $codepoint);
  45
  46     use UnicodeCD 'compexcl';
  47     my $compexcl = compexcl($codepoint);
  48
  49     my $unicode_version = UnicodeCD::UnicodeVersion();
  50
  51 =head1 DESCRIPTION
  52
  53 The UnicodeCD module offers a simple interface to the Unicode Character
  54 Database.
  55
  56 =cut
  57
  58 my $UNICODEFH;
  59 my $BLOCKSFH;
  60 my $SCRIPTSFH;
  61 my $VERSIONFH;
  62 my $COMPEXCLFH;
  63 my $CASEFOLDFH;
  64 my $CASESPECFH;
  65
  66 sub openunicode {
  67     my ($rfh, @path) = @_;
  68     my $f;
  69     unless (defined $$rfh) {
  70         for my $d (@INC) {
  71             use File::Spec;
  72             $f = File::Spec->catfile($d, "unicode", @path);
  73             last if open($$rfh, $f);
  74             undef $f;
  75         }
  76         croak __PACKAGE__, ": failed to find ",
  77               File::Spec->catfile(@path), " in @INC"
  78             unless defined $f;
  79     }
  80     return $f;
  81 }
  82
  83 =head2 charinfo
  84
  85     use UnicodeCD 'charinfo';
  86
  87     my $charinfo = charinfo(0x41);
  88
  89 charinfo() returns a reference to a hash that has the following fields
  90 as defined by the Unicode standard:
  91
  92     key
  93
  94     code             code point with at least four hexdigits
  95     name             name of the character IN UPPER CASE
  96     category         general category of the character
  97     combining        classes used in the Canonical Ordering Algorithm
  98     bidi             bidirectional category
  99     decomposition    character decomposition mapping
 100     decimal          if decimal digit this is the integer numeric value
 101     digit            if digit this is the numeric value
 102     numeric          if numeric is the integer or rational numeric value
 103     mirrored         if mirrored in bidirectional text
 104     unicode10        Unicode 1.0 name if existed and different
 105     comment          ISO 10646 comment field
 106     upper            uppercase equivalent mapping
 107     lower            lowercase equivalent mapping
 108     title            titlecase equivalent mapping
 109
 110     block            block the character belongs to (used in \p{In...})
 111     script           script the character belongs to
 112
 113 If no match is found, a reference to an empty hash is returned.
 114
 115 The C<block> property is the same as as returned by charinfo().  It is
 116 not defined in the Unicode Character Database proper (Chapter 4 of the
 117 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
 118 of TUS3).  Similarly for the C<script> property.
 119
 120 Note that you cannot do (de)composition and casing based solely on the
 121 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
 122 you will need also the compexcl(), casefold(), and casespec() functions.
 123
 124 =cut
 125
 126 sub _getcode {
 127     my $arg = shift;
 128
 129     if ($arg =~ /^\d+$/) {
 130         return $arg;
 131     } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
 132         return hex($1);
 133     }
 134
 135     return;
 136 }
 137
 138 sub han_charname {
 139     my $arg  = shift;
 140     my $code = _getcode($arg);
 141     croak __PACKAGE__, "::han_charname: unknown code '$arg'"
 142         unless defined $code;
 143     croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
 144         unless 0x3400  <= $code && $code <= 0x4DB5
 145             || 0x4E00  <= $code && $code <= 0x9FA5
 146             || 0x20000 <= $code && $code <= 0x2A6D6;
 147     sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
 148 }
 149
 150 my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
 151     "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
 152     "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
 153   );
 154
 155 my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
 156     "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
 157     "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
 158     "YU", "EU", "YI", "I",
 159   );
 160
 161 my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
 162     "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
 163     "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
 164     "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
 165   );
 166
 167 my %HangulConst = (
 168    SBase  => 0xAC00,
 169    LBase  => 0x1100,
 170    VBase  => 0x1161,
 171    TBase  => 0x11A7,
 172    LCount => 19,     # scalar @JamoL
 173    VCount => 21,     # scalar @JamoV
 174    TCount => 28,     # scalar @JamoT
 175    NCount => 588,    # VCount * TCount
 176    SCount => 11172,  # LCount * NCount
 177    Final  => 0xD7A3, # SBase -1 + SCount
 178   );
 179
 180 sub hangul_charname {
 181     my $arg  = shift;
 182     my $code = _getcode($arg);
 183     croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
 184         unless defined $code;
 185     croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
 186         unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
 187     my $SIndex = $code - $HangulConst{SBase};
 188     my $LIndex = int( $SIndex / $HangulConst{NCount});
 189     my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
 190     my $TIndex =      $SIndex % $HangulConst{TCount};
 191     return join('',
 192         "HANGUL SYLLABLE ",
 193         $JamoL[$LIndex],
 194         $JamoV[$VIndex],
 195         $JamoT[$TIndex],
 196       );
 197 }
 198
 199 sub hangul_decomp {
 200     my $arg  = shift;
 201     my $code = _getcode($arg);
 202     croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
 203         unless defined $code;
 204     croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
 205         unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
 206     my $SIndex = $code - $HangulConst{SBase};
 207     my $LIndex = int( $SIndex / $HangulConst{NCount});
 208     my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
 209     my $TIndex =      $SIndex % $HangulConst{TCount};
 210
 211     return join(" ",
 212         sprintf("%04X", $HangulConst{LBase} + $LIndex),
 213         sprintf("%04X", $HangulConst{VBase} + $VIndex),
 214       $TIndex ?
 215         sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
 216     );
 217 }
 218
 219 my @CharinfoRanges = (
 220 # block name
 221 # [ first, last, coderef to name, coderef to decompose ],
 222 # CJK Ideographs Extension A
 223   [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
 224 # CJK Ideographs
 225   [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
 226 # Hangul Syllables
 227   [ 0xAC00,   0xD7A3,   \&hangul_charname, \&hangul_decomp  ],
 228 # Non-Private Use High Surrogates
 229   [ 0xD800,   0xDB7F,   undef,   undef  ],
 230 # Private Use High Surrogates
 231   [ 0xDB80,   0xDBFF,   undef,   undef  ],
 232 # Low Surrogates
 233   [ 0xDC00,   0xDFFF,   undef,   undef  ],
 234 # The Private Use Area
 235   [ 0xE000,   0xF8FF,   undef,   undef  ],
 236 # CJK Ideographs Extension B
 237   [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
 238 # Plane 15 Private Use Area
 239   [ 0xF0000,  0xFFFFD,  undef,   undef  ],
 240 # Plane 16 Private Use Area
 241   [ 0x100000, 0x10FFFD, undef,   undef  ],
 242 );
 243
 244 sub charinfo {
 245     my $arg  = shift;
 246     my $code = _getcode($arg);
 247     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 248         unless defined $code;
 249     my $hexk = sprintf("%04X", $code);
 250     my($rcode,$rname,$rdec);
 251     foreach my $range (@CharinfoRanges){
 252       if ($range->[0] <= $code && $code <= $range->[1]) {
 253         $rcode = $hexk;
 254         $rname = $range->[2] ? $range->[2]->($code) : '';
 255         $rdec  = $range->[3] ? $range->[3]->($code) : '';
 256         $hexk  = sprintf("%04X", $range->[0]); # replace by the first
 257         last;
 258       }
 259     }
 260     openunicode(\$UNICODEFH, "Unicode.txt");
 261     if (defined $UNICODEFH) {
 262         use Search::Dict;
 263         if (look($UNICODEFH, "$hexk;")) {
 264             my $line = <$UNICODEFH>;
 265             chomp $line;
 266             my %prop;
 267             @prop{qw(
 268                      code name category
 269                      combining bidi decomposition
 270                      decimal digit numeric
 271                      mirrored unicode10 comment
 272                      upper lower title
 273                     )} = split(/;/, $line, -1);
 274             if ($prop{code} eq $hexk) {
 275                 $prop{block}  = charblock($code);
 276                 $prop{script} = charscript($code);
 277                 if(defined $rname){
 278                     $prop{code} = $rcode;
 279                     $prop{name} = $rname;
 280                     $prop{decomposition} = $rdec;
 281                 }
 282                 return \%prop;
 283             }
 284         }
 285     }
 286     return;
 287 }
 288
 289 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
 290     my ($table, $lo, $hi, $code) = @_;
 291
 292     return if $lo > $hi;
 293
 294     my $mid = int(($lo+$hi) / 2);
 295
 296     if ($table->[$mid]->[0] < $code) {
 297         if ($table->[$mid]->[1] >= $code) {
 298             return $table->[$mid]->[2];
 299         } else {
 300             _search($table, $mid + 1, $hi, $code);
 301         }
 302     } elsif ($table->[$mid]->[0] > $code) {
 303         _search($table, $lo, $mid - 1, $code);
 304     } else {
 305         return $table->[$mid]->[2];
 306     }
 307 }
 308
 309 sub charinrange {
 310     my ($range, $arg) = @_;
 311     my $code = _getcode($arg);
 312     croak __PACKAGE__, "::charinrange: unknown code '$arg'"
 313         unless defined $code;
 314     _search($range, 0, $#$range, $code);
 315 }
 316
 317 =head2 charblock
 318
 319     use UnicodeCD 'charblock';
 320
 321     my $charblock = charblock(0x41);
 322     my $charblock = charblock(1234);
 323     my $charblock = charblock("0x263a");
 324     my $charblock = charblock("U+263a");
 325
 326     my $ranges    = charblock('Armenian');
 327
 328 With a B<code point argument> charblock() returns the block the character
 329 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
 330 positions within all blocks are defined.
 331
 332 If supplied with an argument that can't be a code point, charblock()
 333 tries to do the opposite and interpret the argument as a character
 334 block.  The return value is a I<range>: an anonymous list that
 335 contains anonymous lists, which in turn contain I<start-of-range>,
 336 I<end-of-range> code point pairs.  You can test whether a code point
 337 is in a range using the L</charinrange> function.  If the argument is
 338 not a known charater block, C<undef> is returned.
 339
 340 =cut
 341
 342 my @BLOCKS;
 343 my %BLOCKS;
 344
 345 sub _charblocks {
 346     unless (@BLOCKS) {
 347         if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
 348             while (<$BLOCKSFH>) {
 349                 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
 350                     my ($lo, $hi) = (hex($1), hex($2));
 351                     my $subrange = [ $lo, $hi, $3 ];
 352                     push @BLOCKS, $subrange;
 353                     push @{$BLOCKS{$3}}, $subrange;
 354                 }
 355             }
 356             close($BLOCKSFH);
 357         }
 358     }
 359 }
 360
 361 sub charblock {
 362     my $arg = shift;
 363
 364     _charblocks() unless @BLOCKS;
 365
 366     my $code = _getcode($arg);
 367
 368     if (defined $code) {
 369         _search(\@BLOCKS, 0, $#BLOCKS, $code);
 370     } else {
 371         if (exists $BLOCKS{$arg}) {
 372             return $BLOCKS{$arg};
 373         } else {
 374             return;
 375         }
 376     }
 377 }
 378
 379 =head2 charscript
 380
 381     use UnicodeCD 'charscript';
 382
 383     my $charscript = charscript(0x41);
 384     my $charscript = charscript(1234);
 385     my $charscript = charscript("U+263a");
 386
 387     my $ranges     = charscript('Thai');
 388
 389 With a B<code point argument> charscript() returns the script the
 390 character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 391
 392 If supplied with an argument that can't be a code point, charscript()
 393 tries to do the opposite and interpret the argument as a character
 394 script.  The return value is a I<range>: an anonymous list that
 395 contains anonymous lists, which in turn contain I<start-of-range>,
 396 I<end-of-range> code point pairs.  You can test whether a code point
 397 is in a range using the L</charinrange> function.  If the argument is
 398 not a known charater script, C<undef> is returned.
 399
 400 =cut
 401
 402 my @SCRIPTS;
 403 my %SCRIPTS;
 404
 405 sub _charscripts {
 406     unless (@SCRIPTS) {
 407         if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
 408             while (<$SCRIPTSFH>) {
 409                 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
 410                     my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
 411                     my $script = lc($3);
 412                     $script =~ s/\b(\w)/uc($1)/ge;
 413                     my $subrange = [ $lo, $hi, $script ];
 414                     push @SCRIPTS, $subrange;
 415                     push @{$SCRIPTS{$script}}, $subrange;
 416                 }
 417             }
 418             close($SCRIPTSFH);
 419             @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
 420         }
 421     }
 422 }
 423
 424 sub charscript {
 425     my $arg = shift;
 426
 427     _charscripts() unless @SCRIPTS;
 428
 429     my $code = _getcode($arg);
 430
 431     if (defined $code) {
 432         _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
 433     } else {
 434         if (exists $SCRIPTS{$arg}) {
 435             return $SCRIPTS{$arg};
 436         } else {
 437             return;
 438         }
 439     }
 440 }
 441
 442 =head2 charblocks
 443
 444     use UnicodeCD 'charblocks';
 445
 446     my $charblocks = charblocks();
 447
 448 charblocks() returns a reference to a hash with the known block names
 449 as the keys, and the code point ranges (see L</charblock>) as the values.
 450
 451 =cut
 452
 453 sub charblocks {
 454     _charblocks() unless %BLOCKS;
 455     return \%BLOCKS;
 456 }
 457
 458 =head2 charscripts
 459
 460     use UnicodeCD 'charscripts';
 461
 462     my %charscripts = charscripts();
 463
 464 charscripts() returns a hash with the known script names as the keys,
 465 and the code point ranges (see L</charscript>) as the values.
 466
 467 =cut
 468
 469 sub charscripts {
 470     _charscripts() unless %SCRIPTS;
 471     return \%SCRIPTS;
 472 }
 473
 474 =head2 Blocks versus Scripts
 475
 476 The difference between a block and a script is that scripts are closer
 477 to the linguistic notion of a set of characters required to present
 478 languages, while block is more of an artifact of the Unicode character
 479 numbering and separation into blocks of 256 characters.
 480
 481 For example the Latin B<script> is spread over several B<blocks>, such
 482 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
 483 C<Latin Extended-B>.  On the other hand, the Latin script does not
 484 contain all the characters of the C<Basic Latin> block (also known as
 485 the ASCII): it includes only the letters, not for example the digits
 486 or the punctuation.
 487
 488 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
 489
 490 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
 491
 492 =head2 Matching Scripts and Blocks
 493
 494 Both scripts and blocks can be matched using the regular expression
 495 construct C<\p{In...}> and its negation C<\P{In...}>.
 496
 497 The name of the script or the block comes after the C<In>, for example
 498 C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
 499 removed from the names for the C<\p{In...}>, for example
 500 C<LatinExtendedA> instead of C<Latin Extended-A>.
 501
 502 There are a few cases where there exists both a script and a block by
 503 the same name, in these cases the block version has C<Block> appended:
 504 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
 505
 506 =head2 Code Point Arguments
 507
 508 A <code point argument> is either a decimal or a hexadecimal scalar,
 509 or "U+" followed by hexadecimals.
 510
 511 =head2 charinrange
 512
 513 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
 514 can also test whether a code point is in the I<range> as returned by
 515 L</charblock> and L</charscript> or as the values of the hash returned
 516 by L</charblocks> and L</charscripts> by using charinrange():
 517
 518     use UnicodeCD qw(charscript charinrange);
 519
 520     $range = charscript('Hiragana');
 521     print "looks like hiragana\n" if charinrange($range, $codepoint);
 522
 523 =cut
 524
 525 =head2 compexcl
 526
 527     use UnicodeCD 'compexcl';
 528
 529     my $compexcl = compexcl("09dc");
 530
 531 The compexcl() returns the composition exclusion (that is, if the
 532 character should not be produced during a precomposition) of the
 533 character specified by a B<code point argument>.
 534
 535 If there is a composition exclusion for the character, true is
 536 returned.  Otherwise, false is returned.
 537
 538 =cut
 539
 540 my %COMPEXCL;
 541
 542 sub _compexcl {
 543     unless (%COMPEXCL) {
 544         if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
 545             while (<$COMPEXCLFH>) {
 546                 if (/^([0-9A-F]+) \# /) {
 547                     my $code = hex($1);
 548                     $COMPEXCL{$code} = undef;
 549                 }
 550             }
 551             close($COMPEXCLFH);
 552         }
 553     }
 554 }
 555
 556 sub compexcl {
 557     my $arg  = shift;
 558     my $code = _getcode($arg);
 559     croak __PACKAGE__, "::compexcl: unknown code '$arg'"
 560         unless defined $code;
 561
 562     _compexcl() unless %COMPEXCL;
 563
 564     return exists $COMPEXCL{$code};
 565 }
 566
 567 =head2 casefold
 568
 569     use UnicodeCD 'casefold';
 570
 571     my %casefold = casefold("09dc");
 572
 573 The casefold() returns the locale-independent case folding of the
 574 character specified by a B<code point argument>.
 575
 576 If there is a case folding for that character, a reference to a hash
 577 with the following fields is returned:
 578
 579     key
 580
 581     code             code point with at least four hexdigits
 582     status           "C", "F", "S", or "I"
 583     mapping          one or more codes separated by spaces
 584
 585 The meaning of the I<status> is as follows:
 586
 587    C                 common case folding, common mappings shared
 588                      by both simple and full mappings
 589    F                 full case folding, mappings that cause strings
 590                      to grow in length. Multiple characters are separated
 591                      by spaces
 592    S                 simple case folding, mappings to single characters
 593                      where different from F
 594    I                 special case for dotted uppercase I and
 595                      dotless lowercase i
 596                      - If this mapping is included, the result is
 597                        case-insensitive, but dotless and dotted I's
 598                        are not distinguished
 599                      - If this mapping is excluded, the result is not
 600                        fully case-insensitive, but dotless and dotted
 601                        I's are distinguished
 602
 603 If there is no case folding for that character, C<undef> is returned.
 604
 605 For more information about case mappings see
 606 http://www.unicode.org/unicode/reports/tr21/
 607
 608 =cut
 609
 610 my %CASEFOLD;
 611
 612 sub _casefold {
 613     unless (%CASEFOLD) {
 614         if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
 615             while (<$CASEFOLDFH>) {
 616                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
 617                     my $code = hex($1);
 618                     $CASEFOLD{$code} = { code    => $1,
 619                                          status  => $2,
 620                                          mapping => $3 };
 621                 }
 622             }
 623             close($CASEFOLDFH);
 624         }
 625     }
 626 }
 627
 628 sub casefold {
 629     my $arg  = shift;
 630     my $code = _getcode($arg);
 631     croak __PACKAGE__, "::casefold: unknown code '$arg'"
 632         unless defined $code;
 633
 634     _casefold() unless %CASEFOLD;
 635
 636     return $CASEFOLD{$code};
 637 }
 638
 639 =head2 casespec
 640
 641     use UnicodeCD 'casespec';
 642
 643     my %casespec = casespec("09dc");
 644
 645 The casespec() returns the potentially locale-dependent case mapping
 646 of the character specified by a B<code point argument>.  The mapping
 647 may change the length of the string (which the basic Unicode case
 648 mappings as returned by charinfo() never do).
 649
 650 If there is a case folding for that character, a reference to a hash
 651 with the following fields is returned:
 652
 653     key
 654
 655     code             code point with at least four hexdigits
 656     lower            lowercase
 657     title            titlecase
 658     upper            uppercase
 659     condition        condition list (may be undef)
 660
 661 The C<condition> is optional.  Where present, it consists of one or
 662 more I<locales> or I<contexts>, separated by spaces (other than as
 663 used to separate elements, spaces are to be ignored).  A condition
 664 list overrides the normal behavior if all of the listed conditions are
 665 true.  Case distinctions in the condition list are not significant.
 666 Conditions preceded by "NON_" represent the negation of the condition
 667
 668 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
 669 followed by a "_" and a 2-letter ISO language code (possibly followed
 670 by a "_" and a variant code).  You can find the lists of those codes,
 671 see L<Locale::Country> and L<Locale::Language>.
 672
 673 A I<context> is one of the following choices:
 674
 675     FINAL            The letter is not followed by a letter of
 676                      general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
 677     MODERN           The mapping is only used for modern text
 678     AFTER_i          The last base character was "i" (U+0069)
 679
 680 For more information about case mappings see
 681 http://www.unicode.org/unicode/reports/tr21/
 682
 683 =cut
 684
 685 my %CASESPEC;
 686
 687 sub _casespec {
 688     unless (%CASESPEC) {
 689         if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
 690             while (<$CASESPECFH>) {
 691                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
 692                     my $code = hex($1);
 693                     $CASESPEC{$code} = { code      => $1,
 694                                          lower     => $2,
 695                                          title     => $3,
 696                                          upper     => $4,
 697                                          condition => $5 };
 698                 }
 699             }
 700             close($CASESPECFH);
 701         }
 702     }
 703 }
 704
 705 sub casespec {
 706     my $arg  = shift;
 707     my $code = _getcode($arg);
 708     croak __PACKAGE__, "::casespec: unknown code '$arg'"
 709         unless defined $code;
 710
 711     _casespec() unless %CASESPEC;
 712
 713     return $CASESPEC{$code};
 714 }
 715
 716 =head2 UnicodeCD::UnicodeVersion
 717
 718 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
 719 Database, in other words, the version of the Unicode standard the
 720 database implements.
 721
 722 =cut
 723
 724 my $UNICODEVERSION;
 725
 726 sub UnicodeVersion {
 727     unless (defined $UNICODEVERSION) {
 728         openunicode(\$VERSIONFH, "version");
 729         chomp($UNICODEVERSION = <$VERSIONFH>);
 730         close($VERSIONFH);
 731         croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
 732             unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
 733     }
 734     return $UNICODEVERSION;
 735 }
 736
 737 =head2 Implementation Note
 738
 739 The first use of charinfo() opens a read-only filehandle to the Unicode
 740 Character Database (the database is included in the Perl distribution).
 741 The filehandle is then kept open for further queries.
 742
 743 =head1 AUTHOR
 744
 745 Jarkko Hietaniemi
 746
 747 =cut
 748
 749 1;