10 our @ISA = qw(Exporter);
11 our @EXPORT_OK = qw(charinfo
13 charblocks charscripts
22 UnicodeCD - Unicode character database
26 use UnicodeCD 'charinfo';
27 my $charinfo = charinfo($codepoint);
29 use UnicodeCD 'charblock';
30 my $charblock = charblock($codepoint);
32 use UnicodeCD 'charscript';
33 my $charscript = charblock($codepoint);
35 use UnicodeCD 'charblocks';
36 my $charblocks = charblocks();
38 use UnicodeCD 'charscripts';
39 my %charscripts = charscripts();
41 use UnicodeCD qw(charscript charinrange);
42 my $range = charscript($script);
43 print "looks like $script\n" if charinrange($range, $codepoint);
45 use UnicodeCD 'compexcl';
46 my $compexcl = compexcl($codepoint);
48 my $unicode_version = UnicodeCD::UnicodeVersion();
52 The UnicodeCD module offers a simple interface to the Unicode Character
66 my ($rfh, @path) = @_;
68 unless (defined $$rfh) {
71 $f = File::Spec->catfile($d, "unicode", @path);
72 last if open($$rfh, $f);
75 croak __PACKAGE__, ": failed to find ",
76 File::Spec->catfile(@path), " in @INC"
84 use UnicodeCD 'charinfo';
86 my $charinfo = charinfo(0x41);
88 charinfo() returns a reference to a hash that has the following fields
89 as defined by the Unicode standard:
93 code code point with at least four hexdigits
94 name name of the character IN UPPER CASE
95 category general category of the character
96 combining classes used in the Canonical Ordering Algorithm
97 bidi bidirectional category
98 decomposition character decomposition mapping
99 decimal if decimal digit this is the integer numeric value
100 digit if digit this is the numeric value
101 numeric if numeric is the integer or rational numeric value
102 mirrored if mirrored in bidirectional text
103 unicode10 Unicode 1.0 name if existed and different
104 comment ISO 10646 comment field
105 upper uppercase equivalent mapping
106 lower lowercase equivalent mapping
107 title titlecase equivalent mapping
109 block block the character belongs to (used in \p{In...})
110 script script the character belongs to
112 If no match is found, a reference to an empty hash is returned.
114 The C<block> property is the same as as returned by charinfo(). It is
115 not defined in the Unicode Character Database proper (Chapter 4 of the
116 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
117 of TUS3). Similarly for the C<script> property.
119 Note that you cannot do (de)composition and casing based solely on the
120 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
121 you will need also the compexcl(), casefold(), and casespec() functions.
128 if ($arg =~ /^\d+$/) {
130 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
139 my $code = _getcode($arg);
140 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
141 unless defined $code;
142 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
143 unless 0x3400 <= $code && $code <= 0x4DB5
144 || 0x4E00 <= $code && $code <= 0x9FA5
145 || 0x20000 <= $code && $code <= 0x2A6D6;
146 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
149 my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
150 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
151 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
154 my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
155 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
156 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
157 "YU", "EU", "YI", "I",
160 my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
161 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
162 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
163 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
171 LCount => 19, # scalar @JamoL
172 VCount => 21, # scalar @JamoV
173 TCount => 28, # scalar @JamoT
174 NCount => 588, # VCount * TCount
175 SCount => 11172, # LCount * NCount
176 Final => 0xD7A3, # SBase -1 + SCount
179 sub hangul_charname {
181 my $code = _getcode($arg);
182 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
183 unless defined $code;
184 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
185 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
186 my $SIndex = $code - $HangulConst{SBase};
187 my $LIndex = int( $SIndex / $HangulConst{NCount});
188 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
189 my $TIndex = $SIndex % $HangulConst{TCount};
200 my $code = _getcode($arg);
201 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
202 unless defined $code;
203 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
204 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
205 my $SIndex = $code - $HangulConst{SBase};
206 my $LIndex = int( $SIndex / $HangulConst{NCount});
207 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
208 my $TIndex = $SIndex % $HangulConst{TCount};
211 sprintf("%04X", $HangulConst{LBase} + $LIndex),
212 sprintf("%04X", $HangulConst{VBase} + $VIndex),
214 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
218 my @CharinfoRanges = (
220 # [ first, last, coderef to name, coderef to decompose ],
221 # CJK Ideographs Extension A
222 [ 0x3400, 0x4DB5, \&han_charname, undef ],
224 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
226 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
227 # Non-Private Use High Surrogates
228 [ 0xD800, 0xDB7F, undef, undef ],
229 # Private Use High Surrogates
230 [ 0xDB80, 0xDBFF, undef, undef ],
232 [ 0xDC00, 0xDFFF, undef, undef ],
233 # The Private Use Area
234 [ 0xE000, 0xF8FF, undef, undef ],
235 # CJK Ideographs Extension B
236 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
237 # Plane 15 Private Use Area
238 [ 0xF0000, 0xFFFFD, undef, undef ],
239 # Plane 16 Private Use Area
240 [ 0x100000, 0x10FFFD, undef, undef ],
245 my $code = _getcode($arg);
246 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
247 unless defined $code;
248 my $hexk = sprintf("%04X", $code);
249 my($rcode,$rname,$rdec);
250 foreach my $range (@CharinfoRanges){
251 if($range->[0] <= $code && $code <= $range->[1]){
253 $rname = $range->[2] ? $range->[2]->($code) : '';
254 $rdec = $range->[3] ? $range->[3]->($code) : '';
255 $hexk = sprintf("%04X",$range->[0]); # replace by the first
259 openunicode(\$UNICODEFH, "Unicode.sort"); # sorted
260 if (defined $UNICODEFH) {
262 if (look($UNICODEFH, "$hexk;") >= 0) {
263 my $line = <$UNICODEFH>;
268 combining bidi decomposition
269 decimal digit numeric
270 mirrored unicode10 comment
272 )} = split(/;/, $line, -1);
273 if ($prop{code} eq $hexk) {
274 $prop{block} = charblock($code);
275 $prop{script} = charscript($code);
277 $prop{code} = $rcode;
278 $prop{name} = $rname;
279 $prop{decomposition} = $rdec;
288 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
289 my ($table, $lo, $hi, $code) = @_;
293 my $mid = int(($lo+$hi) / 2);
295 if ($table->[$mid]->[0] < $code) {
296 if ($table->[$mid]->[1] >= $code) {
297 return $table->[$mid]->[2];
299 _search($table, $mid + 1, $hi, $code);
301 } elsif ($table->[$mid]->[0] > $code) {
302 _search($table, $lo, $mid - 1, $code);
304 return $table->[$mid]->[2];
309 my ($range, $arg) = @_;
310 my $code = _getcode($arg);
311 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
312 unless defined $code;
313 _search($range, 0, $#$range, $code);
318 use UnicodeCD 'charblock';
320 my $charblock = charblock(0x41);
321 my $charblock = charblock(1234);
322 my $charblock = charblock("0x263a");
323 my $charblock = charblock("U+263a");
325 my $ranges = charblock('Armenian');
327 With a B<code point argument> charblock() returns the block the character
328 belongs to, e.g. C<Basic Latin>. Note that not all the character
329 positions within all blocks are defined.
331 If supplied with an argument that can't be a code point, charblock()
332 tries to do the opposite and interpret the argument as a character
333 block. The return value is a I<range>: an anonymous list that
334 contains anonymous lists, which in turn contain I<start-of-range>,
335 I<end-of-range> code point pairs. You can test whether a code point
336 is in a range using the L</charinrange> function. If the argument is
337 not a known charater block, C<undef> is returned.
346 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
347 while (<$BLOCKSFH>) {
348 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
349 my ($lo, $hi) = (hex($1), hex($2));
350 my $subrange = [ $lo, $hi, $3 ];
351 push @BLOCKS, $subrange;
352 push @{$BLOCKS{$3}}, $subrange;
363 _charblocks() unless @BLOCKS;
365 my $code = _getcode($arg);
368 _search(\@BLOCKS, 0, $#BLOCKS, $code);
370 if (exists $BLOCKS{$arg}) {
371 return $BLOCKS{$arg};
380 use UnicodeCD 'charscript';
382 my $charscript = charscript(0x41);
383 my $charscript = charscript(1234);
384 my $charscript = charscript("U+263a");
386 my $ranges = charscript('Thai');
388 With a B<code point argument> charscript() returns the script the
389 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
391 If supplied with an argument that can't be a code point, charscript()
392 tries to do the opposite and interpret the argument as a character
393 script. The return value is a I<range>: an anonymous list that
394 contains anonymous lists, which in turn contain I<start-of-range>,
395 I<end-of-range> code point pairs. You can test whether a code point
396 is in a range using the L</charinrange> function. If the argument is
397 not a known charater script, C<undef> is returned.
406 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
407 while (<$SCRIPTSFH>) {
408 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
409 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
411 $script =~ s/\b(\w)/uc($1)/ge;
412 my $subrange = [ $lo, $hi, $script ];
413 push @SCRIPTS, $subrange;
414 push @{$SCRIPTS{$script}}, $subrange;
418 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
426 _charscripts() unless @SCRIPTS;
428 my $code = _getcode($arg);
431 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
433 if (exists $SCRIPTS{$arg}) {
434 return $SCRIPTS{$arg};
443 use UnicodeCD 'charblocks';
445 my $charblocks = charblocks();
447 charblocks() returns a reference to a hash with the known block names
448 as the keys, and the code point ranges (see L</charblock>) as the values.
453 _charblocks() unless %BLOCKS;
459 use UnicodeCD 'charscripts';
461 my %charscripts = charscripts();
463 charscripts() returns a hash with the known script names as the keys,
464 and the code point ranges (see L</charscript>) as the values.
469 _charscripts() unless %SCRIPTS;
473 =head2 Blocks versus Scripts
475 The difference between a block and a script is that scripts are closer
476 to the linguistic notion of a set of characters required to present
477 languages, while block is more of an artifact of the Unicode character
478 numbering and separation into blocks of 256 characters.
480 For example the Latin B<script> is spread over several B<blocks>, such
481 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
482 C<Latin Extended-B>. On the other hand, the Latin script does not
483 contain all the characters of the C<Basic Latin> block (also known as
484 the ASCII): it includes only the letters, not for example the digits
487 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
489 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
491 =head2 Matching Scripts and Blocks
493 Both scripts and blocks can be matched using the regular expression
494 construct C<\p{In...}> and its negation C<\P{In...}>.
496 The name of the script or the block comes after the C<In>, for example
497 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
498 removed from the names for the C<\p{In...}>, for example
499 C<LatinExtendedA> instead of C<Latin Extended-A>.
501 There are a few cases where there exists both a script and a block by
502 the same name, in these cases the block version has C<Block> appended:
503 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
505 =head2 Code Point Arguments
507 A <code point argument> is either a decimal or a hexadecimal scalar,
508 or "U+" followed by hexadecimals.
512 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
513 can also test whether a code point is in the I<range> as returned by
514 L</charblock> and L</charscript> or as the values of the hash returned
515 by L</charblocks> and L</charscripts> by using charinrange():
517 use UnicodeCD qw(charscript charinrange);
519 $range = charscript('Hiragana');
520 print "looks like hiragana\n" if charinrange($range, $codepoint);
526 use UnicodeCD 'compexcl';
528 my $compexcl = compexcl("09dc");
530 The compexcl() returns the composition exclusion (that is, if the
531 character should not be produced during a precomposition) of the
532 character specified by a B<code point argument>.
534 If there is a composition exclusion for the character, true is
535 returned. Otherwise, false is returned.
543 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
544 while (<$COMPEXCLFH>) {
545 if (/^([0-9A-F]+) \# /) {
547 $COMPEXCL{$code} = undef;
557 my $code = _getcode($arg);
559 _compexcl() unless %COMPEXCL;
561 return exists $COMPEXCL{$code};
566 use UnicodeCD 'casefold';
568 my %casefold = casefold("09dc");
570 The casefold() returns the locale-independent case folding of the
571 character specified by a B<code point argument>.
573 If there is a case folding for that character, a reference to a hash
574 with the following fields is returned:
578 code code point with at least four hexdigits
579 status "C", "F", "S", or "I"
580 mapping one or more codes separated by spaces
582 The meaning of the I<status> is as follows:
584 C common case folding, common mappings shared
585 by both simple and full mappings
586 F full case folding, mappings that cause strings
587 to grow in length. Multiple characters are separated
589 S simple case folding, mappings to single characters
590 where different from F
591 I special case for dotted uppercase I and
593 - If this mapping is included, the result is
594 case-insensitive, but dotless and dotted I's
595 are not distinguished
596 - If this mapping is excluded, the result is not
597 fully case-insensitive, but dotless and dotted
598 I's are distinguished
600 If there is no case folding for that character, C<undef> is returned.
602 For more information about case mappings see
603 http://www.unicode.org/unicode/reports/tr21/
611 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
612 while (<$CASEFOLDFH>) {
613 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
615 $CASEFOLD{$code} = { code => $1,
627 my $code = _getcode($arg);
629 _casefold() unless %CASEFOLD;
631 return $CASEFOLD{$code};
636 use UnicodeCD 'casespec';
638 my %casespec = casespec("09dc");
640 The casespec() returns the potentially locale-dependent case mapping
641 of the character specified by a B<code point argument>. The mapping
642 may change the length of the string (which the basic Unicode case
643 mappings as returned by charinfo() never do).
645 If there is a case folding for that character, a reference to a hash
646 with the following fields is returned:
650 code code point with at least four hexdigits
654 condition condition list (may be undef)
656 The C<condition> is optional. Where present, it consists of one or
657 more I<locales> or I<contexts>, separated by spaces (other than as
658 used to separate elements, spaces are to be ignored). A condition
659 list overrides the normal behavior if all of the listed conditions are
660 true. Case distinctions in the condition list are not significant.
661 Conditions preceded by "NON_" represent the negation of the condition
663 A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
664 followed by a "_" and a 2-letter ISO language code (possibly followed
665 by a "_" and a variant code). You can find the lists of those codes,
666 see L<Locale::Country> and L<Locale::Language>.
668 A I<context> is one of the following choices:
670 FINAL The letter is not followed by a letter of
671 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
672 MODERN The mapping is only used for modern text
673 AFTER_i The last base character was "i" (U+0069)
675 For more information about case mappings see
676 http://www.unicode.org/unicode/reports/tr21/
684 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
685 while (<$CASESPECFH>) {
686 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
688 $CASESPEC{$code} = { code => $1,
702 my $code = _getcode($arg);
704 _casespec() unless %CASESPEC;
706 return $CASESPEC{$code};
709 =head2 UnicodeCD::UnicodeVersion
711 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
712 Database, in other words, the version of the Unicode standard the
720 unless (defined $UNICODEVERSION) {
721 openunicode(\$VERSIONFH, "version");
722 chomp($UNICODEVERSION = <$VERSIONFH>);
724 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
725 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
727 return $UNICODEVERSION;
730 =head2 Implementation Note
732 The first use of charinfo() opens a read-only filehandle to the Unicode
733 Character Database (the database is included in the Perl distribution).
734 The filehandle is then kept open for further queries.