10 our @ISA = qw(Exporter);
12 our @EXPORT_OK = qw(charinfo
14 charblocks charscripts
23 UnicodeCD - Unicode character database
27 use UnicodeCD 'charinfo';
28 my $charinfo = charinfo($codepoint);
30 use UnicodeCD 'charblock';
31 my $charblock = charblock($codepoint);
33 use UnicodeCD 'charscript';
34 my $charscript = charscript($codepoint);
36 use UnicodeCD 'charblocks';
37 my $charblocks = charblocks();
39 use UnicodeCD 'charscripts';
40 my %charscripts = charscripts();
42 use UnicodeCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
46 use UnicodeCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
49 my $unicode_version = UnicodeCD::UnicodeVersion();
53 The UnicodeCD module offers a simple interface to the Unicode Character
67 my ($rfh, @path) = @_;
69 unless (defined $$rfh) {
72 $f = File::Spec->catfile($d, "unicode", @path);
73 last if open($$rfh, $f);
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
85 use UnicodeCD 'charinfo';
87 my $charinfo = charinfo(0x41);
89 charinfo() returns a reference to a hash that has the following fields
90 as defined by the Unicode standard:
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric this is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
110 block block the character belongs to (used in \p{In...})
111 script script the character belongs to
113 If no match is found, a reference to an empty hash is returned.
115 The C<block> property is the same as that returned by charblock(). It is
116 not defined in the Unicode Character Database proper (Chapter 4 of the
117 Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
118 of TUS3). Similarly for the C<script> property.
120 Note that you cannot do (de)composition and casing based solely on the
121 above C<decomposition>, C<lower>, C<upper>, and C<title> properties;
122 you will also need the compexcl(), casefold(), and casespec() functions.
129 if ($arg =~ /^\d+$/) {
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
140 my $code = _getcode($arg);
141 croak __PACKAGE__, "::han_charname: unknown code '$arg'"
142 unless defined $code;
143 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
144 unless 0x3400 <= $code && $code <= 0x4DB5
145 || 0x4E00 <= $code && $code <= 0x9FA5
146 || 0x20000 <= $code && $code <= 0x2A6D6;
147 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
150 my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
151 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
152 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
155 my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
156 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
157 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
158 "YU", "EU", "YI", "I",
161 my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
162 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
163 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
164 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
172 LCount => 19, # scalar @JamoL
173 VCount => 21, # scalar @JamoV
174 TCount => 28, # scalar @JamoT
175 NCount => 588, # VCount * TCount
176 SCount => 11172, # LCount * NCount
177 Final => 0xD7A3, # SBase -1 + SCount
180 sub hangul_charname {
182 my $code = _getcode($arg);
183 croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
184 unless defined $code;
185 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
186 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
187 my $SIndex = $code - $HangulConst{SBase};
188 my $LIndex = int( $SIndex / $HangulConst{NCount});
189 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
190 my $TIndex = $SIndex % $HangulConst{TCount};
201 my $code = _getcode($arg);
202 croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
203 unless defined $code;
204 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
205 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
206 my $SIndex = $code - $HangulConst{SBase};
207 my $LIndex = int( $SIndex / $HangulConst{NCount});
208 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
209 my $TIndex = $SIndex % $HangulConst{TCount};
212 sprintf("%04X", $HangulConst{LBase} + $LIndex),
213 sprintf("%04X", $HangulConst{VBase} + $VIndex),
215 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
219 my @CharinfoRanges = (
221 # [ first, last, coderef to name, coderef to decompose ],
222 # CJK Ideographs Extension A
223 [ 0x3400, 0x4DB5, \&han_charname, undef ],
225 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
227 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
228 # Non-Private Use High Surrogates
229 [ 0xD800, 0xDB7F, undef, undef ],
230 # Private Use High Surrogates
231 [ 0xDB80, 0xDBFF, undef, undef ],
233 [ 0xDC00, 0xDFFF, undef, undef ],
234 # The Private Use Area
235 [ 0xE000, 0xF8FF, undef, undef ],
236 # CJK Ideographs Extension B
237 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
238 # Plane 15 Private Use Area
239 [ 0xF0000, 0xFFFFD, undef, undef ],
240 # Plane 16 Private Use Area
241 [ 0x100000, 0x10FFFD, undef, undef ],
246 my $code = _getcode($arg);
247 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
248 unless defined $code;
249 my $hexk = sprintf("%04X", $code);
250 my($rcode,$rname,$rdec);
251 foreach my $range (@CharinfoRanges){
252 if ($range->[0] <= $code && $code <= $range->[1]) {
254 $rname = $range->[2] ? $range->[2]->($code) : '';
255 $rdec = $range->[3] ? $range->[3]->($code) : '';
256 $hexk = sprintf("%04X", $range->[0]); # replace by the first
260 openunicode(\$UNICODEFH, "Unicode.txt");
261 if (defined $UNICODEFH) {
263 if (look($UNICODEFH, "$hexk;")) {
264 my $line = <$UNICODEFH>;
269 combining bidi decomposition
270 decimal digit numeric
271 mirrored unicode10 comment
273 )} = split(/;/, $line, -1);
274 if ($prop{code} eq $hexk) {
275 $prop{block} = charblock($code);
276 $prop{script} = charscript($code);
278 $prop{code} = $rcode;
279 $prop{name} = $rname;
280 $prop{decomposition} = $rdec;
289 sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
290 my ($table, $lo, $hi, $code) = @_;
294 my $mid = int(($lo+$hi) / 2);
296 if ($table->[$mid]->[0] < $code) {
297 if ($table->[$mid]->[1] >= $code) {
298 return $table->[$mid]->[2];
300 _search($table, $mid + 1, $hi, $code);
302 } elsif ($table->[$mid]->[0] > $code) {
303 _search($table, $lo, $mid - 1, $code);
305 return $table->[$mid]->[2];
310 my ($range, $arg) = @_;
311 my $code = _getcode($arg);
312 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
313 unless defined $code;
314 _search($range, 0, $#$range, $code);
319 use UnicodeCD 'charblock';
321 my $charblock = charblock(0x41);
322 my $charblock = charblock(1234);
323 my $charblock = charblock("0x263a");
324 my $charblock = charblock("U+263a");
326 my $ranges = charblock('Armenian');
328 With a B<code point argument> charblock() returns the block the character
329 belongs to, e.g. C<Basic Latin>. Note that not all the character
330 positions within all blocks are defined.
332 If supplied with an argument that can't be a code point, charblock()
333 tries to do the opposite and interpret the argument as a character
334 block. The return value is a I<range>: an anonymous list that
335 contains anonymous lists, which in turn contain I<start-of-range>,
336 I<end-of-range> code point pairs. You can test whether a code point
337 is in a range using the L</charinrange> function. If the argument is
338 not a known character block, C<undef> is returned.
347 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
348 while (<$BLOCKSFH>) {
349 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
350 my ($lo, $hi) = (hex($1), hex($2));
351 my $subrange = [ $lo, $hi, $3 ];
352 push @BLOCKS, $subrange;
353 push @{$BLOCKS{$3}}, $subrange;
364 _charblocks() unless @BLOCKS;
366 my $code = _getcode($arg);
369 _search(\@BLOCKS, 0, $#BLOCKS, $code);
371 if (exists $BLOCKS{$arg}) {
372 return $BLOCKS{$arg};
381 use UnicodeCD 'charscript';
383 my $charscript = charscript(0x41);
384 my $charscript = charscript(1234);
385 my $charscript = charscript("U+263a");
387 my $ranges = charscript('Thai');
389 With a B<code point argument> charscript() returns the script the
390 character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
392 If supplied with an argument that can't be a code point, charscript()
393 tries to do the opposite and interpret the argument as a character
394 script. The return value is a I<range>: an anonymous list that
395 contains anonymous lists, which in turn contain I<start-of-range>,
396 I<end-of-range> code point pairs. You can test whether a code point
397 is in a range using the L</charinrange> function. If the argument is
398 not a known character script, C<undef> is returned.
407 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
408 while (<$SCRIPTSFH>) {
409 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
410 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
412 $script =~ s/\b(\w)/uc($1)/ge;
413 my $subrange = [ $lo, $hi, $script ];
414 push @SCRIPTS, $subrange;
415 push @{$SCRIPTS{$script}}, $subrange;
419 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
427 _charscripts() unless @SCRIPTS;
429 my $code = _getcode($arg);
432 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
434 if (exists $SCRIPTS{$arg}) {
435 return $SCRIPTS{$arg};
444 use UnicodeCD 'charblocks';
446 my $charblocks = charblocks();
448 charblocks() returns a reference to a hash with the known block names
449 as the keys, and the code point ranges (see L</charblock>) as the values.
454 _charblocks() unless %BLOCKS;
460 use UnicodeCD 'charscripts';
462 my %charscripts = charscripts();
464 charscripts() returns a hash with the known script names as the keys,
465 and the code point ranges (see L</charscript>) as the values.
470 _charscripts() unless %SCRIPTS;
474 =head2 Blocks versus Scripts
476 The difference between a block and a script is that scripts are closer
477 to the linguistic notion of a set of characters required to present
478 languages, while block is more of an artifact of the Unicode character
479 numbering and separation into blocks of 256 characters.
481 For example the Latin B<script> is spread over several B<blocks>, such
482 as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
483 C<Latin Extended-B>. On the other hand, the Latin script does not
484 contain all the characters of the C<Basic Latin> block (also known as
485 the ASCII): it includes only the letters, not for example the digits
488 For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
490 For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
492 =head2 Matching Scripts and Blocks
494 Both scripts and blocks can be matched using the regular expression
495 construct C<\p{In...}> and its negation C<\P{In...}>.
497 The name of the script or the block comes after the C<In>, for example
498 C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
499 removed from the names for the C<\p{In...}>, for example
500 C<LatinExtendedA> instead of C<Latin Extended-A>.
502 There are a few cases where there exists both a script and a block by
503 the same name, in these cases the block version has C<Block> appended:
504 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
506 =head2 Code Point Arguments
508 A I<code point argument> is either a decimal or a hexadecimal scalar,
509 or "U+" followed by hexadecimals.
513 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
514 can also test whether a code point is in the I<range> as returned by
515 L</charblock> and L</charscript> or as the values of the hash returned
516 by L</charblocks> and L</charscripts> by using charinrange():
518 use UnicodeCD qw(charscript charinrange);
520 $range = charscript('Hiragana');
521 print "looks like hiragana\n" if charinrange($range, $codepoint);
527 use UnicodeCD 'compexcl';
529 my $compexcl = compexcl("09dc");
531 The compexcl() returns the composition exclusion (that is, if the
532 character should not be produced during a precomposition) of the
533 character specified by a B<code point argument>.
535 If there is a composition exclusion for the character, true is
536 returned. Otherwise, false is returned.
544 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
545 while (<$COMPEXCLFH>) {
546 if (/^([0-9A-F]+) \# /) {
548 $COMPEXCL{$code} = undef;
558 my $code = _getcode($arg);
559 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
560 unless defined $code;
562 _compexcl() unless %COMPEXCL;
564 return exists $COMPEXCL{$code};
569 use UnicodeCD 'casefold';
571 my $casefold = casefold("09dc");
573 The casefold() returns the locale-independent case folding of the
574 character specified by a B<code point argument>.
576 If there is a case folding for that character, a reference to a hash
577 with the following fields is returned:
581 code code point with at least four hexdigits
582 status "C", "F", "S", or "I"
583 mapping one or more codes separated by spaces
585 The meaning of the I<status> is as follows:
587 C common case folding, common mappings shared
588 by both simple and full mappings
589 F full case folding, mappings that cause strings
590 to grow in length. Multiple characters are separated
592 S simple case folding, mappings to single characters
593 where different from F
594 I special case for dotted uppercase I and
596 - If this mapping is included, the result is
597 case-insensitive, but dotless and dotted I's
598 are not distinguished
599 - If this mapping is excluded, the result is not
600 fully case-insensitive, but dotless and dotted
601 I's are distinguished
603 If there is no case folding for that character, C<undef> is returned.
605 For more information about case mappings see
606 http://www.unicode.org/unicode/reports/tr21/
614 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
615 while (<$CASEFOLDFH>) {
616 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
618 $CASEFOLD{$code} = { code => $1,
630 my $code = _getcode($arg);
631 croak __PACKAGE__, "::casefold: unknown code '$arg'"
632 unless defined $code;
634 _casefold() unless %CASEFOLD;
636 return $CASEFOLD{$code};
641 use UnicodeCD 'casespec';
643 my $casespec = casespec("09dc");
645 The casespec() returns the potentially locale-dependent case mapping
646 of the character specified by a B<code point argument>. The mapping
647 may change the length of the string (which the basic Unicode case
648 mappings as returned by charinfo() never do).
650 If there is a case mapping for that character, a reference to a hash
651 with the following fields is returned:
655 code code point with at least four hexdigits
659 condition condition list (may be undef)
661 The C<condition> is optional. Where present, it consists of one or
662 more I<locales> or I<contexts>, separated by spaces (other than as
663 used to separate elements, spaces are to be ignored). A condition
664 list overrides the normal behavior if all of the listed conditions are
665 true. Case distinctions in the condition list are not significant.
666 Conditions preceded by "NON_" represent the negation of the condition.
668 A I<locale> is defined as a 2-letter ISO 639 language code, possibly
669 followed by a "_" and a 2-letter ISO 3166 country code (possibly followed
670 by a "_" and a variant code). For the lists of those codes,
671 see L<Locale::Language> and L<Locale::Country>.
673 A I<context> is one of the following choices:
675 FINAL The letter is not followed by a letter of
676 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
677 MODERN The mapping is only used for modern text
678 AFTER_i The last base character was "i" (U+0069)
680 For more information about case mappings see
681 http://www.unicode.org/unicode/reports/tr21/
689 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
690 while (<$CASESPECFH>) {
691 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
693 $CASESPEC{$code} = { code => $1,
707 my $code = _getcode($arg);
708 croak __PACKAGE__, "::casespec: unknown code '$arg'"
709 unless defined $code;
711 _casespec() unless %CASESPEC;
713 return $CASESPEC{$code};
716 =head2 UnicodeCD::UnicodeVersion
718 UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
719 Database, in other words, the version of the Unicode standard the
727 unless (defined $UNICODEVERSION) {
728 openunicode(\$VERSIONFH, "version");
729 chomp($UNICODEVERSION = <$VERSIONFH>);
731 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
732 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
734 return $UNICODEVERSION;
737 =head2 Implementation Note
739 The first use of charinfo() opens a read-only filehandle to the Unicode
740 Character Database (the database is included in the Perl distribution).
741 The filehandle is then kept open for further queries.