This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
installperl
[perl5.git] / lib / UnicodeCD.pm
CommitLineData
1189d1e4 1package UnicodeCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
10a6ecd2 6our $VERSION = '0.1';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
10a6ecd2
JH
11our @EXPORT_OK = qw(charinfo
12 charblock charscript
13 charblocks charscripts
b08cd201
JH
14 charinrange
15 compexcl
16 casefold casespec);
561c79ed
JH
17
18use Carp;
19
20=head1 NAME
21
1189d1e4 22UnicodeCD - Unicode character database
561c79ed
JH
23
24=head1 SYNOPSIS
25
1189d1e4 26 use UnicodeCD 'charinfo';
b08cd201 27 my $charinfo = charinfo($codepoint);
561c79ed 28
1189d1e4 29 use UnicodeCD 'charblock';
e882dd67
JH
30 my $charblock = charblock($codepoint);
31
1189d1e4 32 use UnicodeCD 'charscript';
e882dd67 33 my $charscript = charblock($codepoint);
561c79ed 34
e145285f
JH
35 use UnicodeCD 'charblocks';
36 my $charblocks = charblocks();
37
38 use UnicodeCD 'charscripts';
39 my %charscripts = charscripts();
40
41 use UnicodeCD qw(charscript charinrange);
42 my $range = charscript($script);
43 print "looks like $script\n" if charinrange($range, $codepoint);
44
45 use UnicodeCD 'compexcl';
46 my $compexcl = compexcl($codepoint);
47
48 my $unicode_version = UnicodeCD::UnicodeVersion();
49
561c79ed
JH
50=head1 DESCRIPTION
51
e145285f 52The UnicodeCD module offers a simple interface to the Unicode Character
561c79ed
JH
53Database.
54
55=cut
56
10a6ecd2
JH
57my $UNICODEFH;
58my $BLOCKSFH;
59my $SCRIPTSFH;
60my $VERSIONFH;
b08cd201
JH
61my $COMPEXCLFH;
62my $CASEFOLDFH;
63my $CASESPECFH;
561c79ed
JH
64
65sub openunicode {
66 my ($rfh, @path) = @_;
67 my $f;
68 unless (defined $$rfh) {
69 for my $d (@INC) {
70 use File::Spec;
71 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 72 last if open($$rfh, $f);
e882dd67 73 undef $f;
561c79ed 74 }
e882dd67
JH
75 croak __PACKAGE__, ": failed to find ",
76 File::Spec->catfile(@path), " in @INC"
77 unless defined $f;
561c79ed
JH
78 }
79 return $f;
80}
81
82=head2 charinfo
83
1189d1e4 84 use UnicodeCD 'charinfo';
561c79ed 85
b08cd201 86 my $charinfo = charinfo(0x41);
561c79ed 87
b08cd201
JH
88charinfo() returns a reference to a hash that has the following fields
89as defined by the Unicode standard:
561c79ed
JH
90
91 key
92
93 code code point with at least four hexdigits
94 name name of the character IN UPPER CASE
95 category general category of the character
96 combining classes used in the Canonical Ordering Algorithm
97 bidi bidirectional category
98 decomposition character decomposition mapping
99 decimal if decimal digit this is the integer numeric value
100 digit if digit this is the numeric value
101 numeric if numeric is the integer or rational numeric value
102 mirrored if mirrored in bidirectional text
103 unicode10 Unicode 1.0 name if existed and different
104 comment ISO 10646 comment field
105 upper uppercase equivalent mapping
106 lower lowercase equivalent mapping
107 title titlecase equivalent mapping
e882dd67 108
561c79ed 109 block block the character belongs to (used in \p{In...})
e882dd67 110 script script the character belongs to
561c79ed 111
b08cd201 112If no match is found, a reference to an empty hash is returned.
561c79ed 113
32c16050
JH
114The C<block> property is the same as as returned by charinfo(). It is
115not defined in the Unicode Character Database proper (Chapter 4 of the
116Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 117of TUS3). Similarly for the C<script> property.
32c16050
JH
118
119Note that you cannot do (de)composition and casing based solely on the
120above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 121you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
122
123=cut
124
10a6ecd2
JH
125sub _getcode {
126 my $arg = shift;
127
128 if ($arg =~ /^\d+$/) {
129 return $arg;
130 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
131 return hex($1);
132 }
133
134 return;
135}
136
561c79ed 137sub charinfo {
10a6ecd2
JH
138 my $arg = shift;
139 my $code = _getcode($arg);
140 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
141 unless defined $code;
561c79ed
JH
142 my $hexk = sprintf("%04X", $code);
143
10a6ecd2
JH
144 openunicode(\$UNICODEFH, "Unicode.txt");
145 if (defined $UNICODEFH) {
561c79ed 146 use Search::Dict;
10a6ecd2
JH
147 if (look($UNICODEFH, "$hexk;") >= 0) {
148 my $line = <$UNICODEFH>;
561c79ed
JH
149 chomp $line;
150 my %prop;
151 @prop{qw(
152 code name category
153 combining bidi decomposition
154 decimal digit numeric
155 mirrored unicode10 comment
156 upper lower title
157 )} = split(/;/, $line, -1);
158 if ($prop{code} eq $hexk) {
a196fbfd
JH
159 $prop{block} = charblock($code);
160 $prop{script} = charscript($code);
b08cd201 161 return \%prop;
561c79ed
JH
162 }
163 }
164 }
165 return;
166}
167
e882dd67
JH
168sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
169 my ($table, $lo, $hi, $code) = @_;
170
171 return if $lo > $hi;
172
173 my $mid = int(($lo+$hi) / 2);
174
175 if ($table->[$mid]->[0] < $code) {
10a6ecd2 176 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
177 return $table->[$mid]->[2];
178 } else {
179 _search($table, $mid + 1, $hi, $code);
180 }
181 } elsif ($table->[$mid]->[0] > $code) {
182 _search($table, $lo, $mid - 1, $code);
183 } else {
184 return $table->[$mid]->[2];
185 }
186}
187
10a6ecd2
JH
188sub charinrange {
189 my ($range, $arg) = @_;
190 my $code = _getcode($arg);
191 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
192 unless defined $code;
193 _search($range, 0, $#$range, $code);
194}
195
354a27bf 196=head2 charblock
561c79ed 197
1189d1e4 198 use UnicodeCD 'charblock';
561c79ed
JH
199
200 my $charblock = charblock(0x41);
10a6ecd2
JH
201 my $charblock = charblock(1234);
202 my $charblock = charblock("0x263a");
203 my $charblock = charblock("U+263a");
204
205 my $ranges = charblock('Armenian');
206
207With a B<code point argument> charblock() returns the block the character
208belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 209positions within all blocks are defined.
10a6ecd2
JH
210
211If supplied with an argument that can't be a code point, charblock()
212tries to do the opposite and interpret the argument as a character
213block. The return value is a I<range>: an anonymous list that
214contains anonymous lists, which in turn contain I<start-of-range>,
215I<end-of-range> code point pairs. You can test whether a code point
216is in a range using the L</charinrange> function. If the argument is
217not a known charater block, C<undef> is returned.
561c79ed 218
561c79ed
JH
219=cut
220
221my @BLOCKS;
10a6ecd2 222my %BLOCKS;
561c79ed 223
10a6ecd2 224sub _charblocks {
561c79ed 225 unless (@BLOCKS) {
10a6ecd2
JH
226 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
227 while (<$BLOCKSFH>) {
2796c109 228 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
229 my ($lo, $hi) = (hex($1), hex($2));
230 my $subrange = [ $lo, $hi, $3 ];
231 push @BLOCKS, $subrange;
232 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
233 }
234 }
10a6ecd2 235 close($BLOCKSFH);
561c79ed
JH
236 }
237 }
10a6ecd2
JH
238}
239
240sub charblock {
241 my $arg = shift;
242
243 _charblocks() unless @BLOCKS;
244
245 my $code = _getcode($arg);
561c79ed 246
10a6ecd2
JH
247 if (defined $code) {
248 _search(\@BLOCKS, 0, $#BLOCKS, $code);
249 } else {
250 if (exists $BLOCKS{$arg}) {
251 return $BLOCKS{$arg};
252 } else {
253 return;
254 }
255 }
e882dd67
JH
256}
257
258=head2 charscript
259
1189d1e4 260 use UnicodeCD 'charscript';
e882dd67
JH
261
262 my $charscript = charscript(0x41);
10a6ecd2
JH
263 my $charscript = charscript(1234);
264 my $charscript = charscript("U+263a");
e882dd67 265
10a6ecd2
JH
266 my $ranges = charscript('Thai');
267
268With a B<code point argument> charscript() returns the script the
b08cd201 269character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2
JH
270
271If supplied with an argument that can't be a code point, charscript()
272tries to do the opposite and interpret the argument as a character
273script. The return value is a I<range>: an anonymous list that
274contains anonymous lists, which in turn contain I<start-of-range>,
275I<end-of-range> code point pairs. You can test whether a code point
276is in a range using the L</charinrange> function. If the argument is
277not a known charater script, C<undef> is returned.
e882dd67 278
e882dd67
JH
279=cut
280
281my @SCRIPTS;
10a6ecd2 282my %SCRIPTS;
e882dd67 283
10a6ecd2 284sub _charscripts {
e882dd67 285 unless (@SCRIPTS) {
10a6ecd2
JH
286 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
287 while (<$SCRIPTSFH>) {
e882dd67 288 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
289 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
290 my $script = lc($3);
291 $script =~ s/\b(\w)/uc($1)/ge;
292 my $subrange = [ $lo, $hi, $script ];
293 push @SCRIPTS, $subrange;
294 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
295 }
296 }
10a6ecd2 297 close($SCRIPTSFH);
e882dd67
JH
298 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
299 }
300 }
10a6ecd2
JH
301}
302
303sub charscript {
304 my $arg = shift;
305
306 _charscripts() unless @SCRIPTS;
e882dd67 307
10a6ecd2
JH
308 my $code = _getcode($arg);
309
310 if (defined $code) {
311 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
312 } else {
313 if (exists $SCRIPTS{$arg}) {
314 return $SCRIPTS{$arg};
315 } else {
316 return;
317 }
318 }
319}
320
321=head2 charblocks
322
1189d1e4 323 use UnicodeCD 'charblocks';
10a6ecd2 324
b08cd201 325 my $charblocks = charblocks();
10a6ecd2 326
b08cd201
JH
327charblocks() returns a reference to a hash with the known block names
328as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2
JH
329
330=cut
331
332sub charblocks {
b08cd201
JH
333 _charblocks() unless %BLOCKS;
334 return \%BLOCKS;
10a6ecd2
JH
335}
336
337=head2 charscripts
338
1189d1e4 339 use UnicodeCD 'charscripts';
10a6ecd2
JH
340
341 my %charscripts = charscripts();
342
343charscripts() returns a hash with the known script names as the keys,
344and the code point ranges (see L</charscript>) as the values.
345
346=cut
347
348sub charscripts {
b08cd201
JH
349 _charscripts() unless %SCRIPTS;
350 return \%SCRIPTS;
561c79ed
JH
351}
352
10a6ecd2 353=head2 Blocks versus Scripts
ad9cab37 354
10a6ecd2
JH
355The difference between a block and a script is that scripts are closer
356to the linguistic notion of a set of characters required to present
357languages, while block is more of an artifact of the Unicode character
358numbering and separation into blocks of 256 characters.
3aa957f9
JH
359
360For example the Latin B<script> is spread over several B<blocks>, such
361as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
362C<Latin Extended-B>. On the other hand, the Latin script does not
363contain all the characters of the C<Basic Latin> block (also known as
364the ASCII): it includes only the letters, not for example the digits
365or the punctuation.
ad9cab37 366
3aa957f9 367For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
368
369For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
370
3aa957f9
JH
371=head2 Matching Scripts and Blocks
372
373Both scripts and blocks can be matched using the regular expression
374construct C<\p{In...}> and its negation C<\P{In...}>.
375
376The name of the script or the block comes after the C<In>, for example
377C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2
JH
378removed from the names for the C<\p{In...}>, for example
379C<LatinExtendedA> instead of C<Latin Extended-A>.
380
381There are a few cases where there exists both a script and a block by
382the same name, in these cases the block version has C<Block> appended:
383C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
384
b08cd201
JH
385=head2 Code Point Arguments
386
387A <code point argument> is either a decimal or a hexadecimal scalar,
388or "U+" followed by hexadecimals.
389
10a6ecd2
JH
390=head2 charinrange
391
392In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
393can also test whether a code point is in the I<range> as returned by
394L</charblock> and L</charscript> or as the values of the hash returned
e618509d 395by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 396
1189d1e4 397 use UnicodeCD qw(charscript charinrange);
10a6ecd2
JH
398
399 $range = charscript('Hiragana');
e145285f 400 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
401
402=cut
403
b08cd201
JH
404=head2 compexcl
405
1189d1e4 406 use UnicodeCD 'compexcl';
b08cd201
JH
407
408 my $compexcl = compexcl("09dc");
409
410The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
411character should not be produced during a precomposition) of the
412character specified by a B<code point argument>.
b08cd201
JH
413
414If there is a composition exclusion for the character, true is
415returned. Otherwise, false is returned.
416
417=cut
418
419my %COMPEXCL;
420
421sub _compexcl {
422 unless (%COMPEXCL) {
423 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
424 while (<$COMPEXCLFH>) {
425 if (/^([0-9A-F]+) \# /) {
426 my $code = hex($1);
427 $COMPEXCL{$code} = undef;
428 }
429 }
430 close($COMPEXCLFH);
431 }
432 }
433}
434
435sub compexcl {
436 my $arg = shift;
437 my $code = _getcode($arg);
438
439 _compexcl() unless %COMPEXCL;
440
441 return exists $COMPEXCL{$code};
442}
443
444=head2 casefold
445
1189d1e4 446 use UnicodeCD 'casefold';
b08cd201
JH
447
448 my %casefold = casefold("09dc");
449
450The casefold() returns the locale-independent case folding of the
451character specified by a B<code point argument>.
452
453If there is a case folding for that character, a reference to a hash
454with the following fields is returned:
455
456 key
457
458 code code point with at least four hexdigits
459 status "C", "F", "S", or "I"
460 mapping one or more codes separated by spaces
461
462The meaning of the I<status> is as follows:
463
464 C common case folding, common mappings shared
465 by both simple and full mappings
466 F full case folding, mappings that cause strings
467 to grow in length. Multiple characters are separated
468 by spaces
469 S simple case folding, mappings to single characters
470 where different from F
471 I special case for dotted uppercase I and
472 dotless lowercase i
473 - If this mapping is included, the result is
474 case-insensitive, but dotless and dotted I's
475 are not distinguished
476 - If this mapping is excluded, the result is not
477 fully case-insensitive, but dotless and dotted
478 I's are distinguished
479
480If there is no case folding for that character, C<undef> is returned.
481
482For more information about case mappings see
483http://www.unicode.org/unicode/reports/tr21/
484
485=cut
486
487my %CASEFOLD;
488
489sub _casefold {
490 unless (%CASEFOLD) {
491 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
492 while (<$CASEFOLDFH>) {
493 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
494 my $code = hex($1);
495 $CASEFOLD{$code} = { code => $1,
496 status => $2,
497 mapping => $3 };
498 }
499 }
500 close($CASEFOLDFH);
501 }
502 }
503}
504
505sub casefold {
506 my $arg = shift;
507 my $code = _getcode($arg);
508
509 _casefold() unless %CASEFOLD;
510
511 return $CASEFOLD{$code};
512}
513
514=head2 casespec
515
1189d1e4 516 use UnicodeCD 'casespec';
b08cd201
JH
517
518 my %casespec = casespec("09dc");
519
520The casespec() returns the potentially locale-dependent case mapping
521of the character specified by a B<code point argument>. The mapping
522may change the length of the string (which the basic Unicode case
523mappings as returned by charinfo() never do).
524
525If there is a case folding for that character, a reference to a hash
526with the following fields is returned:
527
528 key
529
530 code code point with at least four hexdigits
531 lower lowercase
532 title titlecase
533 upper uppercase
534 condition condition list (may be undef)
535
536The C<condition> is optional. Where present, it consists of one or
537more I<locales> or I<contexts>, separated by spaces (other than as
538used to separate elements, spaces are to be ignored). A condition
539list overrides the normal behavior if all of the listed conditions are
540true. Case distinctions in the condition list are not significant.
541Conditions preceded by "NON_" represent the negation of the condition
542
543A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
544followed by a "_" and a 2-letter ISO language code (possibly followed
545by a "_" and a variant code). You can find the lists of those codes,
546see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
547
548A I<context> is one of the following choices:
549
550 FINAL The letter is not followed by a letter of
551 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
552 MODERN The mapping is only used for modern text
e618509d 553 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
554
555For more information about case mappings see
556http://www.unicode.org/unicode/reports/tr21/
557
558=cut
559
560my %CASESPEC;
561
562sub _casespec {
563 unless (%CASESPEC) {
564 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
565 while (<$CASESPECFH>) {
566 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
567 my $code = hex($1);
568 $CASESPEC{$code} = { code => $1,
569 lower => $2,
570 title => $3,
571 upper => $4,
572 condition => $5 };
573 }
574 }
575 close($CASESPECFH);
576 }
577 }
578}
579
580sub casespec {
581 my $arg = shift;
582 my $code = _getcode($arg);
583
584 _casespec() unless %CASESPEC;
585
586 return $CASESPEC{$code};
587}
588
1189d1e4 589=head2 UnicodeCD::UnicodeVersion
10a6ecd2 590
1189d1e4 591UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
10a6ecd2
JH
592Database, in other words, the version of the Unicode standard the
593database implements.
594
595=cut
596
597my $UNICODEVERSION;
598
599sub UnicodeVersion {
600 unless (defined $UNICODEVERSION) {
601 openunicode(\$VERSIONFH, "version");
602 chomp($UNICODEVERSION = <$VERSIONFH>);
603 close($VERSIONFH);
604 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
605 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
606 }
607 return $UNICODEVERSION;
608}
3aa957f9
JH
609
610=head2 Implementation Note
32c16050 611
ad9cab37
JH
612The first use of charinfo() opens a read-only filehandle to the Unicode
613Character Database (the database is included in the Perl distribution).
614The filehandle is then kept open for further queries.
32c16050 615
561c79ed
JH
616=head1 AUTHOR
617
618Jarkko Hietaniemi
619
620=cut
621
6221;