This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
whether DEPTH gets dumped in formats varies within 5.10.0
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
324f9e44 6our $VERSION = '0.26';
561c79ed 7
741297c1
JH
8use Storable qw(dclone);
9
561c79ed
JH
10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2
JH
14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201 17 charinrange
ea508aee 18 general_categories bidi_types
b08cd201 19 compexcl
a2bd7410
JH
20 casefold casespec
21 namedseq);
561c79ed
JH
22
23use Carp;
24
25=head1 NAME
26
55d7b906 27Unicode::UCD - Unicode character database
561c79ed
JH
28
29=head1 SYNOPSIS
30
55d7b906 31 use Unicode::UCD 'charinfo';
b08cd201 32 my $charinfo = charinfo($codepoint);
561c79ed 33
55d7b906 34 use Unicode::UCD 'charblock';
e882dd67
JH
35 my $charblock = charblock($codepoint);
36
55d7b906 37 use Unicode::UCD 'charscript';
65044554 38 my $charscript = charscript($codepoint);
561c79ed 39
55d7b906 40 use Unicode::UCD 'charblocks';
e145285f
JH
41 my $charblocks = charblocks();
42
55d7b906 43 use Unicode::UCD 'charscripts';
ea508aee 44 my $charscripts = charscripts();
e145285f 45
55d7b906 46 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
47 my $range = charscript($script);
48 print "looks like $script\n" if charinrange($range, $codepoint);
49
ea508aee
JH
50 use Unicode::UCD qw(general_categories bidi_types);
51 my $categories = general_categories();
52 my $types = bidi_types();
53
55d7b906 54 use Unicode::UCD 'compexcl';
e145285f
JH
55 my $compexcl = compexcl($codepoint);
56
a2bd7410
JH
57 use Unicode::UCD 'namedseq';
58 my $namedseq = namedseq($named_sequence_name);
59
55d7b906 60 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 61
561c79ed
JH
62=head1 DESCRIPTION
63
8b731da2
JH
64The Unicode::UCD module offers a simple interface to the Unicode
65Character Database.
561c79ed
JH
66
67=cut
68
10a6ecd2
JH
69my $UNICODEFH;
70my $BLOCKSFH;
71my $SCRIPTSFH;
72my $VERSIONFH;
b08cd201
JH
73my $COMPEXCLFH;
74my $CASEFOLDFH;
75my $CASESPECFH;
a2bd7410 76my $NAMEDSEQFH;
561c79ed
JH
77
78sub openunicode {
79 my ($rfh, @path) = @_;
80 my $f;
81 unless (defined $$rfh) {
82 for my $d (@INC) {
83 use File::Spec;
55d7b906 84 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 85 last if open($$rfh, $f);
e882dd67 86 undef $f;
561c79ed 87 }
e882dd67
JH
88 croak __PACKAGE__, ": failed to find ",
89 File::Spec->catfile(@path), " in @INC"
90 unless defined $f;
561c79ed
JH
91 }
92 return $f;
93}
94
95=head2 charinfo
96
55d7b906 97 use Unicode::UCD 'charinfo';
561c79ed 98
b08cd201 99 my $charinfo = charinfo(0x41);
561c79ed 100
b08cd201
JH
101charinfo() returns a reference to a hash that has the following fields
102as defined by the Unicode standard:
561c79ed
JH
103
104 key
105
106 code code point with at least four hexdigits
107 name name of the character IN UPPER CASE
108 category general category of the character
109 combining classes used in the Canonical Ordering Algorithm
ea508aee 110 bidi bidirectional type
561c79ed
JH
111 decomposition character decomposition mapping
112 decimal if decimal digit this is the integer numeric value
113 digit if digit this is the numeric value
114 numeric if numeric is the integer or rational numeric value
115 mirrored if mirrored in bidirectional text
116 unicode10 Unicode 1.0 name if existed and different
117 comment ISO 10646 comment field
118 upper uppercase equivalent mapping
119 lower lowercase equivalent mapping
120 title titlecase equivalent mapping
e882dd67 121
561c79ed 122 block block the character belongs to (used in \p{In...})
eb0cc9e3 123 script script the character belongs to
561c79ed 124
b08cd201 125If no match is found, a reference to an empty hash is returned.
561c79ed 126
d1be9408 127The C<block> property is the same as returned by charinfo(). It is
32c16050 128not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2
JH
129Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
130(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050
JH
131
132Note that you cannot do (de)composition and casing based solely on the
133above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 134you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
135
136=cut
137
0616d9cf 138# NB: This function is duplicated in charnames.pm
10a6ecd2
JH
139sub _getcode {
140 my $arg = shift;
141
dc0a4417 142 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 143 return $arg;
dc0a4417 144 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
145 return hex($1);
146 }
147
148 return;
149}
150
ac5ea531
JH
151# Lingua::KO::Hangul::Util not part of the standard distribution
152# but it will be used if available.
153
154eval { require Lingua::KO::Hangul::Util };
155my $hasHangulUtil = ! $@;
156if ($hasHangulUtil) {
157 Lingua::KO::Hangul::Util->import();
158}
9087a70b
ST
159
160sub hangul_decomp { # internal: called from charinfo
ac5ea531
JH
161 if ($hasHangulUtil) {
162 my @tmp = decomposeHangul(shift);
163 return sprintf("%04X %04X", @tmp) if @tmp == 2;
164 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
165 }
166 return;
167}
168
169sub hangul_charname { # internal: called from charinfo
170 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b
ST
171}
172
9087a70b
ST
173sub han_charname { # internal: called from charinfo
174 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b
ST
175}
176
324f9e44
RB
177my %first_last = (
178 'CJK Ideograph Extension A' => [ 0x3400, 0x4DB5 ],
179 'CJK Ideograph' => [ 0x4E00, 0x9FA5 ],
180 'CJK Ideograph Extension B' => [ 0x20000, 0x2A6D6 ],
181);
182
183get_charinfo_ranges();
184
185sub get_charinfo_ranges {
186 my @blocks = keys %first_last;
187
188 my $fh;
189 openunicode( \$fh, 'UnicodeData.txt' );
190 if( defined $fh ){
191 while( my $line = <$fh> ){
192 next unless $line =~ /(?:First|Last)/;
193 if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
194 my ($number,$block,$type);
195 ($number,$block) = split /;/, $line;
196 $block =~ s/<|>//g;
197 ($block,$type) = split /, /, $block;
198 my $index = $type eq 'First' ? 0 : 1;
199 $first_last{ $block }->[$index] = hex $number;
200 }
201 }
202 }
203}
204
a6fa416b
ST
205my @CharinfoRanges = (
206# block name
207# [ first, last, coderef to name, coderef to decompose ],
208# CJK Ideographs Extension A
324f9e44 209 [ @{ $first_last{'CJK Ideograph Extension A'} }, \&han_charname, undef ],
a6fa416b 210# CJK Ideographs
324f9e44 211 [ @{ $first_last{'CJK Ideograph'} }, \&han_charname, undef ],
a6fa416b 212# Hangul Syllables
ac5ea531 213 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b
ST
214# Non-Private Use High Surrogates
215 [ 0xD800, 0xDB7F, undef, undef ],
216# Private Use High Surrogates
217 [ 0xDB80, 0xDBFF, undef, undef ],
218# Low Surrogates
219 [ 0xDC00, 0xDFFF, undef, undef ],
220# The Private Use Area
221 [ 0xE000, 0xF8FF, undef, undef ],
222# CJK Ideographs Extension B
324f9e44 223 [ @{ $first_last{'CJK Ideograph Extension B'} }, \&han_charname, undef ],
a6fa416b
ST
224# Plane 15 Private Use Area
225 [ 0xF0000, 0xFFFFD, undef, undef ],
226# Plane 16 Private Use Area
227 [ 0x100000, 0x10FFFD, undef, undef ],
228);
229
561c79ed 230sub charinfo {
10a6ecd2
JH
231 my $arg = shift;
232 my $code = _getcode($arg);
233 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
234 unless defined $code;
e63dbbf9 235 my $hexk = sprintf("%06X", $code);
a6fa416b
ST
236 my($rcode,$rname,$rdec);
237 foreach my $range (@CharinfoRanges){
74f8133e 238 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 239 $rcode = $hexk;
e63dbbf9
JH
240 $rcode =~ s/^0+//;
241 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
ST
242 $rname = $range->[2] ? $range->[2]->($code) : '';
243 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 244 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
ST
245 last;
246 }
247 }
551b6b6f 248 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 249 if (defined $UNICODEFH) {
e63dbbf9
JH
250 use Search::Dict 1.02;
251 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 252 my $line = <$UNICODEFH>;
c5a29f40 253 return unless defined $line;
561c79ed
JH
254 chomp $line;
255 my %prop;
256 @prop{qw(
257 code name category
258 combining bidi decomposition
259 decimal digit numeric
260 mirrored unicode10 comment
261 upper lower title
262 )} = split(/;/, $line, -1);
e63dbbf9
JH
263 $hexk =~ s/^0+//;
264 $hexk = sprintf("%04X", hex($hexk));
561c79ed 265 if ($prop{code} eq $hexk) {
a196fbfd
JH
266 $prop{block} = charblock($code);
267 $prop{script} = charscript($code);
a6fa416b
ST
268 if(defined $rname){
269 $prop{code} = $rcode;
270 $prop{name} = $rname;
271 $prop{decomposition} = $rdec;
272 }
b08cd201 273 return \%prop;
561c79ed
JH
274 }
275 }
276 }
277 return;
278}
279
e882dd67
JH
280sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
281 my ($table, $lo, $hi, $code) = @_;
282
283 return if $lo > $hi;
284
285 my $mid = int(($lo+$hi) / 2);
286
287 if ($table->[$mid]->[0] < $code) {
10a6ecd2 288 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
289 return $table->[$mid]->[2];
290 } else {
291 _search($table, $mid + 1, $hi, $code);
292 }
293 } elsif ($table->[$mid]->[0] > $code) {
294 _search($table, $lo, $mid - 1, $code);
295 } else {
296 return $table->[$mid]->[2];
297 }
298}
299
10a6ecd2
JH
300sub charinrange {
301 my ($range, $arg) = @_;
302 my $code = _getcode($arg);
303 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
304 unless defined $code;
305 _search($range, 0, $#$range, $code);
306}
307
354a27bf 308=head2 charblock
561c79ed 309
55d7b906 310 use Unicode::UCD 'charblock';
561c79ed
JH
311
312 my $charblock = charblock(0x41);
10a6ecd2
JH
313 my $charblock = charblock(1234);
314 my $charblock = charblock("0x263a");
315 my $charblock = charblock("U+263a");
316
78bf21c2 317 my $range = charblock('Armenian');
10a6ecd2 318
78bf21c2 319With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 320belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 321positions within all blocks are defined.
10a6ecd2 322
78bf21c2
JH
323See also L</Blocks versus Scripts>.
324
eb0cc9e3
JH
325If supplied with an argument that can't be a code point, charblock() tries
326to do the opposite and interpret the argument as a character block. The
327return value is a I<range>: an anonymous list of lists that contain
a2bd7410
JH
328I<start-of-range>, I<end-of-range> code point pairs. You can test whether
329a code point is in a range using the L</charinrange> function. If the
3c4b39be 330argument is not a known character block, C<undef> is returned.
561c79ed 331
561c79ed
JH
332=cut
333
334my @BLOCKS;
10a6ecd2 335my %BLOCKS;
561c79ed 336
10a6ecd2 337sub _charblocks {
561c79ed 338 unless (@BLOCKS) {
10a6ecd2 339 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 340 local $_;
10a6ecd2 341 while (<$BLOCKSFH>) {
2796c109 342 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
343 my ($lo, $hi) = (hex($1), hex($2));
344 my $subrange = [ $lo, $hi, $3 ];
345 push @BLOCKS, $subrange;
346 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
347 }
348 }
10a6ecd2 349 close($BLOCKSFH);
561c79ed
JH
350 }
351 }
10a6ecd2
JH
352}
353
354sub charblock {
355 my $arg = shift;
356
357 _charblocks() unless @BLOCKS;
358
359 my $code = _getcode($arg);
561c79ed 360
10a6ecd2
JH
361 if (defined $code) {
362 _search(\@BLOCKS, 0, $#BLOCKS, $code);
363 } else {
364 if (exists $BLOCKS{$arg}) {
741297c1 365 return dclone $BLOCKS{$arg};
10a6ecd2
JH
366 } else {
367 return;
368 }
369 }
e882dd67
JH
370}
371
372=head2 charscript
373
55d7b906 374 use Unicode::UCD 'charscript';
e882dd67
JH
375
376 my $charscript = charscript(0x41);
10a6ecd2
JH
377 my $charscript = charscript(1234);
378 my $charscript = charscript("U+263a");
e882dd67 379
78bf21c2 380 my $range = charscript('Thai');
10a6ecd2 381
78bf21c2 382With a B<code point argument> charscript() returns the I<script> the
b08cd201 383character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 384
78bf21c2
JH
385See also L</Blocks versus Scripts>.
386
eb0cc9e3
JH
387If supplied with an argument that can't be a code point, charscript() tries
388to do the opposite and interpret the argument as a character script. The
389return value is a I<range>: an anonymous list of lists that contain
390I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
391code point is in a range using the L</charinrange> function. If the
3c4b39be 392argument is not a known character script, C<undef> is returned.
e882dd67 393
e882dd67
JH
394=cut
395
396my @SCRIPTS;
10a6ecd2 397my %SCRIPTS;
e882dd67 398
10a6ecd2 399sub _charscripts {
e882dd67 400 unless (@SCRIPTS) {
10a6ecd2 401 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 402 local $_;
10a6ecd2 403 while (<$SCRIPTSFH>) {
e882dd67 404 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
405 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
406 my $script = lc($3);
407 $script =~ s/\b(\w)/uc($1)/ge;
408 my $subrange = [ $lo, $hi, $script ];
409 push @SCRIPTS, $subrange;
410 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
411 }
412 }
10a6ecd2 413 close($SCRIPTSFH);
e882dd67
JH
414 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
415 }
416 }
10a6ecd2
JH
417}
418
419sub charscript {
420 my $arg = shift;
421
422 _charscripts() unless @SCRIPTS;
e882dd67 423
10a6ecd2
JH
424 my $code = _getcode($arg);
425
426 if (defined $code) {
427 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
428 } else {
429 if (exists $SCRIPTS{$arg}) {
741297c1 430 return dclone $SCRIPTS{$arg};
10a6ecd2
JH
431 } else {
432 return;
433 }
434 }
435}
436
437=head2 charblocks
438
55d7b906 439 use Unicode::UCD 'charblocks';
10a6ecd2 440
b08cd201 441 my $charblocks = charblocks();
10a6ecd2 442
b08cd201
JH
443charblocks() returns a reference to a hash with the known block names
444as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 445
78bf21c2
JH
446See also L</Blocks versus Scripts>.
447
10a6ecd2
JH
448=cut
449
450sub charblocks {
b08cd201 451 _charblocks() unless %BLOCKS;
741297c1 452 return dclone \%BLOCKS;
10a6ecd2
JH
453}
454
455=head2 charscripts
456
55d7b906 457 use Unicode::UCD 'charscripts';
10a6ecd2 458
ea508aee 459 my $charscripts = charscripts();
10a6ecd2 460
ea508aee
JH
461charscripts() returns a reference to a hash with the known script
462names as the keys, and the code point ranges (see L</charscript>) as
463the values.
10a6ecd2 464
78bf21c2
JH
465See also L</Blocks versus Scripts>.
466
10a6ecd2
JH
467=cut
468
469sub charscripts {
b08cd201 470 _charscripts() unless %SCRIPTS;
741297c1 471 return dclone \%SCRIPTS;
561c79ed
JH
472}
473
10a6ecd2 474=head2 Blocks versus Scripts
ad9cab37 475
10a6ecd2
JH
476The difference between a block and a script is that scripts are closer
477to the linguistic notion of a set of characters required to present
478languages, while block is more of an artifact of the Unicode character
eb0cc9e3 479numbering and separation into blocks of (mostly) 256 characters.
3aa957f9
JH
480
481For example the Latin B<script> is spread over several B<blocks>, such
482as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
483C<Latin Extended-B>. On the other hand, the Latin script does not
484contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 485the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 486or the punctuation.
ad9cab37 487
3aa957f9 488For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
489
490For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
491
3aa957f9
JH
492=head2 Matching Scripts and Blocks
493
eb0cc9e3
JH
494Scripts are matched with the regular-expression construct
495C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
496while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
497any of the 256 code points in the Tibetan block).
10a6ecd2 498
b08cd201
JH
499=head2 Code Point Arguments
500
92e830a9
JH
501A I<code point argument> is either a decimal or a hexadecimal scalar
502designating a Unicode character, or C<U+> followed by hexadecimals
dc0a4417
JH
503designating a Unicode character. In other words, if you want a code
504point to be interpreted as a hexadecimal number, you must prefix it
43adb1d9 505with either C<0x> or C<U+>, because a string like e.g. C<123> will
dc0a4417
JH
506be interpreted as a decimal code point. Also note that Unicode is
507B<not> limited to 16 bits (the number of Unicode characters is
508open-ended, in theory unlimited): you may have more than 4 hexdigits.
b08cd201 509
10a6ecd2
JH
510=head2 charinrange
511
512In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
513can also test whether a code point is in the I<range> as returned by
514L</charblock> and L</charscript> or as the values of the hash returned
e618509d 515by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 516
55d7b906 517 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
518
519 $range = charscript('Hiragana');
e145285f 520 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
521
522=cut
523
ea508aee
JH
524my %GENERAL_CATEGORIES =
525 (
526 'L' => 'Letter',
527 'LC' => 'CasedLetter',
528 'Lu' => 'UppercaseLetter',
529 'Ll' => 'LowercaseLetter',
530 'Lt' => 'TitlecaseLetter',
531 'Lm' => 'ModifierLetter',
532 'Lo' => 'OtherLetter',
533 'M' => 'Mark',
534 'Mn' => 'NonspacingMark',
535 'Mc' => 'SpacingMark',
536 'Me' => 'EnclosingMark',
537 'N' => 'Number',
538 'Nd' => 'DecimalNumber',
539 'Nl' => 'LetterNumber',
540 'No' => 'OtherNumber',
541 'P' => 'Punctuation',
542 'Pc' => 'ConnectorPunctuation',
543 'Pd' => 'DashPunctuation',
544 'Ps' => 'OpenPunctuation',
545 'Pe' => 'ClosePunctuation',
546 'Pi' => 'InitialPunctuation',
547 'Pf' => 'FinalPunctuation',
548 'Po' => 'OtherPunctuation',
549 'S' => 'Symbol',
550 'Sm' => 'MathSymbol',
551 'Sc' => 'CurrencySymbol',
552 'Sk' => 'ModifierSymbol',
553 'So' => 'OtherSymbol',
554 'Z' => 'Separator',
555 'Zs' => 'SpaceSeparator',
556 'Zl' => 'LineSeparator',
557 'Zp' => 'ParagraphSeparator',
558 'C' => 'Other',
559 'Cc' => 'Control',
560 'Cf' => 'Format',
561 'Cs' => 'Surrogate',
562 'Co' => 'PrivateUse',
563 'Cn' => 'Unassigned',
564 );
565
566sub general_categories {
567 return dclone \%GENERAL_CATEGORIES;
568}
569
570=head2 general_categories
571
572 use Unicode::UCD 'general_categories';
573
574 my $categories = general_categories();
575
576The general_categories() returns a reference to a hash which has short
577general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
578names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
579C<Symbol>) as values. The hash is reversible in case you need to go
580from the long names to the short names. The general category is the
581one returned from charinfo() under the C<category> key.
582
583=cut
584
585my %BIDI_TYPES =
586 (
587 'L' => 'Left-to-Right',
588 'LRE' => 'Left-to-Right Embedding',
589 'LRO' => 'Left-to-Right Override',
590 'R' => 'Right-to-Left',
591 'AL' => 'Right-to-Left Arabic',
592 'RLE' => 'Right-to-Left Embedding',
593 'RLO' => 'Right-to-Left Override',
594 'PDF' => 'Pop Directional Format',
595 'EN' => 'European Number',
596 'ES' => 'European Number Separator',
597 'ET' => 'European Number Terminator',
598 'AN' => 'Arabic Number',
599 'CS' => 'Common Number Separator',
600 'NSM' => 'Non-Spacing Mark',
601 'BN' => 'Boundary Neutral',
602 'B' => 'Paragraph Separator',
603 'S' => 'Segment Separator',
604 'WS' => 'Whitespace',
605 'ON' => 'Other Neutrals',
606 );
607
608sub bidi_types {
609 return dclone \%BIDI_TYPES;
610}
611
612=head2 bidi_types
613
614 use Unicode::UCD 'bidi_types';
615
616 my $categories = bidi_types();
617
618The bidi_types() returns a reference to a hash which has the short
619bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
620names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
621hash is reversible in case you need to go from the long names to the
622short names. The bidi type is the one returned from charinfo()
623under the C<bidi> key. For the exact meaning of the various bidi classes
624the Unicode TR9 is recommended reading:
625http://www.unicode.org/reports/tr9/tr9-17.html
626(as of Unicode 5.0.0)
627
628=cut
629
b08cd201
JH
630=head2 compexcl
631
55d7b906 632 use Unicode::UCD 'compexcl';
b08cd201
JH
633
634 my $compexcl = compexcl("09dc");
635
636The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
637character should not be produced during a precomposition) of the
638character specified by a B<code point argument>.
b08cd201
JH
639
640If there is a composition exclusion for the character, true is
641returned. Otherwise, false is returned.
642
643=cut
644
645my %COMPEXCL;
646
647sub _compexcl {
648 unless (%COMPEXCL) {
551b6b6f 649 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 650 local $_;
b08cd201 651 while (<$COMPEXCLFH>) {
822ebcc8 652 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201
JH
653 my $code = hex($1);
654 $COMPEXCL{$code} = undef;
655 }
656 }
657 close($COMPEXCLFH);
658 }
659 }
660}
661
662sub compexcl {
663 my $arg = shift;
664 my $code = _getcode($arg);
74f8133e
JH
665 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
666 unless defined $code;
b08cd201
JH
667
668 _compexcl() unless %COMPEXCL;
669
670 return exists $COMPEXCL{$code};
671}
672
673=head2 casefold
674
55d7b906 675 use Unicode::UCD 'casefold';
b08cd201 676
82c0b05b 677 my $casefold = casefold("00DF");
b08cd201
JH
678
679The casefold() returns the locale-independent case folding of the
680character specified by a B<code point argument>.
681
682If there is a case folding for that character, a reference to a hash
683with the following fields is returned:
684
685 key
686
687 code code point with at least four hexdigits
688 status "C", "F", "S", or "I"
689 mapping one or more codes separated by spaces
690
691The meaning of the I<status> is as follows:
692
693 C common case folding, common mappings shared
694 by both simple and full mappings
695 F full case folding, mappings that cause strings
696 to grow in length. Multiple characters are separated
697 by spaces
698 S simple case folding, mappings to single characters
699 where different from F
700 I special case for dotted uppercase I and
701 dotless lowercase i
702 - If this mapping is included, the result is
703 case-insensitive, but dotless and dotted I's
704 are not distinguished
705 - If this mapping is excluded, the result is not
706 fully case-insensitive, but dotless and dotted
707 I's are distinguished
708
709If there is no case folding for that character, C<undef> is returned.
710
711For more information about case mappings see
712http://www.unicode.org/unicode/reports/tr21/
713
714=cut
715
716my %CASEFOLD;
717
718sub _casefold {
719 unless (%CASEFOLD) {
551b6b6f 720 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 721 local $_;
b08cd201
JH
722 while (<$CASEFOLDFH>) {
723 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
724 my $code = hex($1);
725 $CASEFOLD{$code} = { code => $1,
726 status => $2,
727 mapping => $3 };
728 }
729 }
730 close($CASEFOLDFH);
731 }
732 }
733}
734
735sub casefold {
736 my $arg = shift;
737 my $code = _getcode($arg);
74f8133e
JH
738 croak __PACKAGE__, "::casefold: unknown code '$arg'"
739 unless defined $code;
b08cd201
JH
740
741 _casefold() unless %CASEFOLD;
742
743 return $CASEFOLD{$code};
744}
745
746=head2 casespec
747
55d7b906 748 use Unicode::UCD 'casespec';
b08cd201 749
82c0b05b 750 my $casespec = casespec("FB00");
b08cd201
JH
751
752The casespec() returns the potentially locale-dependent case mapping
753of the character specified by a B<code point argument>. The mapping
754may change the length of the string (which the basic Unicode case
755mappings as returned by charinfo() never do).
756
757If there is a case folding for that character, a reference to a hash
758with the following fields is returned:
759
760 key
761
762 code code point with at least four hexdigits
763 lower lowercase
764 title titlecase
765 upper uppercase
766 condition condition list (may be undef)
767
768The C<condition> is optional. Where present, it consists of one or
769more I<locales> or I<contexts>, separated by spaces (other than as
770used to separate elements, spaces are to be ignored). A condition
771list overrides the normal behavior if all of the listed conditions are
772true. Case distinctions in the condition list are not significant.
82c0b05b 773Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 774
f499c386
JH
775Note that when there are multiple case folding definitions for a
776single code point because of different locales, the value returned by
777casespec() is a hash reference which has the locales as the keys and
778hash references as described above as the values.
779
b08cd201 780A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
781followed by a "_" and a 2-letter ISO language code (possibly followed
782by a "_" and a variant code). You can find the lists of those codes,
783see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
784
785A I<context> is one of the following choices:
786
787 FINAL The letter is not followed by a letter of
788 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
789 MODERN The mapping is only used for modern text
e618509d 790 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
791
792For more information about case mappings see
793http://www.unicode.org/unicode/reports/tr21/
794
795=cut
796
797my %CASESPEC;
798
799sub _casespec {
800 unless (%CASESPEC) {
551b6b6f 801 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 802 local $_;
b08cd201
JH
803 while (<$CASESPECFH>) {
804 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
805 my ($hexcode, $lower, $title, $upper, $condition) =
806 ($1, $2, $3, $4, $5);
807 my $code = hex($hexcode);
808 if (exists $CASESPEC{$code}) {
809 if (exists $CASESPEC{$code}->{code}) {
810 my ($oldlower,
811 $oldtitle,
812 $oldupper,
813 $oldcondition) =
814 @{$CASESPEC{$code}}{qw(lower
815 title
816 upper
817 condition)};
822ebcc8
JH
818 if (defined $oldcondition) {
819 my ($oldlocale) =
f499c386 820 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
821 delete $CASESPEC{$code};
822 $CASESPEC{$code}->{$oldlocale} =
823 { code => $hexcode,
824 lower => $oldlower,
825 title => $oldtitle,
826 upper => $oldupper,
827 condition => $oldcondition };
f499c386
JH
828 }
829 }
830 my ($locale) =
831 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
832 $CASESPEC{$code}->{$locale} =
833 { code => $hexcode,
834 lower => $lower,
835 title => $title,
836 upper => $upper,
837 condition => $condition };
838 } else {
839 $CASESPEC{$code} =
840 { code => $hexcode,
841 lower => $lower,
842 title => $title,
843 upper => $upper,
844 condition => $condition };
845 }
b08cd201
JH
846 }
847 }
848 close($CASESPECFH);
849 }
850 }
851}
852
853sub casespec {
854 my $arg = shift;
855 my $code = _getcode($arg);
74f8133e
JH
856 croak __PACKAGE__, "::casespec: unknown code '$arg'"
857 unless defined $code;
b08cd201
JH
858
859 _casespec() unless %CASESPEC;
860
741297c1 861 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
862}
863
a2bd7410
JH
864=head2 namedseq()
865
866 use Unicode::UCD 'namedseq';
867
868 my $namedseq = namedseq("KATAKANA LETTER AINU P");
869 my @namedseq = namedseq("KATAKANA LETTER AINU P");
870 my %namedseq = namedseq();
871
872If used with a single argument in a scalar context, returns the string
873consisting of the code points of the named sequence, or C<undef> if no
874named sequence by that name exists. If used with a single argument in
875a list context, returns list of the code points. If used with no
876arguments in a list context, returns a hash with the names of the
877named sequences as the keys and the named sequences as strings as
878the values. Otherwise, returns C<undef> or empty list depending
879on the context.
880
881(New from Unicode 4.1.0)
882
883=cut
884
885my %NAMEDSEQ;
886
887sub _namedseq {
888 unless (%NAMEDSEQ) {
889 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
890 local $_;
891 while (<$NAMEDSEQFH>) {
892 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
893 my ($n, $s) = ($1, $2);
894 my @s = map { chr(hex($_)) } split(' ', $s);
895 $NAMEDSEQ{$n} = join("", @s);
896 }
897 }
898 close($NAMEDSEQFH);
899 }
900 }
901}
902
903sub namedseq {
904 _namedseq() unless %NAMEDSEQ;
905 my $wantarray = wantarray();
906 if (defined $wantarray) {
907 if ($wantarray) {
908 if (@_ == 0) {
909 return %NAMEDSEQ;
910 } elsif (@_ == 1) {
911 my $s = $NAMEDSEQ{ $_[0] };
912 return defined $s ? map { ord($_) } split('', $s) : ();
913 }
914 } elsif (@_ == 1) {
915 return $NAMEDSEQ{ $_[0] };
916 }
917 }
918 return;
919}
920
55d7b906 921=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 922
55d7b906
JH
923Unicode::UCD::UnicodeVersion() returns the version of the Unicode
924Character Database, in other words, the version of the Unicode
78bf21c2
JH
925standard the database implements. The version is a string
926of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
927
928=cut
929
930my $UNICODEVERSION;
931
932sub UnicodeVersion {
933 unless (defined $UNICODEVERSION) {
934 openunicode(\$VERSIONFH, "version");
935 chomp($UNICODEVERSION = <$VERSIONFH>);
936 close($VERSIONFH);
937 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
938 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
939 }
940 return $UNICODEVERSION;
941}
3aa957f9
JH
942
943=head2 Implementation Note
32c16050 944
ad9cab37
JH
945The first use of charinfo() opens a read-only filehandle to the Unicode
946Character Database (the database is included in the Perl distribution).
78bf21c2
JH
947The filehandle is then kept open for further queries. In other words,
948if you are wondering where one of your filehandles went, that's where.
32c16050 949
8b731da2
JH
950=head1 BUGS
951
952Does not yet support EBCDIC platforms.
953
561c79ed
JH
954=head1 AUTHOR
955
956Jarkko Hietaniemi
957
958=cut
959
9601;