This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
UCD.pm: Don't use NamedSequences.txt, saves disk
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
98ef7649 5use charnames ();
561c79ed 6
7da24786 7our $VERSION = '0.29';
561c79ed 8
741297c1
JH
9use Storable qw(dclone);
10
561c79ed
JH
11require Exporter;
12
13our @ISA = qw(Exporter);
74f8133e 14
10a6ecd2
JH
15our @EXPORT_OK = qw(charinfo
16 charblock charscript
17 charblocks charscripts
b08cd201 18 charinrange
ea508aee 19 general_categories bidi_types
b08cd201 20 compexcl
a2bd7410
JH
21 casefold casespec
22 namedseq);
561c79ed
JH
23
24use Carp;
25
26=head1 NAME
27
55d7b906 28Unicode::UCD - Unicode character database
561c79ed
JH
29
30=head1 SYNOPSIS
31
55d7b906 32 use Unicode::UCD 'charinfo';
b08cd201 33 my $charinfo = charinfo($codepoint);
561c79ed 34
956cae9a
KW
35 use Unicode::UCD 'casefold';
36 my $casefold = casefold(0xFB00);
37
5d8e6e41
KW
38 use Unicode::UCD 'casespec';
39 my $casespec = casespec(0xFB00);
40
55d7b906 41 use Unicode::UCD 'charblock';
e882dd67
JH
42 my $charblock = charblock($codepoint);
43
55d7b906 44 use Unicode::UCD 'charscript';
65044554 45 my $charscript = charscript($codepoint);
561c79ed 46
55d7b906 47 use Unicode::UCD 'charblocks';
e145285f
JH
48 my $charblocks = charblocks();
49
55d7b906 50 use Unicode::UCD 'charscripts';
ea508aee 51 my $charscripts = charscripts();
e145285f 52
55d7b906 53 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
54 my $range = charscript($script);
55 print "looks like $script\n" if charinrange($range, $codepoint);
56
ea508aee
JH
57 use Unicode::UCD qw(general_categories bidi_types);
58 my $categories = general_categories();
59 my $types = bidi_types();
60
55d7b906 61 use Unicode::UCD 'compexcl';
e145285f
JH
62 my $compexcl = compexcl($codepoint);
63
a2bd7410
JH
64 use Unicode::UCD 'namedseq';
65 my $namedseq = namedseq($named_sequence_name);
66
55d7b906 67 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 68
561c79ed
JH
69=head1 DESCRIPTION
70
a452d459
KW
71The Unicode::UCD module offers a series of functions that
72provide a simple interface to the Unicode
8b731da2 73Character Database.
561c79ed 74
a452d459
KW
75=head2 code point argument
76
77Some of the functions are called with a I<code point argument>, which is either
78a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
79followed by hexadecimals designating a Unicode code point. In other words, if
80you want a code point to be interpreted as a hexadecimal number, you must
81prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
82interpreted as a decimal code point. Also note that Unicode is B<not> limited
83to 16 bits (the number of Unicode code points is open-ended, in theory
84unlimited): you may have more than 4 hexdigits.
561c79ed
JH
85=cut
86
10a6ecd2
JH
# Module-level file handles.  Each one is handed to openunicode() by
# reference and filled in the first time the corresponding data file is
# needed.  (Not every handle is used in this part of the file.)
my $UNICODEFH;
my $BLOCKSFH;
my $SCRIPTSFH;
my $VERSIONFH;
my $COMPEXCLFH;
my $CASEFOLDFH;
my $CASESPECFH;
my $NAMEDSEQFH;

# openunicode(\$fh, @path)
#
# Opens the Unicode data file "unicore/@path" for reading, trying each
# directory in @INC in turn, and stores the handle in $fh.
#
# Returns the path that was opened.  If $fh is already defined nothing is
# opened and undef is returned, so callers that only want to know whether
# a handle is usable should test the handle rather than the return value.
#
# Croaks if the file cannot be opened from any @INC directory.
sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            # @INC may hold hooks (code/array/object refs); those are
            # not directories and can never contain the data files.
            next if ref $d;
            use File::Spec;
            $f = File::Spec->catfile($d, "unicore", @path);
            # Three-argument open: the path is only ever a path, never
            # a mode or pipe specification, and is not whitespace-trimmed.
            last if open($$rfh, '<', $f);
            undef $f;
        }
        croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
            unless defined $f;
    }
    return $f;
}
112
a452d459 113=head2 B<charinfo()>
561c79ed 114
55d7b906 115 use Unicode::UCD 'charinfo';
561c79ed 116
b08cd201 117 my $charinfo = charinfo(0x41);
561c79ed 118
a452d459
KW
119This returns information about the input L</code point argument>
120as a reference to a hash of fields as defined by the Unicode
121standard. If the L</code point argument> is not assigned in the standard
122(i.e., has the general category C<Cn> meaning C<Unassigned>)
123or is a non-character (meaning it is guaranteed to never be assigned in
124the standard),
125B<undef> is returned.
126
127Fields that aren't applicable to the particular code point argument exist in the
128returned hash, and are empty.
129
130The keys in the hash with the meanings of their values are:
131
132=over
133
134=item B<code>
135
136the input L</code point argument> expressed in hexadecimal, with leading zeros
137added if necessary to make it contain at least four hexdigits
138
139=item B<name>
140
141name of I<code>, all IN UPPER CASE.
142Some control-type code points do not have names.
143This field will be empty for C<Surrogate> and C<Private Use> code points,
144and for the others without a name,
145it will contain a description enclosed in angle brackets, like
146C<E<lt>controlE<gt>>.
147
148
149=item B<category>
150
151The short name of the general category of I<code>.
152This will match one of the keys in the hash returned by L</general_categories()>.
153
154=item B<combining>
155
156the combining class number for I<code> used in the Canonical Ordering Algorithm.
157For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
158available at
159L<http://www.unicode.org/versions/Unicode5.1.0/>
160
161=item B<bidi>
162
163bidirectional type of I<code>.
164This will match one of the keys in the hash returned by L</bidi_types()>.
165
166=item B<decomposition>
167
168is empty if I<code> has no decomposition; or is one or more codes
169(separated by spaces) that taken in order represent a decomposition for
170I<code>. Each has at least four hexdigits.
171The codes may be preceded by a word enclosed in angle brackets then a space,
172like C<E<lt>compatE<gt> >, giving the type of decomposition
173
174=item B<decimal>
175
176if I<code> is a decimal digit this is its integer numeric value
177
178=item B<digit>
179
180if I<code> represents a whole number, this is its integer numeric value
181
182=item B<numeric>
183
184if I<code> represents a whole or rational number, this is its numeric value.
185Rational values are expressed as a string like C<1/4>.
186
187=item B<mirrored>
188
189C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
190
191=item B<unicode10>
192
193name of I<code> in the Unicode 1.0 standard if one
194existed for this code point and is different from the current name
195
196=item B<comment>
197
198ISO 10646 comment field.
199It appears in parentheses in the ISO 10646 names list,
200or contains an asterisk to indicate there is
201a note for this code point in Annex P of that standard.
202
203=item B<upper>
204
205is empty if there is no single code point uppercase mapping for I<code>;
206otherwise it is that mapping expressed as at least four hexdigits.
207(L</casespec()> should be used in addition to B<charinfo()>
208for case mappings when the calling program can cope with multiple code point
209mappings.)
210
211=item B<lower>
212
213is empty if there is no single code point lowercase mapping for I<code>;
214otherwise it is that mapping expressed as at least four hexdigits.
215(L</casespec()> should be used in addition to B<charinfo()>
216for case mappings when the calling program can cope with multiple code point
217mappings.)
218
219=item B<title>
220
221is empty if there is no single code point titlecase mapping for I<code>;
222otherwise it is that mapping expressed as at least four hexdigits.
223(L</casespec()> should be used in addition to B<charinfo()>
224for case mappings when the calling program can cope with multiple code point
225mappings.)
226
227=item B<block>
228
229block I<code> belongs to (used in \p{In...}).
230See L</Blocks versus Scripts>.
231
232
233=item B<script>
234
235script I<code> belongs to.
236See L</Blocks versus Scripts>.
237
238=back
32c16050
JH
239
240Note that you cannot do (de)composition and casing based solely on the
a452d459
KW
241I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
242you will need also the L</compexcl()>, and L</casespec()> functions.
561c79ed
JH
243
244=cut
245
e10d7780 246# NB: This function is nearly duplicated in charnames.pm
10a6ecd2
JH
# Converts a "code point argument" into a number.
#
# A string of decimal digits not starting with 0 is taken as decimal;
# anything else made only of hexdigits, optionally prefixed by "U+" or
# "0x" (either case), is taken as hexadecimal.  Returns nothing (undef in
# scalar context) when the argument is neither.
sub _getcode {
    my ($input) = @_;

    return $input  if $input =~ /^[1-9]\d*$/;
    return hex($1) if $input =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;

    return;
}
258
ac5ea531
JH
# Lingua::KO::Hangul::Util is not part of the standard distribution,
# but it will be used if available.

# True when the optional module loaded.  The eval's own value is tested
# (the block ends in a true constant) instead of inspecting $@ afterwards,
# because $@ is a global that can be changed between the failure and the
# test.
my $hasHangulUtil = eval { require Lingua::KO::Hangul::Util; 1 } ? 1 : 0;
if ($hasHangulUtil) {
    Lingua::KO::Hangul::Util->import();
}

# internal: called from charinfo
#
# Returns the decomposition of a Hangul syllable code point as two or
# three space-separated codes of at least four hexdigits each.  Returns
# nothing when Lingua::KO::Hangul::Util is unavailable or when it does not
# yield a two- or three-element decomposition.
sub hangul_decomp {
    return unless $hasHangulUtil;

    my @jamo = decomposeHangul(shift);
    return sprintf("%04X %04X", @jamo)      if @jamo == 2;
    return sprintf("%04X %04X %04X", @jamo) if @jamo == 3;

    return;
}
276
# internal: called from charinfo
#
# Fallback name for a Hangul syllable: the fixed prefix followed by the
# code point in at least four upper-case hexdigits.
sub hangul_charname {
    my ($code) = @_;
    return sprintf("HANGUL SYLLABLE-%04X", $code);
}
280
9087a70b
ST
# internal: called from charinfo
#
# Name of a CJK unified ideograph: the fixed prefix followed by the code
# point in at least four upper-case hexdigits.
sub han_charname {
    my ($code) = @_;
    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", $code);
}
284
# First and last code point of each CJK ideograph range.
# Overwritten by data in file
my %first_last = (
   'CJK Ideograph Extension A' => [ 0x3400,   0x4DB5   ],
   'CJK Ideograph'             => [ 0x4E00,   0x9FA5   ],
   'CJK Ideograph Extension B' => [ 0x20000,  0x2A6D6  ],
);

get_charinfo_ranges();

# Refreshes %first_last from the "<Block, First>" / "<Block, Last>" lines
# of UnicodeData.txt.  Only blocks that already have an entry in
# %first_last are updated; all other range lines are ignored.
sub get_charinfo_ranges {
    my $fh;
    openunicode( \$fh, 'UnicodeData.txt' );
    return unless defined $fh;

    while ( my $line = <$fh> ) {
        # One anchored match extracts code, block name and First/Last.
        # The block name is then used as a hash key, so it is never
        # interpolated into a pattern, and the whitespace around the
        # comma is handled the same way when matching and when parsing.
        next unless $line =~ /^([^;]+);<([^;>]+?)\s*,\s*(First|Last)>/;
        my ($number, $block, $type) = ($1, $2, $3);

        next unless exists $first_last{$block};

        my $index = $type eq 'First' ? 0 : 1;
        $first_last{$block}->[$index] = hex $number;
    }

    close($fh);
    return;
}
313
a6fa416b
ST
# Ranges of code points that UnicodeData.txt describes by a single
# First/Last pair of lines rather than one line per code point.
my @CharinfoRanges = (
# block name
# [ first, last, coderef to name, coderef to decompose ],
# CJK Ideographs Extension A
  [ @{ $first_last{'CJK Ideograph Extension A'} }, \&han_charname, undef ],
# CJK Ideographs
  [ @{ $first_last{'CJK Ideograph'} },             \&han_charname, undef ],
# Hangul Syllables
  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
# Non-Private Use High Surrogates
  [ 0xD800,   0xDB7F,   undef,   undef  ],
# Private Use High Surrogates
  [ 0xDB80,   0xDBFF,   undef,   undef  ],
# Low Surrogates
  [ 0xDC00,   0xDFFF,   undef,   undef  ],
# The Private Use Area
  [ 0xE000,   0xF8FF,   undef,   undef  ],
# CJK Ideographs Extension B
  [ @{ $first_last{'CJK Ideograph Extension B'} }, \&han_charname, undef ],
# Plane 15 Private Use Area
  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
# Plane 16 Private Use Area
  [ 0x100000, 0x10FFFD, undef,   undef  ],
);

# charinfo($code_point_argument)
#
# Returns a reference to a hash describing the code point (the fields of
# its UnicodeData.txt line plus 'block' and 'script'), or nothing if no
# line for the code point is found.  Croaks if the argument cannot be
# interpreted as a code point.
sub charinfo {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
        unless defined $code;

    # Code point whose line is looked up in the file.  For a code point
    # inside one of @CharinfoRanges this is the first code point of the
    # range, and code/name/decomposition are computed instead of read.
    my $lookup = $code;
    my ($rcode, $rname, $rdec);
    foreach my $range (@CharinfoRanges) {
        my ($first, $last, $namer, $decomposer) = @$range;
        next unless $first <= $code && $code <= $last;

        $rcode = sprintf("%04X", $code);
        $rname = $namer      ? $namer->($code)      : '';
        $rdec  = $decomposer ? $decomposer->($code) : '';
        # A decomposer may return nothing (hangul_decomp does when
        # Lingua::KO::Hangul::Util is missing); fields that do not apply
        # are documented as empty, not undefined.
        $rdec  = '' unless defined $rdec;
        $lookup = $first;
        last;
    }

    openunicode(\$UNICODEFH, "UnicodeData.txt");
    return unless defined $UNICODEFH;

    use Search::Dict 1.02;
    # The file's code field has a variable number of hexdigits, so both
    # the key and each line examined are normalised to six digits for the
    # comparison done by look().
    my $hexk = sprintf("%06X", $lookup);
    my $found = look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } );
    return unless $found >= 0;

    my $line = <$UNICODEFH>;
    return unless defined $line;
    chomp $line;

    my %prop;
    @prop{qw(
             code name category
             combining bidi decomposition
             decimal digit numeric
             mirrored unicode10 comment
             upper lower title
            )} = split(/;/, $line, -1);

    # look() only positions the handle; confirm that the line read is
    # the one asked for.
    return unless $prop{code} eq sprintf("%04X", $lookup);

    $prop{block}  = charblock($code);
    $prop{script} = charscript($code);
    if (defined $rname) {
        $prop{code}          = $rcode;
        $prop{name}          = $rname;
        $prop{decomposition} = $rdec;
    }
    return \%prop;
}
388
e882dd67
JH
# Binary search in a [[lo,hi,prop],[...],...] table.
#
# Looks between indices $lo and $hi (inclusive) of $table for the entry
# whose lo..hi span contains $code and returns that entry's prop.
# Returns nothing if no entry contains $code.  The table has to be in
# ascending order of lo for the search to find anything reliably.
sub _search {
    my ($table, $lo, $hi, $code) = @_;

    while ($lo <= $hi) {
        my $mid = int(($lo+$hi) / 2);

        if ($table->[$mid]->[0] > $code) {
            # Entry starts after the code point: go left.
            $hi = $mid - 1;
            next;
        }

        # Entry starts at or before the code point.
        return $table->[$mid]->[2]
            if $table->[$mid]->[0] == $code
            || $table->[$mid]->[1] >= $code;

        # Entry ends before the code point: go right.
        $lo = $mid + 1;
    }

    return;
}
408
10a6ecd2
JH
# charinrange($range, $code_point_argument)
#
# Searches $range, a list of [lo, hi, prop] entries, for the entry that
# contains the code point and returns that entry's prop (nothing if there
# is none).  Croaks if the second argument is not a code point.
sub charinrange {
    my ($range, $arg) = @_;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::charinrange: unknown code '$arg'";

    return _search($range, 0, $#$range, $code);
}
416
a452d459 417=head2 B<charblock()>
561c79ed 418
55d7b906 419 use Unicode::UCD 'charblock';
561c79ed
JH
420
421 my $charblock = charblock(0x41);
10a6ecd2 422 my $charblock = charblock(1234);
a452d459 423 my $charblock = charblock(0x263a);
10a6ecd2
JH
424 my $charblock = charblock("U+263a");
425
78bf21c2 426 my $range = charblock('Armenian');
10a6ecd2 427
a452d459
KW
428With a L</code point argument> charblock() returns the I<block> the code point
429belongs to, e.g. C<Basic Latin>.
430If the code point is unassigned, this returns the block it would belong to if
431it were assigned (which it may in future versions of the Unicode Standard).
10a6ecd2 432
78bf21c2
JH
433See also L</Blocks versus Scripts>.
434
eb0cc9e3 435If supplied with an argument that can't be a code point, charblock() tries
a452d459 436to do the opposite and interpret the argument as a code point block. The
eb0cc9e3 437return value is a I<range>: an anonymous list of lists that contain
a2bd7410 438I<start-of-range>, I<end-of-range> code point pairs. You can test whether
a452d459
KW
439a code point is in a range using the L</charinrange()> function. If the
440argument is not a known code point block, B<undef> is returned.
561c79ed 441
561c79ed
JH
442=cut
443
my @BLOCKS;    # every [lo, hi, name] subrange, in file order
my %BLOCKS;    # block name => list of that block's subranges

# Fills @BLOCKS and %BLOCKS from Blocks.txt.  Does nothing if @BLOCKS is
# already populated.
sub _charblocks {
    return if @BLOCKS;
    return unless openunicode(\$BLOCKSFH, "Blocks.txt");

    while (my $line = <$BLOCKSFH>) {
        next unless $line =~ /^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/;

        my $name     = $3;
        my $subrange = [ hex($1), hex($2), $name ];

        # The same array ref goes into both structures.
        push @BLOCKS, $subrange;
        push @{$BLOCKS{$name}}, $subrange;
    }
    close($BLOCKSFH);
}
463
# charblock($code_point_or_block_name)
#
# With a code point argument, returns the name of the block containing
# it.  With any other argument, treats it as a block name and returns a
# deep copy of that block's range, or nothing if the name is unknown.
sub charblock {
    my $arg = shift;

    _charblocks() unless @BLOCKS;

    my $code = _getcode($arg);
    return _search(\@BLOCKS, 0, $#BLOCKS, $code) if defined $code;

    return unless exists $BLOCKS{$arg};

    # Copy, so callers cannot alter the cached table.
    return dclone $BLOCKS{$arg};
}
481
a452d459 482=head2 B<charscript()>
e882dd67 483
55d7b906 484 use Unicode::UCD 'charscript';
e882dd67
JH
485
486 my $charscript = charscript(0x41);
10a6ecd2
JH
487 my $charscript = charscript(1234);
488 my $charscript = charscript("U+263a");
e882dd67 489
78bf21c2 490 my $range = charscript('Thai');
10a6ecd2 491
a452d459
KW
With a L</code point argument> charscript() returns the I<script> the
code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
If the code point is unassigned, it returns B<undef>.
78bf21c2 495
eb0cc9e3 496If supplied with an argument that can't be a code point, charscript() tries
a452d459 497to do the opposite and interpret the argument as a code point script. The
eb0cc9e3
JH
498return value is a I<range>: an anonymous list of lists that contain
499I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a452d459
KW
500code point is in a range using the L</charinrange()> function. If the
501argument is not a known code point script, B<undef> is returned.
502
503See also L</Blocks versus Scripts>.
e882dd67 504
e882dd67
JH
505=cut
506
my @SCRIPTS;    # every [lo, hi, script] subrange, sorted by lo
my %SCRIPTS;    # script name => list of that script's subranges

# Fills @SCRIPTS and %SCRIPTS from Scripts.txt.  Does nothing if @SCRIPTS
# is already populated.
sub _charscripts {
    return if @SCRIPTS;
    return unless openunicode(\$SCRIPTSFH, "Scripts.txt");

    while (my $line = <$SCRIPTSFH>) {
        next unless
            $line =~ /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/;
        my ($first, $last, $raw) = ($1, $2, $3);

        # A line without ".." describes a single code point.
        my $lo = hex($first);
        my $hi = $last ? hex($last) : hex($first);

        # Lower-case the name, then upper-case the character after
        # each word boundary.
        my $script = lc($raw);
        $script =~ s/\b(\w)/uc($1)/ge;

        my $subrange = [ $lo, $hi, $script ];
        push @SCRIPTS, $subrange;
        push @{$SCRIPTS{$script}}, $subrange;
    }
    close($SCRIPTSFH);

    # Put the subranges into ascending order of their first code point,
    # as needed by the binary search.
    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
}
529
# charscript($code_point_or_script_name)
#
# With a code point argument, returns the name of the script containing
# it.  With any other argument, treats it as a script name and returns a
# deep copy of that script's range, or nothing if the name is unknown.
sub charscript {
    my $arg = shift;

    _charscripts() unless @SCRIPTS;

    my $code = _getcode($arg);
    return _search(\@SCRIPTS, 0, $#SCRIPTS, $code) if defined $code;

    return unless exists $SCRIPTS{$arg};

    # Copy, so callers cannot alter the cached table.
    return dclone $SCRIPTS{$arg};
}
547
a452d459 548=head2 B<charblocks()>
10a6ecd2 549
55d7b906 550 use Unicode::UCD 'charblocks';
10a6ecd2 551
b08cd201 552 my $charblocks = charblocks();
10a6ecd2 553
b08cd201 554charblocks() returns a reference to a hash with the known block names
a452d459 555as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 556
78bf21c2
JH
557See also L</Blocks versus Scripts>.
558
10a6ecd2
JH
559=cut
560
# Returns a deep copy of the block-name => ranges table, loading the
# table first if it is still empty.
sub charblocks {
    unless (%BLOCKS) {
        _charblocks();
    }
    my $copy = dclone(\%BLOCKS);
    return $copy;
}
565
a452d459 566=head2 B<charscripts()>
10a6ecd2 567
55d7b906 568 use Unicode::UCD 'charscripts';
10a6ecd2 569
ea508aee 570 my $charscripts = charscripts();
10a6ecd2 571
ea508aee 572charscripts() returns a reference to a hash with the known script
a452d459 573names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 574the values.
10a6ecd2 575
78bf21c2
JH
576See also L</Blocks versus Scripts>.
577
10a6ecd2
JH
578=cut
579
# Returns a deep copy of the script-name => ranges table, loading the
# table first if it is still empty.
sub charscripts {
    unless (%SCRIPTS) {
        _charscripts();
    }
    my $copy = dclone(\%SCRIPTS);
    return $copy;
}
584
a452d459 585=head2 B<charinrange()>
10a6ecd2
JH
586
587In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
588can also test whether a code point is in the I<range> as returned by
a452d459
KW
589L</charblock()> and L</charscript()> or as the values of the hash returned
590by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 591
55d7b906 592 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
593
594 $range = charscript('Hiragana');
e145285f 595 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
596
597=cut
598
ea508aee
JH
# Short general category name => long name.
my %GENERAL_CATEGORIES = (
    # Letters
    L  => 'Letter',
    LC => 'CasedLetter',
    Lu => 'UppercaseLetter',
    Ll => 'LowercaseLetter',
    Lt => 'TitlecaseLetter',
    Lm => 'ModifierLetter',
    Lo => 'OtherLetter',
    # Marks
    M  => 'Mark',
    Mn => 'NonspacingMark',
    Mc => 'SpacingMark',
    Me => 'EnclosingMark',
    # Numbers
    N  => 'Number',
    Nd => 'DecimalNumber',
    Nl => 'LetterNumber',
    No => 'OtherNumber',
    # Punctuation
    P  => 'Punctuation',
    Pc => 'ConnectorPunctuation',
    Pd => 'DashPunctuation',
    Ps => 'OpenPunctuation',
    Pe => 'ClosePunctuation',
    Pi => 'InitialPunctuation',
    Pf => 'FinalPunctuation',
    Po => 'OtherPunctuation',
    # Symbols
    S  => 'Symbol',
    Sm => 'MathSymbol',
    Sc => 'CurrencySymbol',
    Sk => 'ModifierSymbol',
    So => 'OtherSymbol',
    # Separators
    Z  => 'Separator',
    Zs => 'SpaceSeparator',
    Zl => 'LineSeparator',
    Zp => 'ParagraphSeparator',
    # Others
    C  => 'Other',
    Cc => 'Control',
    Cf => 'Format',
    Cs => 'Surrogate',
    Co => 'PrivateUse',
    Cn => 'Unassigned',
);

# Returns a copy of the table above, so that callers are free to modify
# what they get back.
sub general_categories {
    my $copy = dclone(\%GENERAL_CATEGORIES);
    return $copy;
}
644
a452d459 645=head2 B<general_categories()>
ea508aee
JH
646
647 use Unicode::UCD 'general_categories';
648
649 my $categories = general_categories();
650
a452d459 651This returns a reference to a hash which has short
ea508aee
JH
652general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
653names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
654C<Symbol>) as values. The hash is reversible in case you need to go
655from the long names to the short names. The general category is the
a452d459
KW
656one returned from
657L</charinfo()> under the C<category> key.
ea508aee
JH
658
659=cut
660
# Short bidirectional type name => long name.
my %BIDI_TYPES = (
    L   => 'Left-to-Right',
    LRE => 'Left-to-Right Embedding',
    LRO => 'Left-to-Right Override',
    R   => 'Right-to-Left',
    AL  => 'Right-to-Left Arabic',
    RLE => 'Right-to-Left Embedding',
    RLO => 'Right-to-Left Override',
    PDF => 'Pop Directional Format',
    EN  => 'European Number',
    ES  => 'European Number Separator',
    ET  => 'European Number Terminator',
    AN  => 'Arabic Number',
    CS  => 'Common Number Separator',
    NSM => 'Non-Spacing Mark',
    BN  => 'Boundary Neutral',
    B   => 'Paragraph Separator',
    S   => 'Segment Separator',
    WS  => 'Whitespace',
    ON  => 'Other Neutrals',
);

=head2 B<bidi_types()>

    use Unicode::UCD 'bidi_types';

    my $categories = bidi_types();

This returns a reference to a hash which has the short
bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
names (such as C<Left-to-Right>, C<Right-to-Left>) as values.  The
hash is reversible in case you need to go from the long names to the
short names.  The bidi type is the one returned from
L</charinfo()>
under the C<bidi> key.  For the exact meaning of the various bidi classes
the Unicode TR9 is recommended reading:
L<http://www.unicode.org/reports/tr9/>
(as of Unicode 5.0.0)

=cut

# Returns a copy of the table above, so that callers are free to modify
# what they get back.
sub bidi_types {
    my $copy = dclone(\%BIDI_TYPES);
    return $copy;
}
706
707=head2 B<compexcl()>
b08cd201 708
55d7b906 709 use Unicode::UCD 'compexcl';
b08cd201 710
a452d459 711 my $compexcl = compexcl(0x09dc);
b08cd201 712
a452d459
KW
713This returns B<true> if the
714L</code point argument> should not be produced by composition normalization,
715B<AND> if that fact is not otherwise determinable from the Unicode data base.
716It currently does not return B<true> if the code point has a decomposition
717consisting of another single code point, nor if its decomposition starts
718with a code point whose combining class is non-zero. Code points that meet
719either of these conditions should also not be produced by composition
720normalization.
b08cd201 721
a452d459 722It returns B<false> otherwise.
b08cd201
JH
723
724=cut
725
my %COMPEXCL;    # keys are the code points listed in the exclusions file

# Fills %COMPEXCL from CompositionExclusions.txt.  Does nothing if the
# table is already populated.
sub _compexcl {
    return if %COMPEXCL;
    return unless openunicode(\$COMPEXCLFH, "CompositionExclusions.txt");

    while (my $line = <$COMPEXCLFH>) {
        next unless $line =~ /^([0-9A-F]+)\s+\#\s+/;

        # Only the existence of the key matters.
        $COMPEXCL{ hex($1) } = undef;
    }
    close($COMPEXCLFH);
}
742
# compexcl($code_point_argument)
#
# True if the code point is listed in the composition exclusions table,
# false otherwise.  Croaks if the argument is not a code point.
sub compexcl {
    my $arg = shift;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::compexcl: unknown code '$arg'";

    unless (%COMPEXCL) {
        _compexcl();
    }

    return exists $COMPEXCL{$code};
}
753
a452d459 754=head2 B<casefold()>
b08cd201 755
55d7b906 756 use Unicode::UCD 'casefold';
b08cd201 757
a452d459
KW
758 my $casefold = casefold(0xDF);
759 if (defined $casefold) {
760 my @full_fold_hex = split / /, $casefold->{'full'};
761 my $full_fold_string =
762 join "", map {chr(hex($_))} @full_fold_hex;
763 my @turkic_fold_hex =
764 split / /, ($casefold->{'turkic'} ne "")
765 ? $casefold->{'turkic'}
766 : $casefold->{'full'};
767 my $turkic_fold_string =
768 join "", map {chr(hex($_))} @turkic_fold_hex;
769 }
770 if (defined $casefold && $casefold->{'simple'} ne "") {
771 my $simple_fold_hex = $casefold->{'simple'};
772 my $simple_fold_string = chr(hex($simple_fold_hex));
773 }
b08cd201 774
a452d459
KW
775This returns the (almost) locale-independent case folding of the
776character specified by the L</code point argument>.
b08cd201 777
a452d459
KW
778If there is no case folding for that code point, B<undef> is returned.
779
780If there is a case folding for that code point, a reference to a hash
b08cd201
JH
781with the following fields is returned:
782
a452d459
KW
783=over
784
785=item B<code>
786
787the input L</code point argument> expressed in hexadecimal, with leading zeros
788added if necessary to make it contain at least four hexdigits
789
790=item B<full>
791
792one or more codes (separated by spaces) that taken in order give the
793code points for the case folding for I<code>.
794Each has at least four hexdigits.
795
796=item B<simple>
797
798is empty, or is exactly one code with at least four hexdigits which can be used
799as an alternative case folding when the calling program cannot cope with the
800fold being a sequence of multiple code points. If I<full> is just one code
801point, then I<simple> equals I<full>. If there is no single code point folding
802defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
803inferior, but still better-than-nothing alternative folding to I<full>.
804
805=item B<mapping>
806
807is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
808otherwise. It can be considered to be the simplest possible folding for
809I<code>. It is defined primarily for backwards compatibility.
810
811=item B<status>
b08cd201 812
a452d459
KW
is C<C> (for C<common>) if the best possible fold is a single code point
(I<simple> equals I<full> equals I<mapping>).  It is C<S> if there are distinct
folds, I<simple> and I<full> (I<mapping> equals I<simple>).  And it is C<F> if
there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
Note that this describes the contents of I<mapping>.  It is defined primarily
for backwards compatibility.
b08cd201 820
a452d459
KW
821On versions 3.1 and earlier of Unicode, I<status> can also be
822C<I> which is the same as C<C> but is a special case for dotted uppercase I and
823dotless lowercase i:
b08cd201 824
a452d459 825=over
b08cd201 826
a452d459
KW
827=item B<*>
828
829If you use this C<I> mapping, the result is case-insensitive,
830but dotless and dotted I's are not distinguished
831
832=item B<*>
833
834If you exclude this C<I> mapping, the result is not fully case-insensitive, but
835dotless and dotted I's are distinguished
836
837=back
838
839=item B<turkic>
840
841contains any special folding for Turkic languages. For versions of Unicode
842starting with 3.2, this field is empty unless I<code> has a different folding
843in Turkic languages, in which case it is one or more codes (separated by
844spaces) that taken in order give the code points for the case folding for
845I<code> in those languages.
846Each code has at least four hexdigits.
847Note that this folding does not maintain canonical equivalence without
848additional processing.
849
850For versions of Unicode 3.1 and earlier, this field is empty unless there is a
851special folding for Turkic languages, in which case I<status> is C<I>, and
852I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
853
854=back
855
856Programs that want complete generality and the best folding results should use
857the folding contained in the I<full> field. But note that the fold for some
858code points will be a sequence of multiple code points.
859
860Programs that can't cope with the fold mapping being multiple code points can
861use the folding contained in the I<simple> field, with the loss of some
862generality. In Unicode 5.1, about 7% of the defined foldings have no single
863code point folding.
864
865The I<mapping> and I<status> fields are provided for backwards compatibility for
866existing programs. They contain the same values as in previous versions of
867this function.
868
Case folding is not completely locale-independent: the I<turkic> field contains
the results to use when the locale is a Turkic language.
b08cd201
JH
871
872For more information about case mappings see
a452d459 873L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
874
875=cut
876
my %CASEFOLD;    # numeric code point => hash of folding fields

# Fills %CASEFOLD from CaseFolding.txt.  Does nothing if the table is
# already populated.  Each entry gets the keys 'code' and 'turkic', plus
# whichever of 'full', 'simple', 'mapping' and 'status' the file's lines
# for that code point provide.
sub _casefold {
    return if %CASEFOLD;
    return unless openunicode(\$CASEFOLDFH, "CaseFolding.txt");

    while (my $line = <$CASEFOLDFH>) {
        my ($hex, $status, $fold) =
            $line =~ /^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/
            or next;

        # A code point can appear on several lines (one per status), so
        # the entry is built up incrementally.
        my $entry = $CASEFOLD{ hex($hex) } ||= {};
        $entry->{'code'}   = $hex;
        $entry->{'turkic'} = "" unless defined $entry->{'turkic'};

        if ($status eq 'T') {
            $entry->{'turkic'} = $fold;
        }
        elsif ($status eq 'S') {
            # There can't be a simple without a full, and simple
            # overrides all but full
            $entry->{'simple'}  = $fold;
            $entry->{'mapping'} = $fold;
            $entry->{'status'}  = $status;
        }
        elsif ($status eq 'F') {
            $entry->{'full'} = $fold;
            # Leave simple/mapping/status alone if an 'S' line for
            # this code point has already been seen.
            unless (defined $entry->{'simple'}) {
                $entry->{'simple'}  = "";
                $entry->{'mapping'} = $fold;
                $entry->{'status'}  = $status;
            }
        }
        else {
            # 'C' or 'I'.  'I' is only on 3.1 and earlier Unicodes.
            # Both entries there (I only checked 3.1) are the same as
            # C, and there are no other entries for those codepoints,
            # so treat as if C, but override the turkic one for 'I'.
            $entry->{'status'}  = $status;
            $entry->{'mapping'} = $fold;
            $entry->{'simple'}  = $fold;
            $entry->{'full'}    = $fold;
            $entry->{'turkic'}  = $fold if $status eq 'I';
        }
    }
    close($CASEFOLDFH);
}
929
# casefold($code_point_argument)
#
# Returns the case folding entry (a hash reference) for the code point,
# or undef if it has none.  Croaks if the argument is not a code point.
sub casefold {
    my $arg = shift;

    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::casefold: unknown code '$arg'";

    unless (%CASEFOLD) {
        _casefold();
    }

    return $CASEFOLD{$code};
}
940
a452d459 941=head2 B<casespec()>
b08cd201 942
55d7b906 943 use Unicode::UCD 'casespec';
b08cd201 944
a452d459 945 my $casespec = casespec(0xFB00);
b08cd201 946
a452d459
KW
947This returns the potentially locale-dependent case mappings of the L</code point
948argument>. The mappings may be longer than a single code point (which the basic
949Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 950
a452d459
KW
951If there are no case mappings for the L</code point argument>, or if all three
952possible mappings (I<lower>, I<title> and I<upper>) result in single code
5d8e6e41
KW
953points and are locale independent and unconditional, B<undef> is returned
954(which means that the case mappings, if any, for the code point are those
955returned by L</charinfo()>).
a452d459
KW
956
957Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41
KW
958of such hashes, explained below) is returned with the following keys and their
959meanings:
a452d459
KW
960
961The keys in the bottom layer hash with the meanings of their values are:
962
963=over
964
965=item B<code>
966
967the input L</code point argument> expressed in hexadecimal, with leading zeros
968added if necessary to make it contain at least four hexdigits
969
970=item B<lower>
971
972one or more codes (separated by spaces) that taken in order give the
973code points for the lower case of I<code>.
974Each has at least four hexdigits.
975
976=item B<title>
b08cd201 977
a452d459
KW
978one or more codes (separated by spaces) that taken in order give the
979code points for the title case of I<code>.
980Each has at least four hexdigits.
b08cd201 981
d2da20e3 982=item B<upper>
b08cd201 983
a452d459
KW
984one or more codes (separated by spaces) that taken in order give the
985code points for the upper case of I<code>.
986Each has at least four hexdigits.
987
988=item B<condition>
989
990the conditions for the mappings to be valid.
991If B<undef>, the mappings are always valid.
992When defined, this field is a list of conditions,
993all of which must be true for the mappings to be valid.
994The list consists of one or more
995I<locales> (see below)
996and/or I<contexts> (explained in the next paragraph),
997separated by spaces.
998(Other than as used to separate elements, spaces are to be ignored.)
999Case distinctions in the condition list are not significant.
82c0b05b 1000Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 1001
a452d459
KW
1002A I<context> is one of those defined in the Unicode standard.
1003For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1004available at
5d8e6e41
KW
1005L<http://www.unicode.org/versions/Unicode5.1.0/>.
1006These are for context-sensitive casing.
f499c386 1007
a452d459
KW
1008=back
1009
5d8e6e41
KW
1010The hash described above is returned for locale-independent casing, where
1011at least one of the mappings has length longer than one. If B<undef> is
1012returned, the code point may have mappings, but if so, all are length one,
1013and are returned by L</charinfo()>.
1014Note that when this function does return a value, it will be for the complete
1015set of mappings for a code point, even those whose length is one.
1016
1017If there are additional casing rules that apply only in certain locales,
1018an additional key for each will be defined in the returned hash. Each such key
1019will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1020followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
and a variant code).  For lists of the possible country and language codes,
see L<Locale::Country> and L<Locale::Language>.
a452d459
KW
1023(In Unicode 5.1, the only locales returned by this function
1024are C<lt>, C<tr>, and C<az>.)
b08cd201 1025
5d8e6e41
KW
1026Each locale key is a reference to a hash that has the form above, and gives
1027the casing rules for that particular locale, which take precedence over the
1028locale-independent ones when in that locale.
1029
1030If the only casing for a code point is locale-dependent, then the returned
1031hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1032will contain only locale keys.
1033
b08cd201 1034For more information about case mappings see
a452d459 1035L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1036
1037=cut
1038
# Cache of special-casing records, keyed by numeric code point.  A value is
# either a single record hash (keys: code, lower, title, upper, condition)
# or a hash of such records keyed by locale name.
my %CASESPEC;

# _casespec()
#
# Fill %CASESPEC from "SpecialCasing.txt" the first time it is called; a
# no-op once the cache holds anything.  Only lines of the form
#
#   code; lower; title; upper; condition
#
# (hex fields, each mapping and the condition optional) are used; every
# other line is ignored.
sub _casespec {
    return if %CASESPEC;
    return unless openunicode(\$CASESPECFH, "SpecialCasing.txt");

    local $_;
    while (<$CASESPECFH>) {
        next unless /^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/;

        # Copy the captures right away; later matches overwrite $1..$5.
        my ($hexcode, $lower, $title, $upper, $condition) =
            ($1, $2, $3, $4, $5);
        my $code   = hex($hexcode);
        my $record = {
            code      => $hexcode,
            lower     => $lower,
            title     => $title,
            upper     => $upper,
            condition => $condition,
        };

        # First record seen for this code point: store it flat.
        unless (exists $CASESPEC{$code}) {
            $CASESPEC{$code} = $record;
            next;
        }

        # A further record for the same code point.  If the stored entry
        # is still a flat record that carries a condition, re-file it
        # under the locale named at the start of that condition, so the
        # entry becomes a hash of per-locale records.  A flat record
        # without a condition is left in place and the locale key is
        # added next to its base keys.
        my $existing = $CASESPEC{$code};
        if (exists $existing->{code}) {
            my $oldcondition = $existing->{condition};
            if (defined $oldcondition) {
                my ($oldlocale) =
                    ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
                my ($oldlower, $oldtitle, $oldupper) =
                    @{$existing}{qw(lower title upper)};
                delete $CASESPEC{$code};
                $CASESPEC{$code}->{$oldlocale} = {
                    code      => $hexcode,
                    lower     => $oldlower,
                    title     => $oldtitle,
                    upper     => $oldupper,
                    condition => $oldcondition,
                };
            }
        }

        # File the new record under its own locale.
        my ($locale) = ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
        $CASESPEC{$code}->{$locale} = $record;
    }
    close($CASESPECFH);
    return;
}
1094
# casespec($arg)
#
# Look up the special-casing record(s) for one code point.
#
#   $arg - a code point designator; it is handed to _getcode() for
#          conversion (accepted formats are whatever _getcode() supports).
#
# Returns undef when %CASESPEC has no entry for the code point.  Otherwise
# returns a deep copy (via Storable::dclone) of the stored structure, so
# callers may modify the result without affecting the cache.
# Croaks when _getcode() cannot make sense of $arg.
sub casespec {
    my ($arg) = @_;

    my $code = _getcode($arg);
    unless (defined $code) {
        croak __PACKAGE__, "::casespec: unknown code '$arg'";
    }

    # The table is built lazily, on the first lookup.
    _casespec() if !%CASESPEC;

    my $entry = $CASESPEC{$code};
    return $entry unless ref $entry;
    return dclone $entry;
}
1105
a452d459 1106=head2 B<namedseq()>
a2bd7410
JH
1107
1108 use Unicode::UCD 'namedseq';
1109
1110 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1111 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1112 my %namedseq = namedseq();
1113
If used with a single argument in a scalar context, returns the string
consisting of the code points of the named sequence, or B<undef> if no
named sequence by that name exists.  If used with a single argument in
a list context, it returns the list of the ordinals of the code points.
If used with no arguments in a list context, it returns a hash with the
names of the named sequences as the keys and the named sequences as
strings as the values.  Otherwise, it returns B<undef> or an empty list
depending on the context.
1123
a452d459
KW
1124This function only operates on officially approved (not provisional) named
1125sequences.
a2bd7410
JH
1126
1127=cut
1128
# Cache mapping a named-sequence name to the string made of its characters.
my %NAMEDSEQ;

# _namedseq()
#
# Fill %NAMEDSEQ from "Name.pl" the first time it is called; a no-op once
# the cache holds anything.  Only lines that begin with a hex number
# followed by a space are used.  On those lines the text before the first
# tab is a space-separated list of hex code points and the text after it
# is the name.
sub _namedseq {
    return if %NAMEDSEQ;
    return unless openunicode(\$NAMEDSEQFH, "Name.pl");

    local $_;
    while (<$NAMEDSEQFH>) {
        next unless /^ [0-9A-F]+ \ /x;
        chomp;
        my ($sequence, $name) = split /\t/;

        # Turn "XXXX YYYY ..." into the corresponding character string.
        my $string = "";
        for my $hex (split(' ', $sequence)) {
            $string .= chr(hex($hex));
        }
        $NAMEDSEQ{$name} = $string;
    }
    close($NAMEDSEQFH);
    return;
}
1147
# namedseq()        - list context: the whole name => string hash
# namedseq($name)   - scalar context: the sequence as a string, or undef
#                   - list context: the ordinals of its code points, or
#                     an empty list when the name is unknown
#
# Any other combination of context and argument count returns undef or an
# empty list, depending on context.
#
# Single-name lookups go through charnames::string_vianame() until some
# caller has asked for the full hash.  From then on the hash read in by
# _namedseq() is consulted instead, as it is faster.
sub namedseq {
    my $context = wantarray();

    # Void context: nothing to compute.
    return unless defined $context;

    if (@_ == 0) {
        # The full table is only handed out in list context.
        return unless $context;
        _namedseq() unless %NAMEDSEQ;
        return %NAMEDSEQ;
    }

    return unless @_ == 1;

    my $string = %NAMEDSEQ
               ? $NAMEDSEQ{ $_[0] }
               : charnames::string_vianame($_[0]);

    # Scalar context: the string itself (possibly undef).
    return $string unless $context;

    # List context: one ordinal per character, or nothing if unknown.
    return unless defined $string;
    return map { ord($_) } split('', $string);
}
1177
55d7b906 1178=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 1179
a452d459
KW
1180This returns the version of the Unicode Character Database, in other words, the
1181version of the Unicode standard the database implements. The version is a
1182string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
1183
1184=cut
1185
# Cached Unicode version string; set only after it has been validated.
my $UNICODEVERSION;

# UnicodeVersion()
#
# Return the version of the Unicode Character Database as a string of
# numbers delimited by dots (for example "5.1.0").
#
# The first call reads the first line of the "version" file located by
# openunicode() and caches it; later calls return the cached value.
#
# Croaks if that line does not look like a dotted version number.  The
# value is validated before it is cached, so a malformed or empty file
# can never leave a bad version behind for later calls to return.
sub UnicodeVersion {
    unless (defined $UNICODEVERSION) {
        openunicode(\$VERSIONFH, "version");
        my $version = <$VERSIONFH>;
        close($VERSIONFH);

        # An empty or unreadable file yields undef; treat it as an empty
        # (and therefore invalid) version rather than warning on undef.
        $version = "" unless defined $version;
        chomp $version;

        croak __PACKAGE__, "::VERSION: strange version '$version'"
            unless $version =~ /^\d+(?:\.\d+)+$/;

        $UNICODEVERSION = $version;
    }
    return $UNICODEVERSION;
}
3aa957f9 1198
a452d459
KW
1199=head2 B<Blocks versus Scripts>
1200
The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of code points required to represent
languages, while a block is more of an artifact of the Unicode code point
numbering and separation into blocks of (mostly) 256 code points.
1205
1206For example the Latin B<script> is spread over several B<blocks>, such
1207as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1208C<Latin Extended-B>. On the other hand, the Latin script does not
1209contain all the characters of the C<Basic Latin> block (also known as
1210ASCII): it includes only the letters, and not, for example, the digits
1211or the punctuation.
1212
1213For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1214
1215For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1216
1217=head2 B<Matching Scripts and Blocks>
1218
1219Scripts are matched with the regular-expression construct
1220C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1221while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1222any of the 256 code points in the Tibetan block).
1223
1224
3aa957f9 1225=head2 Implementation Note
32c16050 1226
ad9cab37
JH
1227The first use of charinfo() opens a read-only filehandle to the Unicode
1228Character Database (the database is included in the Perl distribution).
78bf21c2
JH
1229The filehandle is then kept open for further queries. In other words,
1230if you are wondering where one of your filehandles went, that's where.
32c16050 1231
8b731da2
JH
1232=head1 BUGS
1233
1234Does not yet support EBCDIC platforms.
1235
a452d459
KW
1236L</compexcl()> should give a complete list of excluded code points.
1237
561c79ed
JH
1238=head1 AUTHOR
1239
1240Jarkko Hietaniemi
1241
1242=cut
1243
12441;