This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Removing invalid port L<> links from Version documentation updates.
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
a452d459 6our $VERSION = '0.27';
561c79ed 7
741297c1
JH
8use Storable qw(dclone);
9
561c79ed
JH
10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2
JH
14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201 17 charinrange
ea508aee 18 general_categories bidi_types
b08cd201 19 compexcl
a2bd7410
JH
20 casefold casespec
21 namedseq);
561c79ed
JH
22
23use Carp;
24
25=head1 NAME
26
55d7b906 27Unicode::UCD - Unicode character database
561c79ed
JH
28
29=head1 SYNOPSIS
30
55d7b906 31 use Unicode::UCD 'charinfo';
b08cd201 32 my $charinfo = charinfo($codepoint);
561c79ed 33
956cae9a
KW
34 use Unicode::UCD 'casefold';
35 my $casefold = casefold(0xFB00);
36
5d8e6e41
KW
37 use Unicode::UCD 'casespec';
38 my $casespec = casespec(0xFB00);
39
55d7b906 40 use Unicode::UCD 'charblock';
e882dd67
JH
41 my $charblock = charblock($codepoint);
42
55d7b906 43 use Unicode::UCD 'charscript';
65044554 44 my $charscript = charscript($codepoint);
561c79ed 45
55d7b906 46 use Unicode::UCD 'charblocks';
e145285f
JH
47 my $charblocks = charblocks();
48
55d7b906 49 use Unicode::UCD 'charscripts';
ea508aee 50 my $charscripts = charscripts();
e145285f 51
55d7b906 52 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
53 my $range = charscript($script);
54 print "looks like $script\n" if charinrange($range, $codepoint);
55
ea508aee
JH
56 use Unicode::UCD qw(general_categories bidi_types);
57 my $categories = general_categories();
58 my $types = bidi_types();
59
55d7b906 60 use Unicode::UCD 'compexcl';
e145285f
JH
61 my $compexcl = compexcl($codepoint);
62
a2bd7410
JH
63 use Unicode::UCD 'namedseq';
64 my $namedseq = namedseq($named_sequence_name);
65
55d7b906 66 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 67
561c79ed
JH
68=head1 DESCRIPTION
69
a452d459
KW
70The Unicode::UCD module offers a series of functions that
71provide a simple interface to the Unicode
8b731da2 72Character Database.
561c79ed 73
a452d459
KW
74=head2 code point argument
75
76Some of the functions are called with a I<code point argument>, which is either
77a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
78followed by hexadecimals designating a Unicode code point. In other words, if
79you want a code point to be interpreted as a hexadecimal number, you must
80prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
81interpreted as a decimal code point. Also note that Unicode is B<not> limited
82to 16 bits (the number of Unicode code points is open-ended, in theory
83unlimited): you may have more than 4 hexdigits.
561c79ed
JH
84=cut
85
10a6ecd2
JH
86my $UNICODEFH;
87my $BLOCKSFH;
88my $SCRIPTSFH;
89my $VERSIONFH;
b08cd201
JH
90my $COMPEXCLFH;
91my $CASEFOLDFH;
92my $CASESPECFH;
a2bd7410 93my $NAMEDSEQFH;
561c79ed
JH
94
95sub openunicode {
96 my ($rfh, @path) = @_;
97 my $f;
98 unless (defined $$rfh) {
99 for my $d (@INC) {
100 use File::Spec;
55d7b906 101 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 102 last if open($$rfh, $f);
e882dd67 103 undef $f;
561c79ed 104 }
e882dd67
JH
105 croak __PACKAGE__, ": failed to find ",
106 File::Spec->catfile(@path), " in @INC"
107 unless defined $f;
561c79ed
JH
108 }
109 return $f;
110}
111
a452d459 112=head2 B<charinfo()>
561c79ed 113
55d7b906 114 use Unicode::UCD 'charinfo';
561c79ed 115
b08cd201 116 my $charinfo = charinfo(0x41);
561c79ed 117
a452d459
KW
118This returns information about the input L</code point argument>
119as a reference to a hash of fields as defined by the Unicode
120standard. If the L</code point argument> is not assigned in the standard
121(i.e., has the general category C<Cn> meaning C<Unassigned>)
122or is a non-character (meaning it is guaranteed to never be assigned in
123the standard),
124B<undef> is returned.
125
126Fields that aren't applicable to the particular code point argument exist in the
127returned hash, and are empty.
128
129The keys in the hash with the meanings of their values are:
130
131=over
132
133=item B<code>
134
135the input L</code point argument> expressed in hexadecimal, with leading zeros
136added if necessary to make it contain at least four hexdigits
137
138=item B<name>
139
140name of I<code>, all IN UPPER CASE.
141Some control-type code points do not have names.
142This field will be empty for C<Surrogate> and C<Private Use> code points,
143and for the others without a name,
144it will contain a description enclosed in angle brackets, like
145C<E<lt>controlE<gt>>.
146
147
148=item B<category>
149
150The short name of the general category of I<code>.
151This will match one of the keys in the hash returned by L</general_categories()>.
152
153=item B<combining>
154
155the combining class number for I<code> used in the Canonical Ordering Algorithm.
156For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
157available at
158L<http://www.unicode.org/versions/Unicode5.1.0/>
159
160=item B<bidi>
161
162bidirectional type of I<code>.
163This will match one of the keys in the hash returned by L</bidi_types()>.
164
165=item B<decomposition>
166
167is empty if I<code> has no decomposition; or is one or more codes
168(separated by spaces) that taken in order represent a decomposition for
169I<code>. Each has at least four hexdigits.
170The codes may be preceded by a word enclosed in angle brackets then a space,
171like C<E<lt>compatE<gt> >, giving the type of decomposition
172
173=item B<decimal>
174
175if I<code> is a decimal digit this is its integer numeric value
176
177=item B<digit>
178
179if I<code> represents a whole number, this is its integer numeric value
180
181=item B<numeric>
182
183if I<code> represents a whole or rational number, this is its numeric value.
184Rational values are expressed as a string like C<1/4>.
185
186=item B<mirrored>
187
188C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
189
190=item B<unicode10>
191
192name of I<code> in the Unicode 1.0 standard if one
193existed for this code point and is different from the current name
194
195=item B<comment>
196
197ISO 10646 comment field.
198It appears in parentheses in the ISO 10646 names list,
199or contains an asterisk to indicate there is
200a note for this code point in Annex P of that standard.
201
202=item B<upper>
203
204is empty if there is no single code point uppercase mapping for I<code>;
205otherwise it is that mapping expressed as at least four hexdigits.
206(L</casespec()> should be used in addition to B<charinfo()>
207for case mappings when the calling program can cope with multiple code point
208mappings.)
209
210=item B<lower>
211
212is empty if there is no single code point lowercase mapping for I<code>;
213otherwise it is that mapping expressed as at least four hexdigits.
214(L</casespec()> should be used in addition to B<charinfo()>
215for case mappings when the calling program can cope with multiple code point
216mappings.)
217
218=item B<title>
219
220is empty if there is no single code point titlecase mapping for I<code>;
221otherwise it is that mapping expressed as at least four hexdigits.
222(L</casespec()> should be used in addition to B<charinfo()>
223for case mappings when the calling program can cope with multiple code point
224mappings.)
225
226=item B<block>
227
228block I<code> belongs to (used in \p{In...}).
229See L</Blocks versus Scripts>.
230
231
232=item B<script>
233
234script I<code> belongs to.
235See L</Blocks versus Scripts>.
236
237=back
32c16050
JH
238
239Note that you cannot do (de)composition and casing based solely on the
a452d459
KW
240I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
241you will need also the L</compexcl()>, and L</casespec()> functions.
561c79ed
JH
242
243=cut
244
0616d9cf 245# NB: This function is duplicated in charnames.pm
10a6ecd2
JH
246sub _getcode {
247 my $arg = shift;
248
dc0a4417 249 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 250 return $arg;
dc0a4417 251 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
252 return hex($1);
253 }
254
255 return;
256}
257
ac5ea531
JH
258# Lingua::KO::Hangul::Util not part of the standard distribution
259# but it will be used if available.
260
261eval { require Lingua::KO::Hangul::Util };
262my $hasHangulUtil = ! $@;
263if ($hasHangulUtil) {
264 Lingua::KO::Hangul::Util->import();
265}
9087a70b
TS
266
267sub hangul_decomp { # internal: called from charinfo
ac5ea531
JH
268 if ($hasHangulUtil) {
269 my @tmp = decomposeHangul(shift);
270 return sprintf("%04X %04X", @tmp) if @tmp == 2;
271 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
272 }
273 return;
274}
275
276sub hangul_charname { # internal: called from charinfo
277 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b
TS
278}
279
9087a70b
TS
280sub han_charname { # internal: called from charinfo
281 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b
TS
282}
283
5d8e6e41 284# Overwritten by data in file
324f9e44
RB
285my %first_last = (
286 'CJK Ideograph Extension A' => [ 0x3400, 0x4DB5 ],
287 'CJK Ideograph' => [ 0x4E00, 0x9FA5 ],
288 'CJK Ideograph Extension B' => [ 0x20000, 0x2A6D6 ],
289);
290
291get_charinfo_ranges();
292
293sub get_charinfo_ranges {
294 my @blocks = keys %first_last;
295
296 my $fh;
297 openunicode( \$fh, 'UnicodeData.txt' );
298 if( defined $fh ){
299 while( my $line = <$fh> ){
300 next unless $line =~ /(?:First|Last)/;
301 if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
302 my ($number,$block,$type);
303 ($number,$block) = split /;/, $line;
304 $block =~ s/<|>//g;
305 ($block,$type) = split /, /, $block;
306 my $index = $type eq 'First' ? 0 : 1;
307 $first_last{ $block }->[$index] = hex $number;
308 }
309 }
310 }
311}
312
a6fa416b
TS
313my @CharinfoRanges = (
314# block name
315# [ first, last, coderef to name, coderef to decompose ],
316# CJK Ideographs Extension A
324f9e44 317 [ @{ $first_last{'CJK Ideograph Extension A'} }, \&han_charname, undef ],
a6fa416b 318# CJK Ideographs
324f9e44 319 [ @{ $first_last{'CJK Ideograph'} }, \&han_charname, undef ],
a6fa416b 320# Hangul Syllables
ac5ea531 321 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b
TS
322# Non-Private Use High Surrogates
323 [ 0xD800, 0xDB7F, undef, undef ],
324# Private Use High Surrogates
325 [ 0xDB80, 0xDBFF, undef, undef ],
326# Low Surrogates
327 [ 0xDC00, 0xDFFF, undef, undef ],
328# The Private Use Area
329 [ 0xE000, 0xF8FF, undef, undef ],
330# CJK Ideographs Extension B
324f9e44 331 [ @{ $first_last{'CJK Ideograph Extension B'} }, \&han_charname, undef ],
a6fa416b
TS
332# Plane 15 Private Use Area
333 [ 0xF0000, 0xFFFFD, undef, undef ],
334# Plane 16 Private Use Area
335 [ 0x100000, 0x10FFFD, undef, undef ],
336);
337
561c79ed 338sub charinfo {
10a6ecd2
JH
339 my $arg = shift;
340 my $code = _getcode($arg);
341 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
342 unless defined $code;
e63dbbf9 343 my $hexk = sprintf("%06X", $code);
a6fa416b
TS
344 my($rcode,$rname,$rdec);
345 foreach my $range (@CharinfoRanges){
74f8133e 346 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 347 $rcode = $hexk;
e63dbbf9
JH
348 $rcode =~ s/^0+//;
349 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
TS
350 $rname = $range->[2] ? $range->[2]->($code) : '';
351 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 352 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
TS
353 last;
354 }
355 }
551b6b6f 356 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 357 if (defined $UNICODEFH) {
e63dbbf9
JH
358 use Search::Dict 1.02;
359 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 360 my $line = <$UNICODEFH>;
c5a29f40 361 return unless defined $line;
561c79ed
JH
362 chomp $line;
363 my %prop;
364 @prop{qw(
365 code name category
366 combining bidi decomposition
367 decimal digit numeric
368 mirrored unicode10 comment
369 upper lower title
370 )} = split(/;/, $line, -1);
e63dbbf9
JH
371 $hexk =~ s/^0+//;
372 $hexk = sprintf("%04X", hex($hexk));
561c79ed 373 if ($prop{code} eq $hexk) {
a196fbfd
JH
374 $prop{block} = charblock($code);
375 $prop{script} = charscript($code);
a6fa416b
TS
376 if(defined $rname){
377 $prop{code} = $rcode;
378 $prop{name} = $rname;
379 $prop{decomposition} = $rdec;
380 }
b08cd201 381 return \%prop;
561c79ed
JH
382 }
383 }
384 }
385 return;
386}
387
e882dd67
JH
388sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
389 my ($table, $lo, $hi, $code) = @_;
390
391 return if $lo > $hi;
392
393 my $mid = int(($lo+$hi) / 2);
394
395 if ($table->[$mid]->[0] < $code) {
10a6ecd2 396 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
397 return $table->[$mid]->[2];
398 } else {
399 _search($table, $mid + 1, $hi, $code);
400 }
401 } elsif ($table->[$mid]->[0] > $code) {
402 _search($table, $lo, $mid - 1, $code);
403 } else {
404 return $table->[$mid]->[2];
405 }
406}
407
10a6ecd2
JH
408sub charinrange {
409 my ($range, $arg) = @_;
410 my $code = _getcode($arg);
411 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
412 unless defined $code;
413 _search($range, 0, $#$range, $code);
414}
415
a452d459 416=head2 B<charblock()>
561c79ed 417
55d7b906 418 use Unicode::UCD 'charblock';
561c79ed
JH
419
420 my $charblock = charblock(0x41);
10a6ecd2 421 my $charblock = charblock(1234);
a452d459 422 my $charblock = charblock(0x263a);
10a6ecd2
JH
423 my $charblock = charblock("U+263a");
424
78bf21c2 425 my $range = charblock('Armenian');
10a6ecd2 426
a452d459
KW
427With a L</code point argument> charblock() returns the I<block> the code point
428belongs to, e.g. C<Basic Latin>.
429If the code point is unassigned, this returns the block it would belong to if
430it were assigned (which it may in future versions of the Unicode Standard).
10a6ecd2 431
78bf21c2
JH
432See also L</Blocks versus Scripts>.
433
eb0cc9e3 434If supplied with an argument that can't be a code point, charblock() tries
a452d459 435to do the opposite and interpret the argument as a code point block. The
eb0cc9e3 436return value is a I<range>: an anonymous list of lists that contain
a2bd7410 437I<start-of-range>, I<end-of-range> code point pairs. You can test whether
a452d459
KW
438a code point is in a range using the L</charinrange()> function. If the
439argument is not a known code point block, B<undef> is returned.
561c79ed 440
561c79ed
JH
441=cut
442
443my @BLOCKS;
10a6ecd2 444my %BLOCKS;
561c79ed 445
10a6ecd2 446sub _charblocks {
561c79ed 447 unless (@BLOCKS) {
10a6ecd2 448 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 449 local $_;
10a6ecd2 450 while (<$BLOCKSFH>) {
2796c109 451 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
452 my ($lo, $hi) = (hex($1), hex($2));
453 my $subrange = [ $lo, $hi, $3 ];
454 push @BLOCKS, $subrange;
455 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
456 }
457 }
10a6ecd2 458 close($BLOCKSFH);
561c79ed
JH
459 }
460 }
10a6ecd2
JH
461}
462
463sub charblock {
464 my $arg = shift;
465
466 _charblocks() unless @BLOCKS;
467
468 my $code = _getcode($arg);
561c79ed 469
10a6ecd2
JH
470 if (defined $code) {
471 _search(\@BLOCKS, 0, $#BLOCKS, $code);
472 } else {
473 if (exists $BLOCKS{$arg}) {
741297c1 474 return dclone $BLOCKS{$arg};
10a6ecd2
JH
475 } else {
476 return;
477 }
478 }
e882dd67
JH
479}
480
a452d459 481=head2 B<charscript()>
e882dd67 482
55d7b906 483 use Unicode::UCD 'charscript';
e882dd67
JH
484
485 my $charscript = charscript(0x41);
10a6ecd2
JH
486 my $charscript = charscript(1234);
487 my $charscript = charscript("U+263a");
e882dd67 488
78bf21c2 489 my $range = charscript('Thai');
10a6ecd2 490
a452d459
KW
491With a L</code point argument> charscript() returns the I<script> the
492code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
493If the code point is unassigned, it returns B<undef>
78bf21c2 494
eb0cc9e3 495If supplied with an argument that can't be a code point, charscript() tries
a452d459 496to do the opposite and interpret the argument as a code point script. The
eb0cc9e3
JH
497return value is a I<range>: an anonymous list of lists that contain
498I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a452d459
KW
499code point is in a range using the L</charinrange()> function. If the
500argument is not a known code point script, B<undef> is returned.
501
502See also L</Blocks versus Scripts>.
e882dd67 503
e882dd67
JH
504=cut
505
506my @SCRIPTS;
10a6ecd2 507my %SCRIPTS;
e882dd67 508
10a6ecd2 509sub _charscripts {
e882dd67 510 unless (@SCRIPTS) {
10a6ecd2 511 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 512 local $_;
10a6ecd2 513 while (<$SCRIPTSFH>) {
e882dd67 514 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
515 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
516 my $script = lc($3);
517 $script =~ s/\b(\w)/uc($1)/ge;
518 my $subrange = [ $lo, $hi, $script ];
519 push @SCRIPTS, $subrange;
520 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
521 }
522 }
10a6ecd2 523 close($SCRIPTSFH);
e882dd67
JH
524 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
525 }
526 }
10a6ecd2
JH
527}
528
529sub charscript {
530 my $arg = shift;
531
532 _charscripts() unless @SCRIPTS;
e882dd67 533
10a6ecd2
JH
534 my $code = _getcode($arg);
535
536 if (defined $code) {
537 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
538 } else {
539 if (exists $SCRIPTS{$arg}) {
741297c1 540 return dclone $SCRIPTS{$arg};
10a6ecd2
JH
541 } else {
542 return;
543 }
544 }
545}
546
a452d459 547=head2 B<charblocks()>
10a6ecd2 548
55d7b906 549 use Unicode::UCD 'charblocks';
10a6ecd2 550
b08cd201 551 my $charblocks = charblocks();
10a6ecd2 552
b08cd201 553charblocks() returns a reference to a hash with the known block names
a452d459 554as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 555
78bf21c2
JH
556See also L</Blocks versus Scripts>.
557
10a6ecd2
JH
558=cut
559
560sub charblocks {
b08cd201 561 _charblocks() unless %BLOCKS;
741297c1 562 return dclone \%BLOCKS;
10a6ecd2
JH
563}
564
a452d459 565=head2 B<charscripts()>
10a6ecd2 566
55d7b906 567 use Unicode::UCD 'charscripts';
10a6ecd2 568
ea508aee 569 my $charscripts = charscripts();
10a6ecd2 570
ea508aee 571charscripts() returns a reference to a hash with the known script
a452d459 572names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 573the values.
10a6ecd2 574
78bf21c2
JH
575See also L</Blocks versus Scripts>.
576
10a6ecd2
JH
577=cut
578
579sub charscripts {
b08cd201 580 _charscripts() unless %SCRIPTS;
741297c1 581 return dclone \%SCRIPTS;
561c79ed
JH
582}
583
a452d459 584=head2 B<charinrange()>
10a6ecd2
JH
585
586In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
587can also test whether a code point is in the I<range> as returned by
a452d459
KW
588L</charblock()> and L</charscript()> or as the values of the hash returned
589by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 590
55d7b906 591 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
592
593 $range = charscript('Hiragana');
e145285f 594 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
595
596=cut
597
ea508aee
JH
598my %GENERAL_CATEGORIES =
599 (
600 'L' => 'Letter',
601 'LC' => 'CasedLetter',
602 'Lu' => 'UppercaseLetter',
603 'Ll' => 'LowercaseLetter',
604 'Lt' => 'TitlecaseLetter',
605 'Lm' => 'ModifierLetter',
606 'Lo' => 'OtherLetter',
607 'M' => 'Mark',
608 'Mn' => 'NonspacingMark',
609 'Mc' => 'SpacingMark',
610 'Me' => 'EnclosingMark',
611 'N' => 'Number',
612 'Nd' => 'DecimalNumber',
613 'Nl' => 'LetterNumber',
614 'No' => 'OtherNumber',
615 'P' => 'Punctuation',
616 'Pc' => 'ConnectorPunctuation',
617 'Pd' => 'DashPunctuation',
618 'Ps' => 'OpenPunctuation',
619 'Pe' => 'ClosePunctuation',
620 'Pi' => 'InitialPunctuation',
621 'Pf' => 'FinalPunctuation',
622 'Po' => 'OtherPunctuation',
623 'S' => 'Symbol',
624 'Sm' => 'MathSymbol',
625 'Sc' => 'CurrencySymbol',
626 'Sk' => 'ModifierSymbol',
627 'So' => 'OtherSymbol',
628 'Z' => 'Separator',
629 'Zs' => 'SpaceSeparator',
630 'Zl' => 'LineSeparator',
631 'Zp' => 'ParagraphSeparator',
632 'C' => 'Other',
633 'Cc' => 'Control',
634 'Cf' => 'Format',
635 'Cs' => 'Surrogate',
636 'Co' => 'PrivateUse',
637 'Cn' => 'Unassigned',
638 );
639
640sub general_categories {
641 return dclone \%GENERAL_CATEGORIES;
642}
643
a452d459 644=head2 B<general_categories()>
ea508aee
JH
645
646 use Unicode::UCD 'general_categories';
647
648 my $categories = general_categories();
649
a452d459 650This returns a reference to a hash which has short
ea508aee
JH
651general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
652names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
653C<Symbol>) as values. The hash is reversible in case you need to go
654from the long names to the short names. The general category is the
a452d459
KW
655one returned from
656L</charinfo()> under the C<category> key.
ea508aee
JH
657
658=cut
659
660my %BIDI_TYPES =
661 (
662 'L' => 'Left-to-Right',
663 'LRE' => 'Left-to-Right Embedding',
664 'LRO' => 'Left-to-Right Override',
665 'R' => 'Right-to-Left',
666 'AL' => 'Right-to-Left Arabic',
667 'RLE' => 'Right-to-Left Embedding',
668 'RLO' => 'Right-to-Left Override',
669 'PDF' => 'Pop Directional Format',
670 'EN' => 'European Number',
671 'ES' => 'European Number Separator',
672 'ET' => 'European Number Terminator',
673 'AN' => 'Arabic Number',
674 'CS' => 'Common Number Separator',
675 'NSM' => 'Non-Spacing Mark',
676 'BN' => 'Boundary Neutral',
677 'B' => 'Paragraph Separator',
678 'S' => 'Segment Separator',
679 'WS' => 'Whitespace',
680 'ON' => 'Other Neutrals',
681 );
682
a452d459 683=head2 B<bidi_types()>
ea508aee
JH
684
685 use Unicode::UCD 'bidi_types';
686
687 my $categories = bidi_types();
688
a452d459 689This returns a reference to a hash which has the short
ea508aee
JH
690bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
691names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
692hash is reversible in case you need to go from the long names to the
a452d459
KW
693short names. The bidi type is the one returned from
694L</charinfo()>
ea508aee
JH
695under the C<bidi> key. For the exact meaning of the various bidi classes
696the Unicode TR9 is recommended reading:
a452d459 697L<http://www.unicode.org/reports/tr9/>
ea508aee
JH
698(as of Unicode 5.0.0)
699
700=cut
701
a452d459
KW
702sub bidi_types {
703 return dclone \%BIDI_TYPES;
704}
705
706=head2 B<compexcl()>
b08cd201 707
55d7b906 708 use Unicode::UCD 'compexcl';
b08cd201 709
a452d459 710 my $compexcl = compexcl(0x09dc);
b08cd201 711
a452d459
KW
712This returns B<true> if the
713L</code point argument> should not be produced by composition normalization,
714B<AND> if that fact is not otherwise determinable from the Unicode data base.
715It currently does not return B<true> if the code point has a decomposition
716consisting of another single code point, nor if its decomposition starts
717with a code point whose combining class is non-zero. Code points that meet
718either of these conditions should also not be produced by composition
719normalization.
b08cd201 720
a452d459 721It returns B<false> otherwise.
b08cd201
JH
722
723=cut
724
725my %COMPEXCL;
726
727sub _compexcl {
728 unless (%COMPEXCL) {
551b6b6f 729 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 730 local $_;
b08cd201 731 while (<$COMPEXCLFH>) {
822ebcc8 732 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201
JH
733 my $code = hex($1);
734 $COMPEXCL{$code} = undef;
735 }
736 }
737 close($COMPEXCLFH);
738 }
739 }
740}
741
742sub compexcl {
743 my $arg = shift;
744 my $code = _getcode($arg);
74f8133e
JH
745 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
746 unless defined $code;
b08cd201
JH
747
748 _compexcl() unless %COMPEXCL;
749
750 return exists $COMPEXCL{$code};
751}
752
a452d459 753=head2 B<casefold()>
b08cd201 754
55d7b906 755 use Unicode::UCD 'casefold';
b08cd201 756
a452d459
KW
757 my $casefold = casefold(0xDF);
758 if (defined $casefold) {
759 my @full_fold_hex = split / /, $casefold->{'full'};
760 my $full_fold_string =
761 join "", map {chr(hex($_))} @full_fold_hex;
762 my @turkic_fold_hex =
763 split / /, ($casefold->{'turkic'} ne "")
764 ? $casefold->{'turkic'}
765 : $casefold->{'full'};
766 my $turkic_fold_string =
767 join "", map {chr(hex($_))} @turkic_fold_hex;
768 }
769 if (defined $casefold && $casefold->{'simple'} ne "") {
770 my $simple_fold_hex = $casefold->{'simple'};
771 my $simple_fold_string = chr(hex($simple_fold_hex));
772 }
b08cd201 773
a452d459
KW
774This returns the (almost) locale-independent case folding of the
775character specified by the L</code point argument>.
b08cd201 776
a452d459
KW
777If there is no case folding for that code point, B<undef> is returned.
778
779If there is a case folding for that code point, a reference to a hash
b08cd201
JH
780with the following fields is returned:
781
a452d459
KW
782=over
783
784=item B<code>
785
786the input L</code point argument> expressed in hexadecimal, with leading zeros
787added if necessary to make it contain at least four hexdigits
788
789=item B<full>
790
791one or more codes (separated by spaces) that taken in order give the
792code points for the case folding for I<code>.
793Each has at least four hexdigits.
794
795=item B<simple>
796
797is empty, or is exactly one code with at least four hexdigits which can be used
798as an alternative case folding when the calling program cannot cope with the
799fold being a sequence of multiple code points. If I<full> is just one code
800point, then I<simple> equals I<full>. If there is no single code point folding
801defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
802inferior, but still better-than-nothing alternative folding to I<full>.
803
804=item B<mapping>
805
806is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
807otherwise. It can be considered to be the simplest possible folding for
808I<code>. It is defined primarily for backwards compatibility.
809
810=item B<status>
b08cd201 811
a452d459
KW
812is C<C> (for C<common>) if the best possible fold is a single code point
813(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
814folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
815there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty). Note
816that this
817describes the contents of I<mapping>. It is defined primarily for backwards
818compatibility.
b08cd201 819
a452d459
KW
820On versions 3.1 and earlier of Unicode, I<status> can also be
821C<I> which is the same as C<C> but is a special case for dotted uppercase I and
822dotless lowercase i:
b08cd201 823
a452d459 824=over
b08cd201 825
a452d459
KW
826=item B<*>
827
828If you use this C<I> mapping, the result is case-insensitive,
829but dotless and dotted I's are not distinguished
830
831=item B<*>
832
833If you exclude this C<I> mapping, the result is not fully case-insensitive, but
834dotless and dotted I's are distinguished
835
836=back
837
838=item B<turkic>
839
840contains any special folding for Turkic languages. For versions of Unicode
841starting with 3.2, this field is empty unless I<code> has a different folding
842in Turkic languages, in which case it is one or more codes (separated by
843spaces) that taken in order give the code points for the case folding for
844I<code> in those languages.
845Each code has at least four hexdigits.
846Note that this folding does not maintain canonical equivalence without
847additional processing.
848
849For versions of Unicode 3.1 and earlier, this field is empty unless there is a
850special folding for Turkic languages, in which case I<status> is C<I>, and
851I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
852
853=back
854
855Programs that want complete generality and the best folding results should use
856the folding contained in the I<full> field. But note that the fold for some
857code points will be a sequence of multiple code points.
858
859Programs that can't cope with the fold mapping being multiple code points can
860use the folding contained in the I<simple> field, with the loss of some
861generality. In Unicode 5.1, about 7% of the defined foldings have no single
862code point folding.
863
864The I<mapping> and I<status> fields are provided for backwards compatibility for
865existing programs. They contain the same values as in previous versions of
866this function.
867
868Locale is not completely independent. The I<turkic> field contains results to
869use when the locale is a Turkic language.
b08cd201
JH
870
871For more information about case mappings see
a452d459 872L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
873
874=cut
875
876my %CASEFOLD;
877
878sub _casefold {
879 unless (%CASEFOLD) {
551b6b6f 880 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 881 local $_;
b08cd201 882 while (<$CASEFOLDFH>) {
a452d459 883 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
b08cd201 884 my $code = hex($1);
a452d459
KW
885 $CASEFOLD{$code}{'code'} = $1;
886 $CASEFOLD{$code}{'turkic'} = "" unless
887 defined $CASEFOLD{$code}{'turkic'};
888 if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and
889 # earlier Unicodes
890 # Both entries there (I
891 # only checked 3.1) are
892 # the same as C, and
893 # there are no other
894 # entries for those
895 # codepoints, so treat
896 # as if C, but override
897 # the turkic one for
898 # 'I'.
899 $CASEFOLD{$code}{'status'} = $2;
900 $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
901 $CASEFOLD{$code}{'mapping'} = $3;
902 $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
903 } elsif ($2 eq 'F') {
904 $CASEFOLD{$code}{'full'} = $3;
905 unless (defined $CASEFOLD{$code}{'simple'}) {
906 $CASEFOLD{$code}{'simple'} = "";
907 $CASEFOLD{$code}{'mapping'} = $3;
908 $CASEFOLD{$code}{'status'} = $2;
909 }
910 } elsif ($2 eq 'S') {
911
912
913 # There can't be a simple without a full, and simple
914 # overrides all but full
915
916 $CASEFOLD{$code}{'simple'} = $3;
917 $CASEFOLD{$code}{'mapping'} = $3;
918 $CASEFOLD{$code}{'status'} = $2;
919 } elsif ($2 eq 'T') {
920 $CASEFOLD{$code}{'turkic'} = $3;
921 } # else can't happen because only [CIFST] are possible
b08cd201
JH
922 }
923 }
924 close($CASEFOLDFH);
925 }
926 }
927}
928
929sub casefold {
930 my $arg = shift;
931 my $code = _getcode($arg);
74f8133e
JH
932 croak __PACKAGE__, "::casefold: unknown code '$arg'"
933 unless defined $code;
b08cd201
JH
934
935 _casefold() unless %CASEFOLD;
936
937 return $CASEFOLD{$code};
938}
939
a452d459 940=head2 B<casespec()>
b08cd201 941
55d7b906 942 use Unicode::UCD 'casespec';
b08cd201 943
a452d459 944 my $casespec = casespec(0xFB00);
b08cd201 945
a452d459
KW
946This returns the potentially locale-dependent case mappings of the L</code point
947argument>. The mappings may be longer than a single code point (which the basic
948Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 949
a452d459
KW
950If there are no case mappings for the L</code point argument>, or if all three
951possible mappings (I<lower>, I<title> and I<upper>) result in single code
5d8e6e41
KW
952points and are locale independent and unconditional, B<undef> is returned
953(which means that the case mappings, if any, for the code point are those
954returned by L</charinfo()>).
a452d459
KW
955
956Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41
KW
957of such hashes, explained below) is returned with the following keys and their
958meanings:
a452d459
KW
959
960The keys in the bottom layer hash with the meanings of their values are:
961
962=over
963
964=item B<code>
965
966the input L</code point argument> expressed in hexadecimal, with leading zeros
967added if necessary to make it contain at least four hexdigits
968
969=item B<lower>
970
971one or more codes (separated by spaces) that taken in order give the
972code points for the lower case of I<code>.
973Each has at least four hexdigits.
974
975=item B<title>
b08cd201 976
a452d459
KW
977one or more codes (separated by spaces) that taken in order give the
978code points for the title case of I<code>.
979Each has at least four hexdigits.
b08cd201 980
a452d459 981=item B<lower>
b08cd201 982
a452d459
KW
983one or more codes (separated by spaces) that taken in order give the
984code points for the upper case of I<code>.
985Each has at least four hexdigits.
986
987=item B<condition>
988
989the conditions for the mappings to be valid.
990If B<undef>, the mappings are always valid.
991When defined, this field is a list of conditions,
992all of which must be true for the mappings to be valid.
993The list consists of one or more
994I<locales> (see below)
995and/or I<contexts> (explained in the next paragraph),
996separated by spaces.
997(Other than as used to separate elements, spaces are to be ignored.)
998Case distinctions in the condition list are not significant.
82c0b05b 999Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 1000
a452d459
KW
1001A I<context> is one of those defined in the Unicode standard.
1002For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1003available at
5d8e6e41
KW
1004L<http://www.unicode.org/versions/Unicode5.1.0/>.
1005These are for context-sensitive casing.
f499c386 1006
a452d459
KW
1007=back
1008
5d8e6e41
KW
1009The hash described above is returned for locale-independent casing, where
1010at least one of the mappings has length longer than one. If B<undef> is
1011returned, the code point may have mappings, but if so, all are length one,
1012and are returned by L</charinfo()>.
1013Note that when this function does return a value, it will be for the complete
1014set of mappings for a code point, even those whose length is one.
1015
1016If there are additional casing rules that apply only in certain locales,
1017an additional key for each will be defined in the returned hash. Each such key
1018will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1019followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1020and a variant code). You can find the lists of all possible locales, see
1021L<Locale::Country> and L<Locale::Language>.
a452d459
KW
1022(In Unicode 5.1, the only locales returned by this function
1023are C<lt>, C<tr>, and C<az>.)
b08cd201 1024
5d8e6e41
KW
1025Each locale key is a reference to a hash that has the form above, and gives
1026the casing rules for that particular locale, which take precedence over the
1027locale-independent ones when in that locale.
1028
1029If the only casing for a code point is locale-dependent, then the returned
1030hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1031will contain only locale keys.
1032
b08cd201 1033For more information about case mappings see
a452d459 1034L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1035
1036=cut
1037
1038my %CASESPEC;
1039
1040sub _casespec {
1041 unless (%CASESPEC) {
551b6b6f 1042 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 1043 local $_;
b08cd201
JH
1044 while (<$CASESPECFH>) {
1045 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
1046 my ($hexcode, $lower, $title, $upper, $condition) =
1047 ($1, $2, $3, $4, $5);
1048 my $code = hex($hexcode);
1049 if (exists $CASESPEC{$code}) {
1050 if (exists $CASESPEC{$code}->{code}) {
1051 my ($oldlower,
1052 $oldtitle,
1053 $oldupper,
1054 $oldcondition) =
1055 @{$CASESPEC{$code}}{qw(lower
1056 title
1057 upper
1058 condition)};
822ebcc8
JH
1059 if (defined $oldcondition) {
1060 my ($oldlocale) =
f499c386 1061 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
1062 delete $CASESPEC{$code};
1063 $CASESPEC{$code}->{$oldlocale} =
1064 { code => $hexcode,
1065 lower => $oldlower,
1066 title => $oldtitle,
1067 upper => $oldupper,
1068 condition => $oldcondition };
f499c386
JH
1069 }
1070 }
1071 my ($locale) =
1072 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1073 $CASESPEC{$code}->{$locale} =
1074 { code => $hexcode,
1075 lower => $lower,
1076 title => $title,
1077 upper => $upper,
1078 condition => $condition };
1079 } else {
1080 $CASESPEC{$code} =
1081 { code => $hexcode,
1082 lower => $lower,
1083 title => $title,
1084 upper => $upper,
1085 condition => $condition };
1086 }
b08cd201
JH
1087 }
1088 }
1089 close($CASESPECFH);
1090 }
1091 }
1092}
1093
1094sub casespec {
1095 my $arg = shift;
1096 my $code = _getcode($arg);
74f8133e
JH
1097 croak __PACKAGE__, "::casespec: unknown code '$arg'"
1098 unless defined $code;
b08cd201
JH
1099
1100 _casespec() unless %CASESPEC;
1101
741297c1 1102 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
1103}
1104
a452d459 1105=head2 B<namedseq()>
a2bd7410
JH
1106
1107 use Unicode::UCD 'namedseq';
1108
1109 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1110 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1111 my %namedseq = namedseq();
1112
1113If used with a single argument in a scalar context, returns the string
a452d459 1114consisting of the code points of the named sequence, or B<undef> if no
a2bd7410 1115named sequence by that name exists. If used with a single argument in
956cae9a
KW
1116a list context, it returns the list of the ordinals of the code points. If used
1117with no
a2bd7410
JH
1118arguments in a list context, returns a hash with the names of the
1119named sequences as the keys and the named sequences as strings as
a452d459 1120the values. Otherwise, it returns B<undef> or an empty list depending
a2bd7410
JH
1121on the context.
1122
a452d459
KW
1123This function only operates on officially approved (not provisional) named
1124sequences.
a2bd7410
JH
1125
1126=cut
1127
1128my %NAMEDSEQ;
1129
1130sub _namedseq {
1131 unless (%NAMEDSEQ) {
1132 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
1133 local $_;
1134 while (<$NAMEDSEQFH>) {
1135 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
1136 my ($n, $s) = ($1, $2);
1137 my @s = map { chr(hex($_)) } split(' ', $s);
1138 $NAMEDSEQ{$n} = join("", @s);
1139 }
1140 }
1141 close($NAMEDSEQFH);
1142 }
1143 }
1144}
1145
1146sub namedseq {
1147 _namedseq() unless %NAMEDSEQ;
1148 my $wantarray = wantarray();
1149 if (defined $wantarray) {
1150 if ($wantarray) {
1151 if (@_ == 0) {
1152 return %NAMEDSEQ;
1153 } elsif (@_ == 1) {
1154 my $s = $NAMEDSEQ{ $_[0] };
1155 return defined $s ? map { ord($_) } split('', $s) : ();
1156 }
1157 } elsif (@_ == 1) {
1158 return $NAMEDSEQ{ $_[0] };
1159 }
1160 }
1161 return;
1162}
1163
55d7b906 1164=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 1165
a452d459
KW
1166This returns the version of the Unicode Character Database, in other words, the
1167version of the Unicode standard the database implements. The version is a
1168string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
1169
1170=cut
1171
1172my $UNICODEVERSION;
1173
1174sub UnicodeVersion {
1175 unless (defined $UNICODEVERSION) {
1176 openunicode(\$VERSIONFH, "version");
1177 chomp($UNICODEVERSION = <$VERSIONFH>);
1178 close($VERSIONFH);
1179 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1180 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1181 }
1182 return $UNICODEVERSION;
1183}
3aa957f9 1184
a452d459
KW
1185=head2 B<Blocks versus Scripts>
1186
1187The difference between a block and a script is that scripts are closer
1188to the linguistic notion of a set of code points required to present
1189languages, while block is more of an artifact of the Unicode code point
1190numbering and separation into blocks of (mostly) 256 code points.
1191
1192For example the Latin B<script> is spread over several B<blocks>, such
1193as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1194C<Latin Extended-B>. On the other hand, the Latin script does not
1195contain all the characters of the C<Basic Latin> block (also known as
1196ASCII): it includes only the letters, and not, for example, the digits
1197or the punctuation.
1198
1199For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1200
1201For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1202
1203=head2 B<Matching Scripts and Blocks>
1204
1205Scripts are matched with the regular-expression construct
1206C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1207while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1208any of the 256 code points in the Tibetan block).
1209
1210
3aa957f9 1211=head2 Implementation Note
32c16050 1212
ad9cab37
JH
1213The first use of charinfo() opens a read-only filehandle to the Unicode
1214Character Database (the database is included in the Perl distribution).
78bf21c2
JH
1215The filehandle is then kept open for further queries. In other words,
1216if you are wondering where one of your filehandles went, that's where.
32c16050 1217
8b731da2
JH
1218=head1 BUGS
1219
1220Does not yet support EBCDIC platforms.
1221
a452d459
KW
1222L</compexcl()> should give a complete list of excluded code points.
1223
561c79ed
JH
1224=head1 AUTHOR
1225
1226Jarkko Hietaniemi
1227
1228=cut
1229
12301;