This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #59908] \x, \0, and \N{} not ok in double-quotish when followed by > \x100
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
a452d459 6our $VERSION = '0.27';
561c79ed 7
741297c1
JH
8use Storable qw(dclone);
9
561c79ed
JH
10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2
JH
14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201 17 charinrange
ea508aee 18 general_categories bidi_types
b08cd201 19 compexcl
a2bd7410
JH
20 casefold casespec
21 namedseq);
561c79ed
JH
22
23use Carp;
24
25=head1 NAME
26
55d7b906 27Unicode::UCD - Unicode character database
561c79ed
JH
28
29=head1 SYNOPSIS
30
55d7b906 31 use Unicode::UCD 'charinfo';
b08cd201 32 my $charinfo = charinfo($codepoint);
561c79ed 33
55d7b906 34 use Unicode::UCD 'charblock';
e882dd67
JH
35 my $charblock = charblock($codepoint);
36
55d7b906 37 use Unicode::UCD 'charscript';
65044554 38 my $charscript = charscript($codepoint);
561c79ed 39
55d7b906 40 use Unicode::UCD 'charblocks';
e145285f
JH
41 my $charblocks = charblocks();
42
55d7b906 43 use Unicode::UCD 'charscripts';
ea508aee 44 my $charscripts = charscripts();
e145285f 45
55d7b906 46 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
47 my $range = charscript($script);
48 print "looks like $script\n" if charinrange($range, $codepoint);
49
ea508aee
JH
50 use Unicode::UCD qw(general_categories bidi_types);
51 my $categories = general_categories();
52 my $types = bidi_types();
53
55d7b906 54 use Unicode::UCD 'compexcl';
e145285f
JH
55 my $compexcl = compexcl($codepoint);
56
a2bd7410
JH
57 use Unicode::UCD 'namedseq';
58 my $namedseq = namedseq($named_sequence_name);
59
55d7b906 60 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 61
561c79ed
JH
62=head1 DESCRIPTION
63
a452d459
KW
64The Unicode::UCD module offers a series of functions that
65provide a simple interface to the Unicode
8b731da2 66Character Database.
561c79ed 67
a452d459
KW
68=head2 code point argument
69
70Some of the functions are called with a I<code point argument>, which is either
71a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
72followed by hexadecimals designating a Unicode code point. In other words, if
73you want a code point to be interpreted as a hexadecimal number, you must
74prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
75interpreted as a decimal code point. Also note that Unicode is B<not> limited
76to 16 bits (the number of Unicode code points is open-ended, in theory
77unlimited): you may have more than 4 hexdigits.
561c79ed
JH
78=cut
79
10a6ecd2
JH
80my $UNICODEFH;
81my $BLOCKSFH;
82my $SCRIPTSFH;
83my $VERSIONFH;
b08cd201
JH
84my $COMPEXCLFH;
85my $CASEFOLDFH;
86my $CASESPECFH;
a2bd7410 87my $NAMEDSEQFH;
561c79ed
JH
88
89sub openunicode {
90 my ($rfh, @path) = @_;
91 my $f;
92 unless (defined $$rfh) {
93 for my $d (@INC) {
94 use File::Spec;
55d7b906 95 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 96 last if open($$rfh, $f);
e882dd67 97 undef $f;
561c79ed 98 }
e882dd67
JH
99 croak __PACKAGE__, ": failed to find ",
100 File::Spec->catfile(@path), " in @INC"
101 unless defined $f;
561c79ed
JH
102 }
103 return $f;
104}
105
a452d459 106=head2 B<charinfo()>
561c79ed 107
55d7b906 108 use Unicode::UCD 'charinfo';
561c79ed 109
b08cd201 110 my $charinfo = charinfo(0x41);
561c79ed 111
a452d459
KW
112This returns information about the input L</code point argument>
113as a reference to a hash of fields as defined by the Unicode
114standard. If the L</code point argument> is not assigned in the standard
115(i.e., has the general category C<Cn> meaning C<Unassigned>)
116or is a non-character (meaning it is guaranteed to never be assigned in
117the standard),
118B<undef> is returned.
119
120Fields that aren't applicable to the particular code point argument exist in the
121returned hash, and are empty.
122
123The keys in the hash with the meanings of their values are:
124
125=over
126
127=item B<code>
128
129the input L</code point argument> expressed in hexadecimal, with leading zeros
130added if necessary to make it contain at least four hexdigits
131
132=item B<name>
133
134name of I<code>, all IN UPPER CASE.
135Some control-type code points do not have names.
136This field will be empty for C<Surrogate> and C<Private Use> code points,
137and for the others without a name,
138it will contain a description enclosed in angle brackets, like
139C<E<lt>controlE<gt>>.
140
141
142=item B<category>
143
144The short name of the general category of I<code>.
145This will match one of the keys in the hash returned by L</general_categories()>.
146
147=item B<combining>
148
149the combining class number for I<code> used in the Canonical Ordering Algorithm.
150For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
151available at
152L<http://www.unicode.org/versions/Unicode5.1.0/>
153
154=item B<bidi>
155
156bidirectional type of I<code>.
157This will match one of the keys in the hash returned by L</bidi_types()>.
158
159=item B<decomposition>
160
161is empty if I<code> has no decomposition; or is one or more codes
162(separated by spaces) that taken in order represent a decomposition for
163I<code>. Each has at least four hexdigits.
164The codes may be preceded by a word enclosed in angle brackets then a space,
165like C<E<lt>compatE<gt> >, giving the type of decomposition
166
167=item B<decimal>
168
169if I<code> is a decimal digit this is its integer numeric value
170
171=item B<digit>
172
173if I<code> represents a whole number, this is its integer numeric value
174
175=item B<numeric>
176
177if I<code> represents a whole or rational number, this is its numeric value.
178Rational values are expressed as a string like C<1/4>.
179
180=item B<mirrored>
181
182C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
183
184=item B<unicode10>
185
186name of I<code> in the Unicode 1.0 standard if one
187existed for this code point and is different from the current name
188
189=item B<comment>
190
191ISO 10646 comment field.
192It appears in parentheses in the ISO 10646 names list,
193or contains an asterisk to indicate there is
194a note for this code point in Annex P of that standard.
195
196=item B<upper>
197
198is empty if there is no single code point uppercase mapping for I<code>;
199otherwise it is that mapping expressed as at least four hexdigits.
200(L</casespec()> should be used in addition to B<charinfo()>
201for case mappings when the calling program can cope with multiple code point
202mappings.)
203
204=item B<lower>
205
206is empty if there is no single code point lowercase mapping for I<code>;
207otherwise it is that mapping expressed as at least four hexdigits.
208(L</casespec()> should be used in addition to B<charinfo()>
209for case mappings when the calling program can cope with multiple code point
210mappings.)
211
212=item B<title>
213
214is empty if there is no single code point titlecase mapping for I<code>;
215otherwise it is that mapping expressed as at least four hexdigits.
216(L</casespec()> should be used in addition to B<charinfo()>
217for case mappings when the calling program can cope with multiple code point
218mappings.)
219
220=item B<block>
221
222block I<code> belongs to (used in \p{In...}).
223See L</Blocks versus Scripts>.
224
225
226=item B<script>
227
228script I<code> belongs to.
229See L</Blocks versus Scripts>.
230
231=back
32c16050
JH
232
233Note that you cannot do (de)composition and casing based solely on the
a452d459
KW
234I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
235you will need also the L</compexcl()>, and L</casespec()> functions.
561c79ed
JH
236
237=cut
238
0616d9cf 239# NB: This function is duplicated in charnames.pm
10a6ecd2
JH
240sub _getcode {
241 my $arg = shift;
242
dc0a4417 243 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 244 return $arg;
dc0a4417 245 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
246 return hex($1);
247 }
248
249 return;
250}
251
ac5ea531
JH
252# Lingua::KO::Hangul::Util not part of the standard distribution
253# but it will be used if available.
254
255eval { require Lingua::KO::Hangul::Util };
256my $hasHangulUtil = ! $@;
257if ($hasHangulUtil) {
258 Lingua::KO::Hangul::Util->import();
259}
9087a70b
ST
260
261sub hangul_decomp { # internal: called from charinfo
ac5ea531
JH
262 if ($hasHangulUtil) {
263 my @tmp = decomposeHangul(shift);
264 return sprintf("%04X %04X", @tmp) if @tmp == 2;
265 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
266 }
267 return;
268}
269
270sub hangul_charname { # internal: called from charinfo
271 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b
ST
272}
273
9087a70b
ST
274sub han_charname { # internal: called from charinfo
275 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b
ST
276}
277
324f9e44
RB
278my %first_last = (
279 'CJK Ideograph Extension A' => [ 0x3400, 0x4DB5 ],
280 'CJK Ideograph' => [ 0x4E00, 0x9FA5 ],
281 'CJK Ideograph Extension B' => [ 0x20000, 0x2A6D6 ],
282);
283
284get_charinfo_ranges();
285
286sub get_charinfo_ranges {
287 my @blocks = keys %first_last;
288
289 my $fh;
290 openunicode( \$fh, 'UnicodeData.txt' );
291 if( defined $fh ){
292 while( my $line = <$fh> ){
293 next unless $line =~ /(?:First|Last)/;
294 if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
295 my ($number,$block,$type);
296 ($number,$block) = split /;/, $line;
297 $block =~ s/<|>//g;
298 ($block,$type) = split /, /, $block;
299 my $index = $type eq 'First' ? 0 : 1;
300 $first_last{ $block }->[$index] = hex $number;
301 }
302 }
303 }
304}
305
a6fa416b
ST
306my @CharinfoRanges = (
307# block name
308# [ first, last, coderef to name, coderef to decompose ],
309# CJK Ideographs Extension A
324f9e44 310 [ @{ $first_last{'CJK Ideograph Extension A'} }, \&han_charname, undef ],
a6fa416b 311# CJK Ideographs
324f9e44 312 [ @{ $first_last{'CJK Ideograph'} }, \&han_charname, undef ],
a6fa416b 313# Hangul Syllables
ac5ea531 314 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b
ST
315# Non-Private Use High Surrogates
316 [ 0xD800, 0xDB7F, undef, undef ],
317# Private Use High Surrogates
318 [ 0xDB80, 0xDBFF, undef, undef ],
319# Low Surrogates
320 [ 0xDC00, 0xDFFF, undef, undef ],
321# The Private Use Area
322 [ 0xE000, 0xF8FF, undef, undef ],
323# CJK Ideographs Extension B
324f9e44 324 [ @{ $first_last{'CJK Ideograph Extension B'} }, \&han_charname, undef ],
a6fa416b
ST
325# Plane 15 Private Use Area
326 [ 0xF0000, 0xFFFFD, undef, undef ],
327# Plane 16 Private Use Area
328 [ 0x100000, 0x10FFFD, undef, undef ],
329);
330
561c79ed 331sub charinfo {
10a6ecd2
JH
332 my $arg = shift;
333 my $code = _getcode($arg);
334 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
335 unless defined $code;
e63dbbf9 336 my $hexk = sprintf("%06X", $code);
a6fa416b
ST
337 my($rcode,$rname,$rdec);
338 foreach my $range (@CharinfoRanges){
74f8133e 339 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 340 $rcode = $hexk;
e63dbbf9
JH
341 $rcode =~ s/^0+//;
342 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
ST
343 $rname = $range->[2] ? $range->[2]->($code) : '';
344 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 345 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
ST
346 last;
347 }
348 }
551b6b6f 349 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 350 if (defined $UNICODEFH) {
e63dbbf9
JH
351 use Search::Dict 1.02;
352 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 353 my $line = <$UNICODEFH>;
c5a29f40 354 return unless defined $line;
561c79ed
JH
355 chomp $line;
356 my %prop;
357 @prop{qw(
358 code name category
359 combining bidi decomposition
360 decimal digit numeric
361 mirrored unicode10 comment
362 upper lower title
363 )} = split(/;/, $line, -1);
e63dbbf9
JH
364 $hexk =~ s/^0+//;
365 $hexk = sprintf("%04X", hex($hexk));
561c79ed 366 if ($prop{code} eq $hexk) {
a196fbfd
JH
367 $prop{block} = charblock($code);
368 $prop{script} = charscript($code);
a6fa416b
ST
369 if(defined $rname){
370 $prop{code} = $rcode;
371 $prop{name} = $rname;
372 $prop{decomposition} = $rdec;
373 }
b08cd201 374 return \%prop;
561c79ed
JH
375 }
376 }
377 }
378 return;
379}
380
e882dd67
JH
381sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
382 my ($table, $lo, $hi, $code) = @_;
383
384 return if $lo > $hi;
385
386 my $mid = int(($lo+$hi) / 2);
387
388 if ($table->[$mid]->[0] < $code) {
10a6ecd2 389 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
390 return $table->[$mid]->[2];
391 } else {
392 _search($table, $mid + 1, $hi, $code);
393 }
394 } elsif ($table->[$mid]->[0] > $code) {
395 _search($table, $lo, $mid - 1, $code);
396 } else {
397 return $table->[$mid]->[2];
398 }
399}
400
10a6ecd2
JH
401sub charinrange {
402 my ($range, $arg) = @_;
403 my $code = _getcode($arg);
404 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
405 unless defined $code;
406 _search($range, 0, $#$range, $code);
407}
408
a452d459 409=head2 B<charblock()>
561c79ed 410
55d7b906 411 use Unicode::UCD 'charblock';
561c79ed
JH
412
413 my $charblock = charblock(0x41);
10a6ecd2 414 my $charblock = charblock(1234);
a452d459 415 my $charblock = charblock(0x263a);
10a6ecd2
JH
416 my $charblock = charblock("U+263a");
417
78bf21c2 418 my $range = charblock('Armenian');
10a6ecd2 419
a452d459
KW
420With a L</code point argument> charblock() returns the I<block> the code point
421belongs to, e.g. C<Basic Latin>.
422If the code point is unassigned, this returns the block it would belong to if
423it were assigned (which it may in future versions of the Unicode Standard).
10a6ecd2 424
78bf21c2
JH
425See also L</Blocks versus Scripts>.
426
eb0cc9e3 427If supplied with an argument that can't be a code point, charblock() tries
a452d459 428to do the opposite and interpret the argument as a code point block. The
eb0cc9e3 429return value is a I<range>: an anonymous list of lists that contain
a2bd7410 430I<start-of-range>, I<end-of-range> code point pairs. You can test whether
a452d459
KW
431a code point is in a range using the L</charinrange()> function. If the
432argument is not a known code point block, B<undef> is returned.
561c79ed 433
561c79ed
JH
434=cut
435
436my @BLOCKS;
10a6ecd2 437my %BLOCKS;
561c79ed 438
10a6ecd2 439sub _charblocks {
561c79ed 440 unless (@BLOCKS) {
10a6ecd2 441 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 442 local $_;
10a6ecd2 443 while (<$BLOCKSFH>) {
2796c109 444 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
445 my ($lo, $hi) = (hex($1), hex($2));
446 my $subrange = [ $lo, $hi, $3 ];
447 push @BLOCKS, $subrange;
448 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
449 }
450 }
10a6ecd2 451 close($BLOCKSFH);
561c79ed
JH
452 }
453 }
10a6ecd2
JH
454}
455
456sub charblock {
457 my $arg = shift;
458
459 _charblocks() unless @BLOCKS;
460
461 my $code = _getcode($arg);
561c79ed 462
10a6ecd2
JH
463 if (defined $code) {
464 _search(\@BLOCKS, 0, $#BLOCKS, $code);
465 } else {
466 if (exists $BLOCKS{$arg}) {
741297c1 467 return dclone $BLOCKS{$arg};
10a6ecd2
JH
468 } else {
469 return;
470 }
471 }
e882dd67
JH
472}
473
a452d459 474=head2 B<charscript()>
e882dd67 475
55d7b906 476 use Unicode::UCD 'charscript';
e882dd67
JH
477
478 my $charscript = charscript(0x41);
10a6ecd2
JH
479 my $charscript = charscript(1234);
480 my $charscript = charscript("U+263a");
e882dd67 481
78bf21c2 482 my $range = charscript('Thai');
10a6ecd2 483
a452d459
KW
484With a L</code point argument> charscript() returns the I<script> the
485code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
486If the code point is unassigned, it returns B<undef>
78bf21c2 487
eb0cc9e3 488If supplied with an argument that can't be a code point, charscript() tries
a452d459 489to do the opposite and interpret the argument as a code point script. The
eb0cc9e3
JH
490return value is a I<range>: an anonymous list of lists that contain
491I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a452d459
KW
492code point is in a range using the L</charinrange()> function. If the
493argument is not a known code point script, B<undef> is returned.
494
495See also L</Blocks versus Scripts>.
e882dd67 496
e882dd67
JH
497=cut
498
499my @SCRIPTS;
10a6ecd2 500my %SCRIPTS;
e882dd67 501
10a6ecd2 502sub _charscripts {
e882dd67 503 unless (@SCRIPTS) {
10a6ecd2 504 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 505 local $_;
10a6ecd2 506 while (<$SCRIPTSFH>) {
e882dd67 507 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
508 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
509 my $script = lc($3);
510 $script =~ s/\b(\w)/uc($1)/ge;
511 my $subrange = [ $lo, $hi, $script ];
512 push @SCRIPTS, $subrange;
513 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
514 }
515 }
10a6ecd2 516 close($SCRIPTSFH);
e882dd67
JH
517 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
518 }
519 }
10a6ecd2
JH
520}
521
522sub charscript {
523 my $arg = shift;
524
525 _charscripts() unless @SCRIPTS;
e882dd67 526
10a6ecd2
JH
527 my $code = _getcode($arg);
528
529 if (defined $code) {
530 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
531 } else {
532 if (exists $SCRIPTS{$arg}) {
741297c1 533 return dclone $SCRIPTS{$arg};
10a6ecd2
JH
534 } else {
535 return;
536 }
537 }
538}
539
a452d459 540=head2 B<charblocks()>
10a6ecd2 541
55d7b906 542 use Unicode::UCD 'charblocks';
10a6ecd2 543
b08cd201 544 my $charblocks = charblocks();
10a6ecd2 545
b08cd201 546charblocks() returns a reference to a hash with the known block names
a452d459 547as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 548
78bf21c2
JH
549See also L</Blocks versus Scripts>.
550
10a6ecd2
JH
551=cut
552
553sub charblocks {
b08cd201 554 _charblocks() unless %BLOCKS;
741297c1 555 return dclone \%BLOCKS;
10a6ecd2
JH
556}
557
a452d459 558=head2 B<charscripts()>
10a6ecd2 559
55d7b906 560 use Unicode::UCD 'charscripts';
10a6ecd2 561
ea508aee 562 my $charscripts = charscripts();
10a6ecd2 563
ea508aee 564charscripts() returns a reference to a hash with the known script
a452d459 565names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 566the values.
10a6ecd2 567
78bf21c2
JH
568See also L</Blocks versus Scripts>.
569
10a6ecd2
JH
570=cut
571
572sub charscripts {
b08cd201 573 _charscripts() unless %SCRIPTS;
741297c1 574 return dclone \%SCRIPTS;
561c79ed
JH
575}
576
a452d459 577=head2 B<charinrange()>
10a6ecd2
JH
578
579In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
580can also test whether a code point is in the I<range> as returned by
a452d459
KW
581L</charblock()> and L</charscript()> or as the values of the hash returned
582by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 583
55d7b906 584 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
585
586 $range = charscript('Hiragana');
e145285f 587 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
588
589=cut
590
ea508aee
JH
591my %GENERAL_CATEGORIES =
592 (
593 'L' => 'Letter',
594 'LC' => 'CasedLetter',
595 'Lu' => 'UppercaseLetter',
596 'Ll' => 'LowercaseLetter',
597 'Lt' => 'TitlecaseLetter',
598 'Lm' => 'ModifierLetter',
599 'Lo' => 'OtherLetter',
600 'M' => 'Mark',
601 'Mn' => 'NonspacingMark',
602 'Mc' => 'SpacingMark',
603 'Me' => 'EnclosingMark',
604 'N' => 'Number',
605 'Nd' => 'DecimalNumber',
606 'Nl' => 'LetterNumber',
607 'No' => 'OtherNumber',
608 'P' => 'Punctuation',
609 'Pc' => 'ConnectorPunctuation',
610 'Pd' => 'DashPunctuation',
611 'Ps' => 'OpenPunctuation',
612 'Pe' => 'ClosePunctuation',
613 'Pi' => 'InitialPunctuation',
614 'Pf' => 'FinalPunctuation',
615 'Po' => 'OtherPunctuation',
616 'S' => 'Symbol',
617 'Sm' => 'MathSymbol',
618 'Sc' => 'CurrencySymbol',
619 'Sk' => 'ModifierSymbol',
620 'So' => 'OtherSymbol',
621 'Z' => 'Separator',
622 'Zs' => 'SpaceSeparator',
623 'Zl' => 'LineSeparator',
624 'Zp' => 'ParagraphSeparator',
625 'C' => 'Other',
626 'Cc' => 'Control',
627 'Cf' => 'Format',
628 'Cs' => 'Surrogate',
629 'Co' => 'PrivateUse',
630 'Cn' => 'Unassigned',
631 );
632
633sub general_categories {
634 return dclone \%GENERAL_CATEGORIES;
635}
636
a452d459 637=head2 B<general_categories()>
ea508aee
JH
638
639 use Unicode::UCD 'general_categories';
640
641 my $categories = general_categories();
642
a452d459 643This returns a reference to a hash which has short
ea508aee
JH
644general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
645names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
646C<Symbol>) as values. The hash is reversible in case you need to go
647from the long names to the short names. The general category is the
a452d459
KW
648one returned from
649L</charinfo()> under the C<category> key.
ea508aee
JH
650
651=cut
652
653my %BIDI_TYPES =
654 (
655 'L' => 'Left-to-Right',
656 'LRE' => 'Left-to-Right Embedding',
657 'LRO' => 'Left-to-Right Override',
658 'R' => 'Right-to-Left',
659 'AL' => 'Right-to-Left Arabic',
660 'RLE' => 'Right-to-Left Embedding',
661 'RLO' => 'Right-to-Left Override',
662 'PDF' => 'Pop Directional Format',
663 'EN' => 'European Number',
664 'ES' => 'European Number Separator',
665 'ET' => 'European Number Terminator',
666 'AN' => 'Arabic Number',
667 'CS' => 'Common Number Separator',
668 'NSM' => 'Non-Spacing Mark',
669 'BN' => 'Boundary Neutral',
670 'B' => 'Paragraph Separator',
671 'S' => 'Segment Separator',
672 'WS' => 'Whitespace',
673 'ON' => 'Other Neutrals',
674 );
675
a452d459 676=head2 B<bidi_types()>
ea508aee
JH
677
678 use Unicode::UCD 'bidi_types';
679
680 my $categories = bidi_types();
681
a452d459 682This returns a reference to a hash which has the short
ea508aee
JH
683bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
684names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
685hash is reversible in case you need to go from the long names to the
a452d459
KW
686short names. The bidi type is the one returned from
687L</charinfo()>
ea508aee
JH
688under the C<bidi> key. For the exact meaning of the various bidi classes
689the Unicode TR9 is recommended reading:
a452d459 690L<http://www.unicode.org/reports/tr9/>
ea508aee
JH
691(as of Unicode 5.0.0)
692
693=cut
694
a452d459
KW
695sub bidi_types {
696 return dclone \%BIDI_TYPES;
697}
698
699=head2 B<compexcl()>
b08cd201 700
55d7b906 701 use Unicode::UCD 'compexcl';
b08cd201 702
a452d459 703 my $compexcl = compexcl(0x09dc);
b08cd201 704
a452d459
KW
705This returns B<true> if the
706L</code point argument> should not be produced by composition normalization,
707B<AND> if that fact is not otherwise determinable from the Unicode data base.
708It currently does not return B<true> if the code point has a decomposition
709consisting of another single code point, nor if its decomposition starts
710with a code point whose combining class is non-zero. Code points that meet
711either of these conditions should also not be produced by composition
712normalization.
b08cd201 713
a452d459 714It returns B<false> otherwise.
b08cd201
JH
715
716=cut
717
718my %COMPEXCL;
719
720sub _compexcl {
721 unless (%COMPEXCL) {
551b6b6f 722 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 723 local $_;
b08cd201 724 while (<$COMPEXCLFH>) {
822ebcc8 725 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201
JH
726 my $code = hex($1);
727 $COMPEXCL{$code} = undef;
728 }
729 }
730 close($COMPEXCLFH);
731 }
732 }
733}
734
735sub compexcl {
736 my $arg = shift;
737 my $code = _getcode($arg);
74f8133e
JH
738 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
739 unless defined $code;
b08cd201
JH
740
741 _compexcl() unless %COMPEXCL;
742
743 return exists $COMPEXCL{$code};
744}
745
a452d459 746=head2 B<casefold()>
b08cd201 747
55d7b906 748 use Unicode::UCD 'casefold';
b08cd201 749
a452d459
KW
750 my $casefold = casefold(0xDF);
751 if (defined $casefold) {
752 my @full_fold_hex = split / /, $casefold->{'full'};
753 my $full_fold_string =
754 join "", map {chr(hex($_))} @full_fold_hex;
755 my @turkic_fold_hex =
756 split / /, ($casefold->{'turkic'} ne "")
757 ? $casefold->{'turkic'}
758 : $casefold->{'full'};
759 my $turkic_fold_string =
760 join "", map {chr(hex($_))} @turkic_fold_hex;
761 }
762 if (defined $casefold && $casefold->{'simple'} ne "") {
763 my $simple_fold_hex = $casefold->{'simple'};
764 my $simple_fold_string = chr(hex($simple_fold_hex));
765 }
b08cd201 766
a452d459
KW
767This returns the (almost) locale-independent case folding of the
768character specified by the L</code point argument>.
b08cd201 769
a452d459
KW
770If there is no case folding for that code point, B<undef> is returned.
771
772If there is a case folding for that code point, a reference to a hash
b08cd201
JH
773with the following fields is returned:
774
a452d459
KW
775=over
776
777=item B<code>
778
779the input L</code point argument> expressed in hexadecimal, with leading zeros
780added if necessary to make it contain at least four hexdigits
781
782=item B<full>
783
784one or more codes (separated by spaces) that taken in order give the
785code points for the case folding for I<code>.
786Each has at least four hexdigits.
787
788=item B<simple>
789
790is empty, or is exactly one code with at least four hexdigits which can be used
791as an alternative case folding when the calling program cannot cope with the
792fold being a sequence of multiple code points. If I<full> is just one code
793point, then I<simple> equals I<full>. If there is no single code point folding
794defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
795inferior, but still better-than-nothing alternative folding to I<full>.
796
797=item B<mapping>
798
799is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
800otherwise. It can be considered to be the simplest possible folding for
801I<code>. It is defined primarily for backwards compatibility.
802
803=item B<status>
b08cd201 804
a452d459
KW
805is C<C> (for C<common>) if the best possible fold is a single code point
806(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
807folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
808there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty). Note
809that this
810describes the contents of I<mapping>. It is defined primarily for backwards
811compatibility.
b08cd201 812
a452d459
KW
813On versions 3.1 and earlier of Unicode, I<status> can also be
814C<I> which is the same as C<C> but is a special case for dotted uppercase I and
815dotless lowercase i:
b08cd201 816
a452d459 817=over
b08cd201 818
a452d459
KW
819=item B<*>
820
821If you use this C<I> mapping, the result is case-insensitive,
822but dotless and dotted I's are not distinguished
823
824=item B<*>
825
826If you exclude this C<I> mapping, the result is not fully case-insensitive, but
827dotless and dotted I's are distinguished
828
829=back
830
831=item B<turkic>
832
833contains any special folding for Turkic languages. For versions of Unicode
834starting with 3.2, this field is empty unless I<code> has a different folding
835in Turkic languages, in which case it is one or more codes (separated by
836spaces) that taken in order give the code points for the case folding for
837I<code> in those languages.
838Each code has at least four hexdigits.
839Note that this folding does not maintain canonical equivalence without
840additional processing.
841
842For versions of Unicode 3.1 and earlier, this field is empty unless there is a
843special folding for Turkic languages, in which case I<status> is C<I>, and
844I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
845
846=back
847
848Programs that want complete generality and the best folding results should use
849the folding contained in the I<full> field. But note that the fold for some
850code points will be a sequence of multiple code points.
851
852Programs that can't cope with the fold mapping being multiple code points can
853use the folding contained in the I<simple> field, with the loss of some
854generality. In Unicode 5.1, about 7% of the defined foldings have no single
855code point folding.
856
857The I<mapping> and I<status> fields are provided for backwards compatibility for
858existing programs. They contain the same values as in previous versions of
859this function.
860
861Locale is not completely independent. The I<turkic> field contains results to
862use when the locale is a Turkic language.
b08cd201
JH
863
864For more information about case mappings see
a452d459 865L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
866
867=cut
868
869my %CASEFOLD;
870
871sub _casefold {
872 unless (%CASEFOLD) {
551b6b6f 873 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 874 local $_;
b08cd201 875 while (<$CASEFOLDFH>) {
a452d459 876 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
b08cd201 877 my $code = hex($1);
a452d459
KW
878 $CASEFOLD{$code}{'code'} = $1;
879 $CASEFOLD{$code}{'turkic'} = "" unless
880 defined $CASEFOLD{$code}{'turkic'};
881 if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and
882 # earlier Unicodes
883 # Both entries there (I
884 # only checked 3.1) are
885 # the same as C, and
886 # there are no other
887 # entries for those
888 # codepoints, so treat
889 # as if C, but override
890 # the turkic one for
891 # 'I'.
892 $CASEFOLD{$code}{'status'} = $2;
893 $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
894 $CASEFOLD{$code}{'mapping'} = $3;
895 $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
896 } elsif ($2 eq 'F') {
897 $CASEFOLD{$code}{'full'} = $3;
898 unless (defined $CASEFOLD{$code}{'simple'}) {
899 $CASEFOLD{$code}{'simple'} = "";
900 $CASEFOLD{$code}{'mapping'} = $3;
901 $CASEFOLD{$code}{'status'} = $2;
902 }
903 } elsif ($2 eq 'S') {
904
905
906 # There can't be a simple without a full, and simple
907 # overrides all but full
908
909 $CASEFOLD{$code}{'simple'} = $3;
910 $CASEFOLD{$code}{'mapping'} = $3;
911 $CASEFOLD{$code}{'status'} = $2;
912 } elsif ($2 eq 'T') {
913 $CASEFOLD{$code}{'turkic'} = $3;
914 } # else can't happen because only [CIFST] are possible
b08cd201
JH
915 }
916 }
917 close($CASEFOLDFH);
918 }
919 }
920}
921
922sub casefold {
923 my $arg = shift;
924 my $code = _getcode($arg);
74f8133e
JH
925 croak __PACKAGE__, "::casefold: unknown code '$arg'"
926 unless defined $code;
b08cd201
JH
927
928 _casefold() unless %CASEFOLD;
929
930 return $CASEFOLD{$code};
931}
932
a452d459 933=head2 B<casespec()>
b08cd201 934
55d7b906 935 use Unicode::UCD 'casespec';
b08cd201 936
a452d459 937 my $casespec = casespec(0xFB00);
b08cd201 938
a452d459
KW
939This returns the potentially locale-dependent case mappings of the L</code point
940argument>. The mappings may be longer than a single code point (which the basic
941Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 942
a452d459
KW
943If there are no case mappings for the L</code point argument>, or if all three
944possible mappings (I<lower>, I<title> and I<upper>) result in single code
c45e3c48 945points and are locale independent and unconditional, B<undef> is returned.
a452d459
KW
946
947Otherwise, a reference to a hash giving the mappings (or a reference to a hash
948of such hashes, explained below) is returned.
949
950The keys in the bottom layer hash with the meanings of their values are:
951
952=over
953
954=item B<code>
955
956the input L</code point argument> expressed in hexadecimal, with leading zeros
957added if necessary to make it contain at least four hexdigits
958
959=item B<lower>
960
961one or more codes (separated by spaces) that taken in order give the
962code points for the lower case of I<code>.
963Each has at least four hexdigits.
964
965=item B<title>
b08cd201 966
a452d459
KW
967one or more codes (separated by spaces) that taken in order give the
968code points for the title case of I<code>.
969Each has at least four hexdigits.
b08cd201 970
a452d459 971=item B<lower>
b08cd201 972
a452d459
KW
973one or more codes (separated by spaces) that taken in order give the
974code points for the upper case of I<code>.
975Each has at least four hexdigits.
976
977=item B<condition>
978
979the conditions for the mappings to be valid.
980If B<undef>, the mappings are always valid.
981When defined, this field is a list of conditions,
982all of which must be true for the mappings to be valid.
983The list consists of one or more
984I<locales> (see below)
985and/or I<contexts> (explained in the next paragraph),
986separated by spaces.
987(Other than as used to separate elements, spaces are to be ignored.)
988Case distinctions in the condition list are not significant.
82c0b05b 989Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 990
a452d459
KW
991A I<context> is one of those defined in the Unicode standard.
992For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
993available at
994L<http://www.unicode.org/versions/Unicode5.1.0/>
f499c386 995
a452d459
KW
996=back
997
998If the return value is to a hash of hashes, it is because there are multiple
999case mapping definitions for a single code point
1000(because of different rules for different locales).
1001Each sub-hash is of the form above, and the keys of the outer hash are
1002the locales, which are
1003defined as 2-letter ISO 3166 country codes, possibly
e618509d 1004followed by a "_" and a 2-letter ISO language code (possibly followed
a452d459 1005by a "_" and a variant code). You can find the lists of all possible locales,
e618509d 1006see L<Locale::Country> and L<Locale::Language>.
a452d459
KW
1007(In Unicode 5.1, the only locales returned by this function
1008are C<lt>, C<tr>, and C<az>.)
b08cd201
JH
1009
1010For more information about case mappings see
a452d459 1011L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1012
1013=cut
1014
1015my %CASESPEC;
1016
1017sub _casespec {
1018 unless (%CASESPEC) {
551b6b6f 1019 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 1020 local $_;
b08cd201
JH
1021 while (<$CASESPECFH>) {
1022 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
1023 my ($hexcode, $lower, $title, $upper, $condition) =
1024 ($1, $2, $3, $4, $5);
1025 my $code = hex($hexcode);
1026 if (exists $CASESPEC{$code}) {
1027 if (exists $CASESPEC{$code}->{code}) {
1028 my ($oldlower,
1029 $oldtitle,
1030 $oldupper,
1031 $oldcondition) =
1032 @{$CASESPEC{$code}}{qw(lower
1033 title
1034 upper
1035 condition)};
822ebcc8
JH
1036 if (defined $oldcondition) {
1037 my ($oldlocale) =
f499c386 1038 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
1039 delete $CASESPEC{$code};
1040 $CASESPEC{$code}->{$oldlocale} =
1041 { code => $hexcode,
1042 lower => $oldlower,
1043 title => $oldtitle,
1044 upper => $oldupper,
1045 condition => $oldcondition };
f499c386
JH
1046 }
1047 }
1048 my ($locale) =
1049 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1050 $CASESPEC{$code}->{$locale} =
1051 { code => $hexcode,
1052 lower => $lower,
1053 title => $title,
1054 upper => $upper,
1055 condition => $condition };
1056 } else {
1057 $CASESPEC{$code} =
1058 { code => $hexcode,
1059 lower => $lower,
1060 title => $title,
1061 upper => $upper,
1062 condition => $condition };
1063 }
b08cd201
JH
1064 }
1065 }
1066 close($CASESPECFH);
1067 }
1068 }
1069}
1070
1071sub casespec {
1072 my $arg = shift;
1073 my $code = _getcode($arg);
74f8133e
JH
1074 croak __PACKAGE__, "::casespec: unknown code '$arg'"
1075 unless defined $code;
b08cd201
JH
1076
1077 _casespec() unless %CASESPEC;
1078
741297c1 1079 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
1080}
1081
a452d459 1082=head2 B<namedseq()>
a2bd7410
JH
1083
1084 use Unicode::UCD 'namedseq';
1085
1086 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1087 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1088 my %namedseq = namedseq();
1089
1090If used with a single argument in a scalar context, returns the string
a452d459 1091consisting of the code points of the named sequence, or B<undef> if no
a2bd7410 1092named sequence by that name exists. If used with a single argument in
a452d459 1093a list context, it returns the list of the code points. If used with no
a2bd7410
JH
1094arguments in a list context, returns a hash with the names of the
1095named sequences as the keys and the named sequences as strings as
a452d459 1096the values. Otherwise, it returns B<undef> or an empty list depending
a2bd7410
JH
1097on the context.
1098
a452d459
KW
1099This function only operates on officially approved (not provisional) named
1100sequences.
a2bd7410
JH
1101
1102=cut
1103
1104my %NAMEDSEQ;
1105
1106sub _namedseq {
1107 unless (%NAMEDSEQ) {
1108 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
1109 local $_;
1110 while (<$NAMEDSEQFH>) {
1111 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
1112 my ($n, $s) = ($1, $2);
1113 my @s = map { chr(hex($_)) } split(' ', $s);
1114 $NAMEDSEQ{$n} = join("", @s);
1115 }
1116 }
1117 close($NAMEDSEQFH);
1118 }
1119 }
1120}
1121
1122sub namedseq {
1123 _namedseq() unless %NAMEDSEQ;
1124 my $wantarray = wantarray();
1125 if (defined $wantarray) {
1126 if ($wantarray) {
1127 if (@_ == 0) {
1128 return %NAMEDSEQ;
1129 } elsif (@_ == 1) {
1130 my $s = $NAMEDSEQ{ $_[0] };
1131 return defined $s ? map { ord($_) } split('', $s) : ();
1132 }
1133 } elsif (@_ == 1) {
1134 return $NAMEDSEQ{ $_[0] };
1135 }
1136 }
1137 return;
1138}
1139
55d7b906 1140=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 1141
a452d459
KW
1142This returns the version of the Unicode Character Database, in other words, the
1143version of the Unicode standard the database implements. The version is a
1144string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
1145
1146=cut
1147
1148my $UNICODEVERSION;
1149
1150sub UnicodeVersion {
1151 unless (defined $UNICODEVERSION) {
1152 openunicode(\$VERSIONFH, "version");
1153 chomp($UNICODEVERSION = <$VERSIONFH>);
1154 close($VERSIONFH);
1155 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1156 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1157 }
1158 return $UNICODEVERSION;
1159}
3aa957f9 1160
a452d459
KW
1161=head2 B<Blocks versus Scripts>
1162
1163The difference between a block and a script is that scripts are closer
1164to the linguistic notion of a set of code points required to present
1165languages, while block is more of an artifact of the Unicode code point
1166numbering and separation into blocks of (mostly) 256 code points.
1167
1168For example the Latin B<script> is spread over several B<blocks>, such
1169as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1170C<Latin Extended-B>. On the other hand, the Latin script does not
1171contain all the characters of the C<Basic Latin> block (also known as
1172ASCII): it includes only the letters, and not, for example, the digits
1173or the punctuation.
1174
1175For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1176
1177For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1178
1179=head2 B<Matching Scripts and Blocks>
1180
1181Scripts are matched with the regular-expression construct
1182C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1183while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1184any of the 256 code points in the Tibetan block).
1185
1186
3aa957f9 1187=head2 Implementation Note
32c16050 1188
ad9cab37
JH
1189The first use of charinfo() opens a read-only filehandle to the Unicode
1190Character Database (the database is included in the Perl distribution).
78bf21c2
JH
1191The filehandle is then kept open for further queries. In other words,
1192if you are wondering where one of your filehandles went, that's where.
32c16050 1193
8b731da2
JH
1194=head1 BUGS
1195
1196Does not yet support EBCDIC platforms.
1197
a452d459
KW
1198L</compexcl()> should give a complete list of excluded code points.
1199
561c79ed
JH
1200=head1 AUTHOR
1201
1202Jarkko Hietaniemi
1203
1204=cut
1205
12061;