This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
File::Copy & permission bits.
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
ea508aee 6our $VERSION = '0.25';
561c79ed 7
741297c1
JH
8use Storable qw(dclone);
9
561c79ed
JH
10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2
JH
14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201 17 charinrange
ea508aee 18 general_categories bidi_types
b08cd201 19 compexcl
a2bd7410
JH
20 casefold casespec
21 namedseq);
561c79ed
JH
22
23use Carp;
24
25=head1 NAME
26
55d7b906 27Unicode::UCD - Unicode character database
561c79ed
JH
28
29=head1 SYNOPSIS
30
55d7b906 31 use Unicode::UCD 'charinfo';
b08cd201 32 my $charinfo = charinfo($codepoint);
561c79ed 33
55d7b906 34 use Unicode::UCD 'charblock';
e882dd67
JH
35 my $charblock = charblock($codepoint);
36
55d7b906 37 use Unicode::UCD 'charscript';
65044554 38 my $charscript = charscript($codepoint);
561c79ed 39
55d7b906 40 use Unicode::UCD 'charblocks';
e145285f
JH
41 my $charblocks = charblocks();
42
55d7b906 43 use Unicode::UCD 'charscripts';
ea508aee 44 my $charscripts = charscripts();
e145285f 45
55d7b906 46 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
47 my $range = charscript($script);
48 print "looks like $script\n" if charinrange($range, $codepoint);
49
ea508aee
JH
50 use Unicode::UCD qw(general_categories bidi_types);
51 my $categories = general_categories();
52 my $types = bidi_types();
53
55d7b906 54 use Unicode::UCD 'compexcl';
e145285f
JH
55 my $compexcl = compexcl($codepoint);
56
a2bd7410
JH
57 use Unicode::UCD 'namedseq';
58 my $namedseq = namedseq($named_sequence_name);
59
55d7b906 60 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 61
561c79ed
JH
62=head1 DESCRIPTION
63
8b731da2
JH
64The Unicode::UCD module offers a simple interface to the Unicode
65Character Database.
561c79ed
JH
66
67=cut
68
10a6ecd2
JH
69my $UNICODEFH;
70my $BLOCKSFH;
71my $SCRIPTSFH;
72my $VERSIONFH;
b08cd201
JH
73my $COMPEXCLFH;
74my $CASEFOLDFH;
75my $CASESPECFH;
a2bd7410 76my $NAMEDSEQFH;
561c79ed
JH
77
78sub openunicode {
79 my ($rfh, @path) = @_;
80 my $f;
81 unless (defined $$rfh) {
82 for my $d (@INC) {
83 use File::Spec;
55d7b906 84 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 85 last if open($$rfh, $f);
e882dd67 86 undef $f;
561c79ed 87 }
e882dd67
JH
88 croak __PACKAGE__, ": failed to find ",
89 File::Spec->catfile(@path), " in @INC"
90 unless defined $f;
561c79ed
JH
91 }
92 return $f;
93}
94
95=head2 charinfo
96
55d7b906 97 use Unicode::UCD 'charinfo';
561c79ed 98
b08cd201 99 my $charinfo = charinfo(0x41);
561c79ed 100
b08cd201
JH
101charinfo() returns a reference to a hash that has the following fields
102as defined by the Unicode standard:
561c79ed
JH
103
104 key
105
106 code code point with at least four hexdigits
107 name name of the character IN UPPER CASE
108 category general category of the character
109 combining classes used in the Canonical Ordering Algorithm
ea508aee 110 bidi bidirectional type
561c79ed
JH
111 decomposition character decomposition mapping
112 decimal if decimal digit this is the integer numeric value
113 digit if digit this is the numeric value
114 numeric if numeric is the integer or rational numeric value
115 mirrored if mirrored in bidirectional text
116 unicode10 Unicode 1.0 name if existed and different
117 comment ISO 10646 comment field
118 upper uppercase equivalent mapping
119 lower lowercase equivalent mapping
120 title titlecase equivalent mapping
e882dd67 121
561c79ed 122 block block the character belongs to (used in \p{In...})
eb0cc9e3 123 script script the character belongs to
561c79ed 124
b08cd201 125If no match is found, a reference to an empty hash is returned.
561c79ed 126
d1be9408 127The C<block> property is the same as returned by charinfo(). It is
32c16050 128not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2
JH
129Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
130(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050
JH
131
132Note that you cannot do (de)composition and casing based solely on the
133above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 134you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
135
136=cut
137
0616d9cf 138# NB: This function is duplicated in charnames.pm
10a6ecd2
JH
139sub _getcode {
140 my $arg = shift;
141
dc0a4417 142 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 143 return $arg;
dc0a4417 144 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
145 return hex($1);
146 }
147
148 return;
149}
150
ac5ea531
JH
151# Lingua::KO::Hangul::Util not part of the standard distribution
152# but it will be used if available.
153
154eval { require Lingua::KO::Hangul::Util };
155my $hasHangulUtil = ! $@;
156if ($hasHangulUtil) {
157 Lingua::KO::Hangul::Util->import();
158}
9087a70b
TS
159
160sub hangul_decomp { # internal: called from charinfo
ac5ea531
JH
161 if ($hasHangulUtil) {
162 my @tmp = decomposeHangul(shift);
163 return sprintf("%04X %04X", @tmp) if @tmp == 2;
164 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
165 }
166 return;
167}
168
169sub hangul_charname { # internal: called from charinfo
170 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b
TS
171}
172
9087a70b
TS
173sub han_charname { # internal: called from charinfo
174 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b
TS
175}
176
177my @CharinfoRanges = (
178# block name
179# [ first, last, coderef to name, coderef to decompose ],
180# CJK Ideographs Extension A
181 [ 0x3400, 0x4DB5, \&han_charname, undef ],
182# CJK Ideographs
183 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
184# Hangul Syllables
ac5ea531 185 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b
TS
186# Non-Private Use High Surrogates
187 [ 0xD800, 0xDB7F, undef, undef ],
188# Private Use High Surrogates
189 [ 0xDB80, 0xDBFF, undef, undef ],
190# Low Surrogates
191 [ 0xDC00, 0xDFFF, undef, undef ],
192# The Private Use Area
193 [ 0xE000, 0xF8FF, undef, undef ],
194# CJK Ideographs Extension B
195 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
196# Plane 15 Private Use Area
197 [ 0xF0000, 0xFFFFD, undef, undef ],
198# Plane 16 Private Use Area
199 [ 0x100000, 0x10FFFD, undef, undef ],
200);
201
561c79ed 202sub charinfo {
10a6ecd2
JH
203 my $arg = shift;
204 my $code = _getcode($arg);
205 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
206 unless defined $code;
e63dbbf9 207 my $hexk = sprintf("%06X", $code);
a6fa416b
TS
208 my($rcode,$rname,$rdec);
209 foreach my $range (@CharinfoRanges){
74f8133e 210 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 211 $rcode = $hexk;
e63dbbf9
JH
212 $rcode =~ s/^0+//;
213 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
TS
214 $rname = $range->[2] ? $range->[2]->($code) : '';
215 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 216 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
TS
217 last;
218 }
219 }
551b6b6f 220 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 221 if (defined $UNICODEFH) {
e63dbbf9
JH
222 use Search::Dict 1.02;
223 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 224 my $line = <$UNICODEFH>;
c5a29f40 225 return unless defined $line;
561c79ed
JH
226 chomp $line;
227 my %prop;
228 @prop{qw(
229 code name category
230 combining bidi decomposition
231 decimal digit numeric
232 mirrored unicode10 comment
233 upper lower title
234 )} = split(/;/, $line, -1);
e63dbbf9
JH
235 $hexk =~ s/^0+//;
236 $hexk = sprintf("%04X", hex($hexk));
561c79ed 237 if ($prop{code} eq $hexk) {
a196fbfd
JH
238 $prop{block} = charblock($code);
239 $prop{script} = charscript($code);
a6fa416b
TS
240 if(defined $rname){
241 $prop{code} = $rcode;
242 $prop{name} = $rname;
243 $prop{decomposition} = $rdec;
244 }
b08cd201 245 return \%prop;
561c79ed
JH
246 }
247 }
248 }
249 return;
250}
251
e882dd67
JH
252sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
253 my ($table, $lo, $hi, $code) = @_;
254
255 return if $lo > $hi;
256
257 my $mid = int(($lo+$hi) / 2);
258
259 if ($table->[$mid]->[0] < $code) {
10a6ecd2 260 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
261 return $table->[$mid]->[2];
262 } else {
263 _search($table, $mid + 1, $hi, $code);
264 }
265 } elsif ($table->[$mid]->[0] > $code) {
266 _search($table, $lo, $mid - 1, $code);
267 } else {
268 return $table->[$mid]->[2];
269 }
270}
271
10a6ecd2
JH
272sub charinrange {
273 my ($range, $arg) = @_;
274 my $code = _getcode($arg);
275 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
276 unless defined $code;
277 _search($range, 0, $#$range, $code);
278}
279
354a27bf 280=head2 charblock
561c79ed 281
55d7b906 282 use Unicode::UCD 'charblock';
561c79ed
JH
283
284 my $charblock = charblock(0x41);
10a6ecd2
JH
285 my $charblock = charblock(1234);
286 my $charblock = charblock("0x263a");
287 my $charblock = charblock("U+263a");
288
78bf21c2 289 my $range = charblock('Armenian');
10a6ecd2 290
78bf21c2 291With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 292belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 293positions within all blocks are defined.
10a6ecd2 294
78bf21c2
JH
295See also L</Blocks versus Scripts>.
296
eb0cc9e3
JH
297If supplied with an argument that can't be a code point, charblock() tries
298to do the opposite and interpret the argument as a character block. The
299return value is a I<range>: an anonymous list of lists that contain
a2bd7410
JH
300I<start-of-range>, I<end-of-range> code point pairs. You can test whether
301a code point is in a range using the L</charinrange> function. If the
3c4b39be 302argument is not a known character block, C<undef> is returned.
561c79ed 303
561c79ed
JH
304=cut
305
306my @BLOCKS;
10a6ecd2 307my %BLOCKS;
561c79ed 308
10a6ecd2 309sub _charblocks {
561c79ed 310 unless (@BLOCKS) {
10a6ecd2 311 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 312 local $_;
10a6ecd2 313 while (<$BLOCKSFH>) {
2796c109 314 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
315 my ($lo, $hi) = (hex($1), hex($2));
316 my $subrange = [ $lo, $hi, $3 ];
317 push @BLOCKS, $subrange;
318 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
319 }
320 }
10a6ecd2 321 close($BLOCKSFH);
561c79ed
JH
322 }
323 }
10a6ecd2
JH
324}
325
326sub charblock {
327 my $arg = shift;
328
329 _charblocks() unless @BLOCKS;
330
331 my $code = _getcode($arg);
561c79ed 332
10a6ecd2
JH
333 if (defined $code) {
334 _search(\@BLOCKS, 0, $#BLOCKS, $code);
335 } else {
336 if (exists $BLOCKS{$arg}) {
741297c1 337 return dclone $BLOCKS{$arg};
10a6ecd2
JH
338 } else {
339 return;
340 }
341 }
e882dd67
JH
342}
343
344=head2 charscript
345
55d7b906 346 use Unicode::UCD 'charscript';
e882dd67
JH
347
348 my $charscript = charscript(0x41);
10a6ecd2
JH
349 my $charscript = charscript(1234);
350 my $charscript = charscript("U+263a");
e882dd67 351
78bf21c2 352 my $range = charscript('Thai');
10a6ecd2 353
78bf21c2 354With a B<code point argument> charscript() returns the I<script> the
b08cd201 355character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 356
78bf21c2
JH
357See also L</Blocks versus Scripts>.
358
eb0cc9e3
JH
359If supplied with an argument that can't be a code point, charscript() tries
360to do the opposite and interpret the argument as a character script. The
361return value is a I<range>: an anonymous list of lists that contain
362I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
363code point is in a range using the L</charinrange> function. If the
3c4b39be 364argument is not a known character script, C<undef> is returned.
e882dd67 365
e882dd67
JH
366=cut
367
368my @SCRIPTS;
10a6ecd2 369my %SCRIPTS;
e882dd67 370
10a6ecd2 371sub _charscripts {
e882dd67 372 unless (@SCRIPTS) {
10a6ecd2 373 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 374 local $_;
10a6ecd2 375 while (<$SCRIPTSFH>) {
e882dd67 376 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
377 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
378 my $script = lc($3);
379 $script =~ s/\b(\w)/uc($1)/ge;
380 my $subrange = [ $lo, $hi, $script ];
381 push @SCRIPTS, $subrange;
382 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
383 }
384 }
10a6ecd2 385 close($SCRIPTSFH);
e882dd67
JH
386 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
387 }
388 }
10a6ecd2
JH
389}
390
391sub charscript {
392 my $arg = shift;
393
394 _charscripts() unless @SCRIPTS;
e882dd67 395
10a6ecd2
JH
396 my $code = _getcode($arg);
397
398 if (defined $code) {
399 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
400 } else {
401 if (exists $SCRIPTS{$arg}) {
741297c1 402 return dclone $SCRIPTS{$arg};
10a6ecd2
JH
403 } else {
404 return;
405 }
406 }
407}
408
409=head2 charblocks
410
55d7b906 411 use Unicode::UCD 'charblocks';
10a6ecd2 412
b08cd201 413 my $charblocks = charblocks();
10a6ecd2 414
b08cd201
JH
415charblocks() returns a reference to a hash with the known block names
416as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 417
78bf21c2
JH
418See also L</Blocks versus Scripts>.
419
10a6ecd2
JH
420=cut
421
422sub charblocks {
b08cd201 423 _charblocks() unless %BLOCKS;
741297c1 424 return dclone \%BLOCKS;
10a6ecd2
JH
425}
426
427=head2 charscripts
428
55d7b906 429 use Unicode::UCD 'charscripts';
10a6ecd2 430
ea508aee 431 my $charscripts = charscripts();
10a6ecd2 432
ea508aee
JH
433charscripts() returns a reference to a hash with the known script
434names as the keys, and the code point ranges (see L</charscript>) as
435the values.
10a6ecd2 436
78bf21c2
JH
437See also L</Blocks versus Scripts>.
438
10a6ecd2
JH
439=cut
440
441sub charscripts {
b08cd201 442 _charscripts() unless %SCRIPTS;
741297c1 443 return dclone \%SCRIPTS;
561c79ed
JH
444}
445
10a6ecd2 446=head2 Blocks versus Scripts
ad9cab37 447
10a6ecd2
JH
448The difference between a block and a script is that scripts are closer
449to the linguistic notion of a set of characters required to present
450languages, while block is more of an artifact of the Unicode character
eb0cc9e3 451numbering and separation into blocks of (mostly) 256 characters.
3aa957f9
JH
452
453For example the Latin B<script> is spread over several B<blocks>, such
454as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
455C<Latin Extended-B>. On the other hand, the Latin script does not
456contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 457the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 458or the punctuation.
ad9cab37 459
3aa957f9 460For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
461
462For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
463
3aa957f9
JH
464=head2 Matching Scripts and Blocks
465
eb0cc9e3
JH
466Scripts are matched with the regular-expression construct
467C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
468while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
469any of the 256 code points in the Tibetan block).
10a6ecd2 470
b08cd201
JH
471=head2 Code Point Arguments
472
92e830a9
JH
473A I<code point argument> is either a decimal or a hexadecimal scalar
474designating a Unicode character, or C<U+> followed by hexadecimals
dc0a4417
JH
475designating a Unicode character. In other words, if you want a code
476point to be interpreted as a hexadecimal number, you must prefix it
43adb1d9 477with either C<0x> or C<U+>, because a string like e.g. C<123> will
dc0a4417
JH
478be interpreted as a decimal code point. Also note that Unicode is
479B<not> limited to 16 bits (the number of Unicode characters is
480open-ended, in theory unlimited): you may have more than 4 hexdigits.
b08cd201 481
10a6ecd2
JH
482=head2 charinrange
483
484In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
485can also test whether a code point is in the I<range> as returned by
486L</charblock> and L</charscript> or as the values of the hash returned
e618509d 487by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 488
55d7b906 489 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
490
491 $range = charscript('Hiragana');
e145285f 492 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
493
494=cut
495
ea508aee
JH
496my %GENERAL_CATEGORIES =
497 (
498 'L' => 'Letter',
499 'LC' => 'CasedLetter',
500 'Lu' => 'UppercaseLetter',
501 'Ll' => 'LowercaseLetter',
502 'Lt' => 'TitlecaseLetter',
503 'Lm' => 'ModifierLetter',
504 'Lo' => 'OtherLetter',
505 'M' => 'Mark',
506 'Mn' => 'NonspacingMark',
507 'Mc' => 'SpacingMark',
508 'Me' => 'EnclosingMark',
509 'N' => 'Number',
510 'Nd' => 'DecimalNumber',
511 'Nl' => 'LetterNumber',
512 'No' => 'OtherNumber',
513 'P' => 'Punctuation',
514 'Pc' => 'ConnectorPunctuation',
515 'Pd' => 'DashPunctuation',
516 'Ps' => 'OpenPunctuation',
517 'Pe' => 'ClosePunctuation',
518 'Pi' => 'InitialPunctuation',
519 'Pf' => 'FinalPunctuation',
520 'Po' => 'OtherPunctuation',
521 'S' => 'Symbol',
522 'Sm' => 'MathSymbol',
523 'Sc' => 'CurrencySymbol',
524 'Sk' => 'ModifierSymbol',
525 'So' => 'OtherSymbol',
526 'Z' => 'Separator',
527 'Zs' => 'SpaceSeparator',
528 'Zl' => 'LineSeparator',
529 'Zp' => 'ParagraphSeparator',
530 'C' => 'Other',
531 'Cc' => 'Control',
532 'Cf' => 'Format',
533 'Cs' => 'Surrogate',
534 'Co' => 'PrivateUse',
535 'Cn' => 'Unassigned',
536 );
537
538sub general_categories {
539 return dclone \%GENERAL_CATEGORIES;
540}
541
542=head2 general_categories
543
544 use Unicode::UCD 'general_categories';
545
546 my $categories = general_categories();
547
548The general_categories() returns a reference to a hash which has short
549general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
550names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
551C<Symbol>) as values. The hash is reversible in case you need to go
552from the long names to the short names. The general category is the
553one returned from charinfo() under the C<category> key.
554
555=cut
556
557my %BIDI_TYPES =
558 (
559 'L' => 'Left-to-Right',
560 'LRE' => 'Left-to-Right Embedding',
561 'LRO' => 'Left-to-Right Override',
562 'R' => 'Right-to-Left',
563 'AL' => 'Right-to-Left Arabic',
564 'RLE' => 'Right-to-Left Embedding',
565 'RLO' => 'Right-to-Left Override',
566 'PDF' => 'Pop Directional Format',
567 'EN' => 'European Number',
568 'ES' => 'European Number Separator',
569 'ET' => 'European Number Terminator',
570 'AN' => 'Arabic Number',
571 'CS' => 'Common Number Separator',
572 'NSM' => 'Non-Spacing Mark',
573 'BN' => 'Boundary Neutral',
574 'B' => 'Paragraph Separator',
575 'S' => 'Segment Separator',
576 'WS' => 'Whitespace',
577 'ON' => 'Other Neutrals',
578 );
579
580sub bidi_types {
581 return dclone \%BIDI_TYPES;
582}
583
584=head2 bidi_types
585
586 use Unicode::UCD 'bidi_types';
587
588 my $categories = bidi_types();
589
590The bidi_types() returns a reference to a hash which has the short
591bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
592names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
593hash is reversible in case you need to go from the long names to the
594short names. The bidi type is the one returned from charinfo()
595under the C<bidi> key. For the exact meaning of the various bidi classes
596the Unicode TR9 is recommended reading:
597http://www.unicode.org/reports/tr9/tr9-17.html
598(as of Unicode 5.0.0)
599
600=cut
601
b08cd201
JH
602=head2 compexcl
603
55d7b906 604 use Unicode::UCD 'compexcl';
b08cd201
JH
605
606 my $compexcl = compexcl("09dc");
607
608The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
609character should not be produced during a precomposition) of the
610character specified by a B<code point argument>.
b08cd201
JH
611
612If there is a composition exclusion for the character, true is
613returned. Otherwise, false is returned.
614
615=cut
616
617my %COMPEXCL;
618
619sub _compexcl {
620 unless (%COMPEXCL) {
551b6b6f 621 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 622 local $_;
b08cd201 623 while (<$COMPEXCLFH>) {
822ebcc8 624 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201
JH
625 my $code = hex($1);
626 $COMPEXCL{$code} = undef;
627 }
628 }
629 close($COMPEXCLFH);
630 }
631 }
632}
633
634sub compexcl {
635 my $arg = shift;
636 my $code = _getcode($arg);
74f8133e
JH
637 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
638 unless defined $code;
b08cd201
JH
639
640 _compexcl() unless %COMPEXCL;
641
642 return exists $COMPEXCL{$code};
643}
644
645=head2 casefold
646
55d7b906 647 use Unicode::UCD 'casefold';
b08cd201 648
82c0b05b 649 my $casefold = casefold("00DF");
b08cd201
JH
650
651The casefold() returns the locale-independent case folding of the
652character specified by a B<code point argument>.
653
654If there is a case folding for that character, a reference to a hash
655with the following fields is returned:
656
657 key
658
659 code code point with at least four hexdigits
660 status "C", "F", "S", or "I"
661 mapping one or more codes separated by spaces
662
663The meaning of the I<status> is as follows:
664
665 C common case folding, common mappings shared
666 by both simple and full mappings
667 F full case folding, mappings that cause strings
668 to grow in length. Multiple characters are separated
669 by spaces
670 S simple case folding, mappings to single characters
671 where different from F
672 I special case for dotted uppercase I and
673 dotless lowercase i
674 - If this mapping is included, the result is
675 case-insensitive, but dotless and dotted I's
676 are not distinguished
677 - If this mapping is excluded, the result is not
678 fully case-insensitive, but dotless and dotted
679 I's are distinguished
680
681If there is no case folding for that character, C<undef> is returned.
682
683For more information about case mappings see
684http://www.unicode.org/unicode/reports/tr21/
685
686=cut
687
688my %CASEFOLD;
689
690sub _casefold {
691 unless (%CASEFOLD) {
551b6b6f 692 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 693 local $_;
b08cd201
JH
694 while (<$CASEFOLDFH>) {
695 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
696 my $code = hex($1);
697 $CASEFOLD{$code} = { code => $1,
698 status => $2,
699 mapping => $3 };
700 }
701 }
702 close($CASEFOLDFH);
703 }
704 }
705}
706
707sub casefold {
708 my $arg = shift;
709 my $code = _getcode($arg);
74f8133e
JH
710 croak __PACKAGE__, "::casefold: unknown code '$arg'"
711 unless defined $code;
b08cd201
JH
712
713 _casefold() unless %CASEFOLD;
714
715 return $CASEFOLD{$code};
716}
717
718=head2 casespec
719
55d7b906 720 use Unicode::UCD 'casespec';
b08cd201 721
82c0b05b 722 my $casespec = casespec("FB00");
b08cd201
JH
723
724The casespec() returns the potentially locale-dependent case mapping
725of the character specified by a B<code point argument>. The mapping
726may change the length of the string (which the basic Unicode case
727mappings as returned by charinfo() never do).
728
729If there is a case folding for that character, a reference to a hash
730with the following fields is returned:
731
732 key
733
734 code code point with at least four hexdigits
735 lower lowercase
736 title titlecase
737 upper uppercase
738 condition condition list (may be undef)
739
740The C<condition> is optional. Where present, it consists of one or
741more I<locales> or I<contexts>, separated by spaces (other than as
742used to separate elements, spaces are to be ignored). A condition
743list overrides the normal behavior if all of the listed conditions are
744true. Case distinctions in the condition list are not significant.
82c0b05b 745Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 746
f499c386
JH
747Note that when there are multiple case folding definitions for a
748single code point because of different locales, the value returned by
749casespec() is a hash reference which has the locales as the keys and
750hash references as described above as the values.
751
b08cd201 752A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
753followed by a "_" and a 2-letter ISO language code (possibly followed
754by a "_" and a variant code). You can find the lists of those codes,
755see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
756
757A I<context> is one of the following choices:
758
759 FINAL The letter is not followed by a letter of
760 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
761 MODERN The mapping is only used for modern text
e618509d 762 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
763
764For more information about case mappings see
765http://www.unicode.org/unicode/reports/tr21/
766
767=cut
768
769my %CASESPEC;
770
771sub _casespec {
772 unless (%CASESPEC) {
551b6b6f 773 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 774 local $_;
b08cd201
JH
775 while (<$CASESPECFH>) {
776 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
777 my ($hexcode, $lower, $title, $upper, $condition) =
778 ($1, $2, $3, $4, $5);
779 my $code = hex($hexcode);
780 if (exists $CASESPEC{$code}) {
781 if (exists $CASESPEC{$code}->{code}) {
782 my ($oldlower,
783 $oldtitle,
784 $oldupper,
785 $oldcondition) =
786 @{$CASESPEC{$code}}{qw(lower
787 title
788 upper
789 condition)};
822ebcc8
JH
790 if (defined $oldcondition) {
791 my ($oldlocale) =
f499c386 792 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
793 delete $CASESPEC{$code};
794 $CASESPEC{$code}->{$oldlocale} =
795 { code => $hexcode,
796 lower => $oldlower,
797 title => $oldtitle,
798 upper => $oldupper,
799 condition => $oldcondition };
f499c386
JH
800 }
801 }
802 my ($locale) =
803 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
804 $CASESPEC{$code}->{$locale} =
805 { code => $hexcode,
806 lower => $lower,
807 title => $title,
808 upper => $upper,
809 condition => $condition };
810 } else {
811 $CASESPEC{$code} =
812 { code => $hexcode,
813 lower => $lower,
814 title => $title,
815 upper => $upper,
816 condition => $condition };
817 }
b08cd201
JH
818 }
819 }
820 close($CASESPECFH);
821 }
822 }
823}
824
825sub casespec {
826 my $arg = shift;
827 my $code = _getcode($arg);
74f8133e
JH
828 croak __PACKAGE__, "::casespec: unknown code '$arg'"
829 unless defined $code;
b08cd201
JH
830
831 _casespec() unless %CASESPEC;
832
741297c1 833 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
834}
835
a2bd7410
JH
836=head2 namedseq()
837
838 use Unicode::UCD 'namedseq';
839
840 my $namedseq = namedseq("KATAKANA LETTER AINU P");
841 my @namedseq = namedseq("KATAKANA LETTER AINU P");
842 my %namedseq = namedseq();
843
844If used with a single argument in a scalar context, returns the string
845consisting of the code points of the named sequence, or C<undef> if no
846named sequence by that name exists. If used with a single argument in
847a list context, returns list of the code points. If used with no
848arguments in a list context, returns a hash with the names of the
849named sequences as the keys and the named sequences as strings as
850the values. Otherwise, returns C<undef> or empty list depending
851on the context.
852
853(New from Unicode 4.1.0)
854
855=cut
856
857my %NAMEDSEQ;
858
859sub _namedseq {
860 unless (%NAMEDSEQ) {
861 if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
862 local $_;
863 while (<$NAMEDSEQFH>) {
864 if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
865 my ($n, $s) = ($1, $2);
866 my @s = map { chr(hex($_)) } split(' ', $s);
867 $NAMEDSEQ{$n} = join("", @s);
868 }
869 }
870 close($NAMEDSEQFH);
871 }
872 }
873}
874
875sub namedseq {
876 _namedseq() unless %NAMEDSEQ;
877 my $wantarray = wantarray();
878 if (defined $wantarray) {
879 if ($wantarray) {
880 if (@_ == 0) {
881 return %NAMEDSEQ;
882 } elsif (@_ == 1) {
883 my $s = $NAMEDSEQ{ $_[0] };
884 return defined $s ? map { ord($_) } split('', $s) : ();
885 }
886 } elsif (@_ == 1) {
887 return $NAMEDSEQ{ $_[0] };
888 }
889 }
890 return;
891}
892
55d7b906 893=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 894
55d7b906
JH
895Unicode::UCD::UnicodeVersion() returns the version of the Unicode
896Character Database, in other words, the version of the Unicode
78bf21c2
JH
897standard the database implements. The version is a string
898of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
899
900=cut
901
902my $UNICODEVERSION;
903
904sub UnicodeVersion {
905 unless (defined $UNICODEVERSION) {
906 openunicode(\$VERSIONFH, "version");
907 chomp($UNICODEVERSION = <$VERSIONFH>);
908 close($VERSIONFH);
909 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
910 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
911 }
912 return $UNICODEVERSION;
913}
3aa957f9
JH
914
915=head2 Implementation Note
32c16050 916
ad9cab37
JH
917The first use of charinfo() opens a read-only filehandle to the Unicode
918Character Database (the database is included in the Perl distribution).
78bf21c2
JH
919The filehandle is then kept open for further queries. In other words,
920if you are wondering where one of your filehandles went, that's where.
32c16050 921
8b731da2
JH
922=head1 BUGS
923
924Does not yet support EBCDIC platforms.
925
561c79ed
JH
926=head1 AUTHOR
927
928Jarkko Hietaniemi
929
930=cut
931
9321;