This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add encoding::warnings to Maintainers.pl and rebuild META.yml for release
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
20a6717b 6our $VERSION = '0.22';
561c79ed 7
741297c1
JH
8use Storable qw(dclone);
9
561c79ed
JH
10require Exporter;
11
12our @ISA = qw(Exporter);
74f8133e 13
10a6ecd2
JH
14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
b08cd201
JH
17 charinrange
18 compexcl
19 casefold casespec);
561c79ed
JH
20
21use Carp;
22
23=head1 NAME
24
55d7b906 25Unicode::UCD - Unicode character database
561c79ed
JH
26
27=head1 SYNOPSIS
28
55d7b906 29 use Unicode::UCD 'charinfo';
b08cd201 30 my $charinfo = charinfo($codepoint);
561c79ed 31
55d7b906 32 use Unicode::UCD 'charblock';
e882dd67
JH
33 my $charblock = charblock($codepoint);
34
55d7b906 35 use Unicode::UCD 'charscript';
65044554 36 my $charscript = charscript($codepoint);
561c79ed 37
55d7b906 38 use Unicode::UCD 'charblocks';
e145285f
JH
39 my $charblocks = charblocks();
40
55d7b906 41 use Unicode::UCD 'charscripts';
e145285f
JH
42 my %charscripts = charscripts();
43
55d7b906 44 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
45 my $range = charscript($script);
46 print "looks like $script\n" if charinrange($range, $codepoint);
47
55d7b906 48 use Unicode::UCD 'compexcl';
e145285f
JH
49 my $compexcl = compexcl($codepoint);
50
55d7b906 51 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 52
561c79ed
JH
53=head1 DESCRIPTION
54
8b731da2
JH
55The Unicode::UCD module offers a simple interface to the Unicode
56Character Database.
561c79ed
JH
57
58=cut
59
10a6ecd2
JH
60my $UNICODEFH;
61my $BLOCKSFH;
62my $SCRIPTSFH;
63my $VERSIONFH;
b08cd201
JH
64my $COMPEXCLFH;
65my $CASEFOLDFH;
66my $CASESPECFH;
561c79ed
JH
67
68sub openunicode {
69 my ($rfh, @path) = @_;
70 my $f;
71 unless (defined $$rfh) {
72 for my $d (@INC) {
73 use File::Spec;
55d7b906 74 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 75 last if open($$rfh, $f);
e882dd67 76 undef $f;
561c79ed 77 }
e882dd67
JH
78 croak __PACKAGE__, ": failed to find ",
79 File::Spec->catfile(@path), " in @INC"
80 unless defined $f;
561c79ed
JH
81 }
82 return $f;
83}
84
85=head2 charinfo
86
55d7b906 87 use Unicode::UCD 'charinfo';
561c79ed 88
b08cd201 89 my $charinfo = charinfo(0x41);
561c79ed 90
b08cd201
JH
91charinfo() returns a reference to a hash that has the following fields
92as defined by the Unicode standard:
561c79ed
JH
93
94 key
95
96 code code point with at least four hexdigits
97 name name of the character IN UPPER CASE
98 category general category of the character
99 combining classes used in the Canonical Ordering Algorithm
100 bidi bidirectional category
101 decomposition character decomposition mapping
102 decimal if decimal digit this is the integer numeric value
103 digit if digit this is the numeric value
104 numeric if numeric is the integer or rational numeric value
105 mirrored if mirrored in bidirectional text
106 unicode10 Unicode 1.0 name if existed and different
107 comment ISO 10646 comment field
108 upper uppercase equivalent mapping
109 lower lowercase equivalent mapping
110 title titlecase equivalent mapping
e882dd67 111
561c79ed 112 block block the character belongs to (used in \p{In...})
eb0cc9e3 113 script script the character belongs to
561c79ed 114
b08cd201 115If no match is found, a reference to an empty hash is returned.
561c79ed 116
d1be9408 117The C<block> property is the same as returned by charinfo(). It is
32c16050 118not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2
JH
119Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
120(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050
JH
121
122Note that you cannot do (de)composition and casing based solely on the
123above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 124you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
125
126=cut
127
0616d9cf 128# NB: This function is duplicated in charnames.pm
10a6ecd2
JH
129sub _getcode {
130 my $arg = shift;
131
dc0a4417 132 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 133 return $arg;
dc0a4417 134 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
135 return hex($1);
136 }
137
138 return;
139}
140
ac5ea531
JH
141# Lingua::KO::Hangul::Util not part of the standard distribution
142# but it will be used if available.
143
144eval { require Lingua::KO::Hangul::Util };
145my $hasHangulUtil = ! $@;
146if ($hasHangulUtil) {
147 Lingua::KO::Hangul::Util->import();
148}
9087a70b
ST
149
150sub hangul_decomp { # internal: called from charinfo
ac5ea531
JH
151 if ($hasHangulUtil) {
152 my @tmp = decomposeHangul(shift);
153 return sprintf("%04X %04X", @tmp) if @tmp == 2;
154 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
155 }
156 return;
157}
158
159sub hangul_charname { # internal: called from charinfo
160 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b
ST
161}
162
9087a70b
ST
163sub han_charname { # internal: called from charinfo
164 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b
ST
165}
166
167my @CharinfoRanges = (
168# block name
169# [ first, last, coderef to name, coderef to decompose ],
170# CJK Ideographs Extension A
171 [ 0x3400, 0x4DB5, \&han_charname, undef ],
172# CJK Ideographs
173 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
174# Hangul Syllables
ac5ea531 175 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b
ST
176# Non-Private Use High Surrogates
177 [ 0xD800, 0xDB7F, undef, undef ],
178# Private Use High Surrogates
179 [ 0xDB80, 0xDBFF, undef, undef ],
180# Low Surrogates
181 [ 0xDC00, 0xDFFF, undef, undef ],
182# The Private Use Area
183 [ 0xE000, 0xF8FF, undef, undef ],
184# CJK Ideographs Extension B
185 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
186# Plane 15 Private Use Area
187 [ 0xF0000, 0xFFFFD, undef, undef ],
188# Plane 16 Private Use Area
189 [ 0x100000, 0x10FFFD, undef, undef ],
190);
191
561c79ed 192sub charinfo {
10a6ecd2
JH
193 my $arg = shift;
194 my $code = _getcode($arg);
195 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
196 unless defined $code;
e63dbbf9 197 my $hexk = sprintf("%06X", $code);
a6fa416b
ST
198 my($rcode,$rname,$rdec);
199 foreach my $range (@CharinfoRanges){
74f8133e 200 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 201 $rcode = $hexk;
e63dbbf9
JH
202 $rcode =~ s/^0+//;
203 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
ST
204 $rname = $range->[2] ? $range->[2]->($code) : '';
205 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 206 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
ST
207 last;
208 }
209 }
551b6b6f 210 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 211 if (defined $UNICODEFH) {
e63dbbf9
JH
212 use Search::Dict 1.02;
213 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 214 my $line = <$UNICODEFH>;
c5a29f40 215 return unless defined $line;
561c79ed
JH
216 chomp $line;
217 my %prop;
218 @prop{qw(
219 code name category
220 combining bidi decomposition
221 decimal digit numeric
222 mirrored unicode10 comment
223 upper lower title
224 )} = split(/;/, $line, -1);
e63dbbf9
JH
225 $hexk =~ s/^0+//;
226 $hexk = sprintf("%04X", hex($hexk));
561c79ed 227 if ($prop{code} eq $hexk) {
a196fbfd
JH
228 $prop{block} = charblock($code);
229 $prop{script} = charscript($code);
a6fa416b
ST
230 if(defined $rname){
231 $prop{code} = $rcode;
232 $prop{name} = $rname;
233 $prop{decomposition} = $rdec;
234 }
b08cd201 235 return \%prop;
561c79ed
JH
236 }
237 }
238 }
239 return;
240}
241
e882dd67
JH
242sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
243 my ($table, $lo, $hi, $code) = @_;
244
245 return if $lo > $hi;
246
247 my $mid = int(($lo+$hi) / 2);
248
249 if ($table->[$mid]->[0] < $code) {
10a6ecd2 250 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
251 return $table->[$mid]->[2];
252 } else {
253 _search($table, $mid + 1, $hi, $code);
254 }
255 } elsif ($table->[$mid]->[0] > $code) {
256 _search($table, $lo, $mid - 1, $code);
257 } else {
258 return $table->[$mid]->[2];
259 }
260}
261
10a6ecd2
JH
262sub charinrange {
263 my ($range, $arg) = @_;
264 my $code = _getcode($arg);
265 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
266 unless defined $code;
267 _search($range, 0, $#$range, $code);
268}
269
354a27bf 270=head2 charblock
561c79ed 271
55d7b906 272 use Unicode::UCD 'charblock';
561c79ed
JH
273
274 my $charblock = charblock(0x41);
10a6ecd2
JH
275 my $charblock = charblock(1234);
276 my $charblock = charblock("0x263a");
277 my $charblock = charblock("U+263a");
278
78bf21c2 279 my $range = charblock('Armenian');
10a6ecd2 280
78bf21c2 281With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 282belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 283positions within all blocks are defined.
10a6ecd2 284
78bf21c2
JH
285See also L</Blocks versus Scripts>.
286
eb0cc9e3
JH
287If supplied with an argument that can't be a code point, charblock() tries
288to do the opposite and interpret the argument as a character block. The
289return value is a I<range>: an anonymous list of lists that contain
290I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
291code point is in a range using the L</charinrange> function. If the
292argument is not a known charater block, C<undef> is returned.
561c79ed 293
561c79ed
JH
294=cut
295
296my @BLOCKS;
10a6ecd2 297my %BLOCKS;
561c79ed 298
10a6ecd2 299sub _charblocks {
561c79ed 300 unless (@BLOCKS) {
10a6ecd2 301 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 302 local $_;
10a6ecd2 303 while (<$BLOCKSFH>) {
2796c109 304 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
305 my ($lo, $hi) = (hex($1), hex($2));
306 my $subrange = [ $lo, $hi, $3 ];
307 push @BLOCKS, $subrange;
308 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
309 }
310 }
10a6ecd2 311 close($BLOCKSFH);
561c79ed
JH
312 }
313 }
10a6ecd2
JH
314}
315
316sub charblock {
317 my $arg = shift;
318
319 _charblocks() unless @BLOCKS;
320
321 my $code = _getcode($arg);
561c79ed 322
10a6ecd2
JH
323 if (defined $code) {
324 _search(\@BLOCKS, 0, $#BLOCKS, $code);
325 } else {
326 if (exists $BLOCKS{$arg}) {
741297c1 327 return dclone $BLOCKS{$arg};
10a6ecd2
JH
328 } else {
329 return;
330 }
331 }
e882dd67
JH
332}
333
334=head2 charscript
335
55d7b906 336 use Unicode::UCD 'charscript';
e882dd67
JH
337
338 my $charscript = charscript(0x41);
10a6ecd2
JH
339 my $charscript = charscript(1234);
340 my $charscript = charscript("U+263a");
e882dd67 341
78bf21c2 342 my $range = charscript('Thai');
10a6ecd2 343
78bf21c2 344With a B<code point argument> charscript() returns the I<script> the
b08cd201 345character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 346
78bf21c2
JH
347See also L</Blocks versus Scripts>.
348
eb0cc9e3
JH
349If supplied with an argument that can't be a code point, charscript() tries
350to do the opposite and interpret the argument as a character script. The
351return value is a I<range>: an anonymous list of lists that contain
352I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
353code point is in a range using the L</charinrange> function. If the
354argument is not a known charater script, C<undef> is returned.
e882dd67 355
e882dd67
JH
356=cut
357
358my @SCRIPTS;
10a6ecd2 359my %SCRIPTS;
e882dd67 360
10a6ecd2 361sub _charscripts {
e882dd67 362 unless (@SCRIPTS) {
10a6ecd2 363 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
6c8d78fb 364 local $_;
10a6ecd2 365 while (<$SCRIPTSFH>) {
e882dd67 366 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
367 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
368 my $script = lc($3);
369 $script =~ s/\b(\w)/uc($1)/ge;
370 my $subrange = [ $lo, $hi, $script ];
371 push @SCRIPTS, $subrange;
372 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
373 }
374 }
10a6ecd2 375 close($SCRIPTSFH);
e882dd67
JH
376 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
377 }
378 }
10a6ecd2
JH
379}
380
381sub charscript {
382 my $arg = shift;
383
384 _charscripts() unless @SCRIPTS;
e882dd67 385
10a6ecd2
JH
386 my $code = _getcode($arg);
387
388 if (defined $code) {
389 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
390 } else {
391 if (exists $SCRIPTS{$arg}) {
741297c1 392 return dclone $SCRIPTS{$arg};
10a6ecd2
JH
393 } else {
394 return;
395 }
396 }
397}
398
399=head2 charblocks
400
55d7b906 401 use Unicode::UCD 'charblocks';
10a6ecd2 402
b08cd201 403 my $charblocks = charblocks();
10a6ecd2 404
b08cd201
JH
405charblocks() returns a reference to a hash with the known block names
406as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 407
78bf21c2
JH
408See also L</Blocks versus Scripts>.
409
10a6ecd2
JH
410=cut
411
412sub charblocks {
b08cd201 413 _charblocks() unless %BLOCKS;
741297c1 414 return dclone \%BLOCKS;
10a6ecd2
JH
415}
416
417=head2 charscripts
418
55d7b906 419 use Unicode::UCD 'charscripts';
10a6ecd2
JH
420
421 my %charscripts = charscripts();
422
423charscripts() returns a hash with the known script names as the keys,
424and the code point ranges (see L</charscript>) as the values.
425
78bf21c2
JH
426See also L</Blocks versus Scripts>.
427
10a6ecd2
JH
428=cut
429
430sub charscripts {
b08cd201 431 _charscripts() unless %SCRIPTS;
741297c1 432 return dclone \%SCRIPTS;
561c79ed
JH
433}
434
10a6ecd2 435=head2 Blocks versus Scripts
ad9cab37 436
10a6ecd2
JH
437The difference between a block and a script is that scripts are closer
438to the linguistic notion of a set of characters required to present
439languages, while block is more of an artifact of the Unicode character
eb0cc9e3 440numbering and separation into blocks of (mostly) 256 characters.
3aa957f9
JH
441
442For example the Latin B<script> is spread over several B<blocks>, such
443as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
444C<Latin Extended-B>. On the other hand, the Latin script does not
445contain all the characters of the C<Basic Latin> block (also known as
eb0cc9e3 446the ASCII): it includes only the letters, and not, for example, the digits
3aa957f9 447or the punctuation.
ad9cab37 448
3aa957f9 449For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
450
451For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
452
3aa957f9
JH
453=head2 Matching Scripts and Blocks
454
eb0cc9e3
JH
455Scripts are matched with the regular-expression construct
456C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
457while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
458any of the 256 code points in the Tibetan block).
10a6ecd2 459
b08cd201
JH
460=head2 Code Point Arguments
461
92e830a9
JH
462A I<code point argument> is either a decimal or a hexadecimal scalar
463designating a Unicode character, or C<U+> followed by hexadecimals
dc0a4417
JH
464designating a Unicode character. In other words, if you want a code
465point to be interpreted as a hexadecimal number, you must prefix it
43adb1d9 466with either C<0x> or C<U+>, because a string like e.g. C<123> will
dc0a4417
JH
467be interpreted as a decimal code point. Also note that Unicode is
468B<not> limited to 16 bits (the number of Unicode characters is
469open-ended, in theory unlimited): you may have more than 4 hexdigits.
b08cd201 470
10a6ecd2
JH
471=head2 charinrange
472
473In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
474can also test whether a code point is in the I<range> as returned by
475L</charblock> and L</charscript> or as the values of the hash returned
e618509d 476by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 477
55d7b906 478 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
479
480 $range = charscript('Hiragana');
e145285f 481 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
482
483=cut
484
b08cd201
JH
485=head2 compexcl
486
55d7b906 487 use Unicode::UCD 'compexcl';
b08cd201
JH
488
489 my $compexcl = compexcl("09dc");
490
491The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
492character should not be produced during a precomposition) of the
493character specified by a B<code point argument>.
b08cd201
JH
494
495If there is a composition exclusion for the character, true is
496returned. Otherwise, false is returned.
497
498=cut
499
500my %COMPEXCL;
501
502sub _compexcl {
503 unless (%COMPEXCL) {
551b6b6f 504 if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
6c8d78fb 505 local $_;
b08cd201 506 while (<$COMPEXCLFH>) {
822ebcc8 507 if (/^([0-9A-F]+)\s+\#\s+/) {
b08cd201
JH
508 my $code = hex($1);
509 $COMPEXCL{$code} = undef;
510 }
511 }
512 close($COMPEXCLFH);
513 }
514 }
515}
516
517sub compexcl {
518 my $arg = shift;
519 my $code = _getcode($arg);
74f8133e
JH
520 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
521 unless defined $code;
b08cd201
JH
522
523 _compexcl() unless %COMPEXCL;
524
525 return exists $COMPEXCL{$code};
526}
527
528=head2 casefold
529
55d7b906 530 use Unicode::UCD 'casefold';
b08cd201 531
82c0b05b 532 my $casefold = casefold("00DF");
b08cd201
JH
533
534The casefold() returns the locale-independent case folding of the
535character specified by a B<code point argument>.
536
537If there is a case folding for that character, a reference to a hash
538with the following fields is returned:
539
540 key
541
542 code code point with at least four hexdigits
543 status "C", "F", "S", or "I"
544 mapping one or more codes separated by spaces
545
546The meaning of the I<status> is as follows:
547
548 C common case folding, common mappings shared
549 by both simple and full mappings
550 F full case folding, mappings that cause strings
551 to grow in length. Multiple characters are separated
552 by spaces
553 S simple case folding, mappings to single characters
554 where different from F
555 I special case for dotted uppercase I and
556 dotless lowercase i
557 - If this mapping is included, the result is
558 case-insensitive, but dotless and dotted I's
559 are not distinguished
560 - If this mapping is excluded, the result is not
561 fully case-insensitive, but dotless and dotted
562 I's are distinguished
563
564If there is no case folding for that character, C<undef> is returned.
565
566For more information about case mappings see
567http://www.unicode.org/unicode/reports/tr21/
568
569=cut
570
571my %CASEFOLD;
572
573sub _casefold {
574 unless (%CASEFOLD) {
551b6b6f 575 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 576 local $_;
b08cd201
JH
577 while (<$CASEFOLDFH>) {
578 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
579 my $code = hex($1);
580 $CASEFOLD{$code} = { code => $1,
581 status => $2,
582 mapping => $3 };
583 }
584 }
585 close($CASEFOLDFH);
586 }
587 }
588}
589
590sub casefold {
591 my $arg = shift;
592 my $code = _getcode($arg);
74f8133e
JH
593 croak __PACKAGE__, "::casefold: unknown code '$arg'"
594 unless defined $code;
b08cd201
JH
595
596 _casefold() unless %CASEFOLD;
597
598 return $CASEFOLD{$code};
599}
600
601=head2 casespec
602
55d7b906 603 use Unicode::UCD 'casespec';
b08cd201 604
82c0b05b 605 my $casespec = casespec("FB00");
b08cd201
JH
606
607The casespec() returns the potentially locale-dependent case mapping
608of the character specified by a B<code point argument>. The mapping
609may change the length of the string (which the basic Unicode case
610mappings as returned by charinfo() never do).
611
612If there is a case folding for that character, a reference to a hash
613with the following fields is returned:
614
615 key
616
617 code code point with at least four hexdigits
618 lower lowercase
619 title titlecase
620 upper uppercase
621 condition condition list (may be undef)
622
623The C<condition> is optional. Where present, it consists of one or
624more I<locales> or I<contexts>, separated by spaces (other than as
625used to separate elements, spaces are to be ignored). A condition
626list overrides the normal behavior if all of the listed conditions are
627true. Case distinctions in the condition list are not significant.
82c0b05b 628Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 629
f499c386
JH
630Note that when there are multiple case folding definitions for a
631single code point because of different locales, the value returned by
632casespec() is a hash reference which has the locales as the keys and
633hash references as described above as the values.
634
b08cd201 635A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
636followed by a "_" and a 2-letter ISO language code (possibly followed
637by a "_" and a variant code). You can find the lists of those codes,
638see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
639
640A I<context> is one of the following choices:
641
642 FINAL The letter is not followed by a letter of
643 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
644 MODERN The mapping is only used for modern text
e618509d 645 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
646
647For more information about case mappings see
648http://www.unicode.org/unicode/reports/tr21/
649
650=cut
651
652my %CASESPEC;
653
654sub _casespec {
655 unless (%CASESPEC) {
551b6b6f 656 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 657 local $_;
b08cd201
JH
658 while (<$CASESPECFH>) {
659 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
660 my ($hexcode, $lower, $title, $upper, $condition) =
661 ($1, $2, $3, $4, $5);
662 my $code = hex($hexcode);
663 if (exists $CASESPEC{$code}) {
664 if (exists $CASESPEC{$code}->{code}) {
665 my ($oldlower,
666 $oldtitle,
667 $oldupper,
668 $oldcondition) =
669 @{$CASESPEC{$code}}{qw(lower
670 title
671 upper
672 condition)};
822ebcc8
JH
673 if (defined $oldcondition) {
674 my ($oldlocale) =
f499c386 675 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
676 delete $CASESPEC{$code};
677 $CASESPEC{$code}->{$oldlocale} =
678 { code => $hexcode,
679 lower => $oldlower,
680 title => $oldtitle,
681 upper => $oldupper,
682 condition => $oldcondition };
f499c386
JH
683 }
684 }
685 my ($locale) =
686 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
687 $CASESPEC{$code}->{$locale} =
688 { code => $hexcode,
689 lower => $lower,
690 title => $title,
691 upper => $upper,
692 condition => $condition };
693 } else {
694 $CASESPEC{$code} =
695 { code => $hexcode,
696 lower => $lower,
697 title => $title,
698 upper => $upper,
699 condition => $condition };
700 }
b08cd201
JH
701 }
702 }
703 close($CASESPECFH);
704 }
705 }
706}
707
708sub casespec {
709 my $arg = shift;
710 my $code = _getcode($arg);
74f8133e
JH
711 croak __PACKAGE__, "::casespec: unknown code '$arg'"
712 unless defined $code;
b08cd201
JH
713
714 _casespec() unless %CASESPEC;
715
741297c1 716 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
717}
718
55d7b906 719=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 720
55d7b906
JH
721Unicode::UCD::UnicodeVersion() returns the version of the Unicode
722Character Database, in other words, the version of the Unicode
78bf21c2
JH
723standard the database implements. The version is a string
724of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
725
726=cut
727
728my $UNICODEVERSION;
729
730sub UnicodeVersion {
731 unless (defined $UNICODEVERSION) {
732 openunicode(\$VERSIONFH, "version");
733 chomp($UNICODEVERSION = <$VERSIONFH>);
734 close($VERSIONFH);
735 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
736 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
737 }
738 return $UNICODEVERSION;
739}
3aa957f9
JH
740
741=head2 Implementation Note
32c16050 742
ad9cab37
JH
743The first use of charinfo() opens a read-only filehandle to the Unicode
744Character Database (the database is included in the Perl distribution).
78bf21c2
JH
745The filehandle is then kept open for further queries. In other words,
746if you are wondering where one of your filehandles went, that's where.
32c16050 747
8b731da2
JH
748=head1 BUGS
749
750Does not yet support EBCDIC platforms.
751
561c79ed
JH
752=head1 AUTHOR
753
754Jarkko Hietaniemi
755
756=cut
757
7581;