This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Math::BigInt 1.41.
[perl5.git] / lib / UnicodeCD.pm
CommitLineData
1189d1e4 1package UnicodeCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2
JH
12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201
JH
15 charinrange
16 compexcl
17 casefold casespec);
561c79ed
JH
18
19use Carp;
20
21=head1 NAME
22
1189d1e4 23UnicodeCD - Unicode character database
561c79ed
JH
24
25=head1 SYNOPSIS
26
1189d1e4 27 use UnicodeCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
1189d1e4 30 use UnicodeCD 'charblock';
e882dd67
JH
31 my $charblock = charblock($codepoint);
32
1189d1e4 33 use UnicodeCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
e145285f
JH
36 use UnicodeCD 'charblocks';
37 my $charblocks = charblocks();
38
39 use UnicodeCD 'charscripts';
40 my %charscripts = charscripts();
41
42 use UnicodeCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
46 use UnicodeCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
48
49 my $unicode_version = UnicodeCD::UnicodeVersion();
50
561c79ed
JH
51=head1 DESCRIPTION
52
e145285f 53The UnicodeCD module offers a simple interface to the Unicode Character
561c79ed
JH
54Database.
55
56=cut
57
10a6ecd2
JH
58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201
JH
62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed
JH
65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
72 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67
JH
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed
JH
79 }
80 return $f;
81}
82
83=head2 charinfo
84
1189d1e4 85 use UnicodeCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201
JH
89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed
JH
91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
e882dd67 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
32c16050
JH
115The C<block> property is the same as as returned by charinfo(). It is
116not defined in the Unicode Character Database proper (Chapter 4 of the
117Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 118of TUS3). Similarly for the C<script> property.
32c16050
JH
119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
123
124=cut
125
10a6ecd2
JH
126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
a6fa416b
ST
138sub han_charname {
139 my $arg = shift;
140 my $code = _getcode($arg);
74f8133e 141 croak __PACKAGE__, "::han_charname: unknown code '$arg'"
a6fa416b
ST
142 unless defined $code;
143 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
144 unless 0x3400 <= $code && $code <= 0x4DB5
145 || 0x4E00 <= $code && $code <= 0x9FA5
146 || 0x20000 <= $code && $code <= 0x2A6D6;
147 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
148}
149
150my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
151 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
152 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
153 );
154
155my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
156 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
157 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
158 "YU", "EU", "YI", "I",
159 );
160
161my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
162 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
163 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
164 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
165 );
166
167my %HangulConst = (
168 SBase => 0xAC00,
169 LBase => 0x1100,
170 VBase => 0x1161,
171 TBase => 0x11A7,
172 LCount => 19, # scalar @JamoL
173 VCount => 21, # scalar @JamoV
174 TCount => 28, # scalar @JamoT
175 NCount => 588, # VCount * TCount
176 SCount => 11172, # LCount * NCount
177 Final => 0xD7A3, # SBase -1 + SCount
178 );
179
180sub hangul_charname {
181 my $arg = shift;
182 my $code = _getcode($arg);
74f8133e 183 croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
a6fa416b
ST
184 unless defined $code;
185 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
186 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
187 my $SIndex = $code - $HangulConst{SBase};
188 my $LIndex = int( $SIndex / $HangulConst{NCount});
189 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
190 my $TIndex = $SIndex % $HangulConst{TCount};
191 return join('',
192 "HANGUL SYLLABLE ",
193 $JamoL[$LIndex],
194 $JamoV[$VIndex],
195 $JamoT[$TIndex],
196 );
197}
198
199sub hangul_decomp {
200 my $arg = shift;
201 my $code = _getcode($arg);
74f8133e 202 croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
a6fa416b
ST
203 unless defined $code;
204 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
205 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
206 my $SIndex = $code - $HangulConst{SBase};
207 my $LIndex = int( $SIndex / $HangulConst{NCount});
208 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
209 my $TIndex = $SIndex % $HangulConst{TCount};
210
211 return join(" ",
212 sprintf("%04X", $HangulConst{LBase} + $LIndex),
213 sprintf("%04X", $HangulConst{VBase} + $VIndex),
214 $TIndex ?
215 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
216 );
217}
218
219my @CharinfoRanges = (
220# block name
221# [ first, last, coderef to name, coderef to decompose ],
222# CJK Ideographs Extension A
223 [ 0x3400, 0x4DB5, \&han_charname, undef ],
224# CJK Ideographs
225 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
226# Hangul Syllables
227 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
228# Non-Private Use High Surrogates
229 [ 0xD800, 0xDB7F, undef, undef ],
230# Private Use High Surrogates
231 [ 0xDB80, 0xDBFF, undef, undef ],
232# Low Surrogates
233 [ 0xDC00, 0xDFFF, undef, undef ],
234# The Private Use Area
235 [ 0xE000, 0xF8FF, undef, undef ],
236# CJK Ideographs Extension B
237 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
238# Plane 15 Private Use Area
239 [ 0xF0000, 0xFFFFD, undef, undef ],
240# Plane 16 Private Use Area
241 [ 0x100000, 0x10FFFD, undef, undef ],
242);
243
561c79ed 244sub charinfo {
10a6ecd2
JH
245 my $arg = shift;
246 my $code = _getcode($arg);
247 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
248 unless defined $code;
e63dbbf9 249 my $hexk = sprintf("%06X", $code);
a6fa416b
ST
250 my($rcode,$rname,$rdec);
251 foreach my $range (@CharinfoRanges){
74f8133e 252 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 253 $rcode = $hexk;
e63dbbf9
JH
254 $rcode =~ s/^0+//;
255 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
ST
256 $rname = $range->[2] ? $range->[2]->($code) : '';
257 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 258 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
ST
259 last;
260 }
261 }
74f8133e 262 openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2 263 if (defined $UNICODEFH) {
e63dbbf9
JH
264 use Search::Dict 1.02;
265 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 266 my $line = <$UNICODEFH>;
561c79ed
JH
267 chomp $line;
268 my %prop;
269 @prop{qw(
270 code name category
271 combining bidi decomposition
272 decimal digit numeric
273 mirrored unicode10 comment
274 upper lower title
275 )} = split(/;/, $line, -1);
e63dbbf9
JH
276 $hexk =~ s/^0+//;
277 $hexk = sprintf("%04X", hex($hexk));
561c79ed 278 if ($prop{code} eq $hexk) {
a196fbfd
JH
279 $prop{block} = charblock($code);
280 $prop{script} = charscript($code);
a6fa416b
ST
281 if(defined $rname){
282 $prop{code} = $rcode;
283 $prop{name} = $rname;
284 $prop{decomposition} = $rdec;
285 }
b08cd201 286 return \%prop;
561c79ed
JH
287 }
288 }
289 }
290 return;
291}
292
e882dd67
JH
293sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
294 my ($table, $lo, $hi, $code) = @_;
295
296 return if $lo > $hi;
297
298 my $mid = int(($lo+$hi) / 2);
299
300 if ($table->[$mid]->[0] < $code) {
10a6ecd2 301 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
302 return $table->[$mid]->[2];
303 } else {
304 _search($table, $mid + 1, $hi, $code);
305 }
306 } elsif ($table->[$mid]->[0] > $code) {
307 _search($table, $lo, $mid - 1, $code);
308 } else {
309 return $table->[$mid]->[2];
310 }
311}
312
10a6ecd2
JH
313sub charinrange {
314 my ($range, $arg) = @_;
315 my $code = _getcode($arg);
316 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
317 unless defined $code;
318 _search($range, 0, $#$range, $code);
319}
320
354a27bf 321=head2 charblock
561c79ed 322
1189d1e4 323 use UnicodeCD 'charblock';
561c79ed
JH
324
325 my $charblock = charblock(0x41);
10a6ecd2
JH
326 my $charblock = charblock(1234);
327 my $charblock = charblock("0x263a");
328 my $charblock = charblock("U+263a");
329
330 my $ranges = charblock('Armenian');
331
332With a B<code point argument> charblock() returns the block the character
333belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 334positions within all blocks are defined.
10a6ecd2
JH
335
336If supplied with an argument that can't be a code point, charblock()
337tries to do the opposite and interpret the argument as a character
338block. The return value is a I<range>: an anonymous list that
339contains anonymous lists, which in turn contain I<start-of-range>,
340I<end-of-range> code point pairs. You can test whether a code point
341is in a range using the L</charinrange> function. If the argument is
342not a known charater block, C<undef> is returned.
561c79ed 343
561c79ed
JH
344=cut
345
346my @BLOCKS;
10a6ecd2 347my %BLOCKS;
561c79ed 348
10a6ecd2 349sub _charblocks {
561c79ed 350 unless (@BLOCKS) {
10a6ecd2
JH
351 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
352 while (<$BLOCKSFH>) {
2796c109 353 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
354 my ($lo, $hi) = (hex($1), hex($2));
355 my $subrange = [ $lo, $hi, $3 ];
356 push @BLOCKS, $subrange;
357 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
358 }
359 }
10a6ecd2 360 close($BLOCKSFH);
561c79ed
JH
361 }
362 }
10a6ecd2
JH
363}
364
365sub charblock {
366 my $arg = shift;
367
368 _charblocks() unless @BLOCKS;
369
370 my $code = _getcode($arg);
561c79ed 371
10a6ecd2
JH
372 if (defined $code) {
373 _search(\@BLOCKS, 0, $#BLOCKS, $code);
374 } else {
375 if (exists $BLOCKS{$arg}) {
376 return $BLOCKS{$arg};
377 } else {
378 return;
379 }
380 }
e882dd67
JH
381}
382
383=head2 charscript
384
1189d1e4 385 use UnicodeCD 'charscript';
e882dd67
JH
386
387 my $charscript = charscript(0x41);
10a6ecd2
JH
388 my $charscript = charscript(1234);
389 my $charscript = charscript("U+263a");
e882dd67 390
10a6ecd2
JH
391 my $ranges = charscript('Thai');
392
393With a B<code point argument> charscript() returns the script the
b08cd201 394character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2
JH
395
396If supplied with an argument that can't be a code point, charscript()
397tries to do the opposite and interpret the argument as a character
398script. The return value is a I<range>: an anonymous list that
399contains anonymous lists, which in turn contain I<start-of-range>,
400I<end-of-range> code point pairs. You can test whether a code point
401is in a range using the L</charinrange> function. If the argument is
402not a known charater script, C<undef> is returned.
e882dd67 403
e882dd67
JH
404=cut
405
406my @SCRIPTS;
10a6ecd2 407my %SCRIPTS;
e882dd67 408
10a6ecd2 409sub _charscripts {
e882dd67 410 unless (@SCRIPTS) {
10a6ecd2
JH
411 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
412 while (<$SCRIPTSFH>) {
e882dd67 413 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
414 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
415 my $script = lc($3);
416 $script =~ s/\b(\w)/uc($1)/ge;
417 my $subrange = [ $lo, $hi, $script ];
418 push @SCRIPTS, $subrange;
419 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
420 }
421 }
10a6ecd2 422 close($SCRIPTSFH);
e882dd67
JH
423 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
424 }
425 }
10a6ecd2
JH
426}
427
428sub charscript {
429 my $arg = shift;
430
431 _charscripts() unless @SCRIPTS;
e882dd67 432
10a6ecd2
JH
433 my $code = _getcode($arg);
434
435 if (defined $code) {
436 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
437 } else {
438 if (exists $SCRIPTS{$arg}) {
439 return $SCRIPTS{$arg};
440 } else {
441 return;
442 }
443 }
444}
445
446=head2 charblocks
447
1189d1e4 448 use UnicodeCD 'charblocks';
10a6ecd2 449
b08cd201 450 my $charblocks = charblocks();
10a6ecd2 451
b08cd201
JH
452charblocks() returns a reference to a hash with the known block names
453as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2
JH
454
455=cut
456
457sub charblocks {
b08cd201
JH
458 _charblocks() unless %BLOCKS;
459 return \%BLOCKS;
10a6ecd2
JH
460}
461
462=head2 charscripts
463
1189d1e4 464 use UnicodeCD 'charscripts';
10a6ecd2
JH
465
466 my %charscripts = charscripts();
467
468charscripts() returns a hash with the known script names as the keys,
469and the code point ranges (see L</charscript>) as the values.
470
471=cut
472
473sub charscripts {
b08cd201
JH
474 _charscripts() unless %SCRIPTS;
475 return \%SCRIPTS;
561c79ed
JH
476}
477
10a6ecd2 478=head2 Blocks versus Scripts
ad9cab37 479
10a6ecd2
JH
480The difference between a block and a script is that scripts are closer
481to the linguistic notion of a set of characters required to present
482languages, while block is more of an artifact of the Unicode character
483numbering and separation into blocks of 256 characters.
3aa957f9
JH
484
485For example the Latin B<script> is spread over several B<blocks>, such
486as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
487C<Latin Extended-B>. On the other hand, the Latin script does not
488contain all the characters of the C<Basic Latin> block (also known as
489the ASCII): it includes only the letters, not for example the digits
490or the punctuation.
ad9cab37 491
3aa957f9 492For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
493
494For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
495
3aa957f9
JH
496=head2 Matching Scripts and Blocks
497
498Both scripts and blocks can be matched using the regular expression
499construct C<\p{In...}> and its negation C<\P{In...}>.
500
501The name of the script or the block comes after the C<In>, for example
502C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2
JH
503removed from the names for the C<\p{In...}>, for example
504C<LatinExtendedA> instead of C<Latin Extended-A>.
505
506There are a few cases where there exists both a script and a block by
507the same name, in these cases the block version has C<Block> appended:
508C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
509
b08cd201
JH
510=head2 Code Point Arguments
511
512A <code point argument> is either a decimal or a hexadecimal scalar,
513or "U+" followed by hexadecimals.
514
10a6ecd2
JH
515=head2 charinrange
516
517In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
518can also test whether a code point is in the I<range> as returned by
519L</charblock> and L</charscript> or as the values of the hash returned
e618509d 520by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 521
1189d1e4 522 use UnicodeCD qw(charscript charinrange);
10a6ecd2
JH
523
524 $range = charscript('Hiragana');
e145285f 525 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
526
527=cut
528
b08cd201
JH
529=head2 compexcl
530
1189d1e4 531 use UnicodeCD 'compexcl';
b08cd201
JH
532
533 my $compexcl = compexcl("09dc");
534
535The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
536character should not be produced during a precomposition) of the
537character specified by a B<code point argument>.
b08cd201
JH
538
539If there is a composition exclusion for the character, true is
540returned. Otherwise, false is returned.
541
542=cut
543
544my %COMPEXCL;
545
546sub _compexcl {
547 unless (%COMPEXCL) {
548 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
549 while (<$COMPEXCLFH>) {
550 if (/^([0-9A-F]+) \# /) {
551 my $code = hex($1);
552 $COMPEXCL{$code} = undef;
553 }
554 }
555 close($COMPEXCLFH);
556 }
557 }
558}
559
560sub compexcl {
561 my $arg = shift;
562 my $code = _getcode($arg);
74f8133e
JH
563 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
564 unless defined $code;
b08cd201
JH
565
566 _compexcl() unless %COMPEXCL;
567
568 return exists $COMPEXCL{$code};
569}
570
571=head2 casefold
572
1189d1e4 573 use UnicodeCD 'casefold';
b08cd201
JH
574
575 my %casefold = casefold("09dc");
576
577The casefold() returns the locale-independent case folding of the
578character specified by a B<code point argument>.
579
580If there is a case folding for that character, a reference to a hash
581with the following fields is returned:
582
583 key
584
585 code code point with at least four hexdigits
586 status "C", "F", "S", or "I"
587 mapping one or more codes separated by spaces
588
589The meaning of the I<status> is as follows:
590
591 C common case folding, common mappings shared
592 by both simple and full mappings
593 F full case folding, mappings that cause strings
594 to grow in length. Multiple characters are separated
595 by spaces
596 S simple case folding, mappings to single characters
597 where different from F
598 I special case for dotted uppercase I and
599 dotless lowercase i
600 - If this mapping is included, the result is
601 case-insensitive, but dotless and dotted I's
602 are not distinguished
603 - If this mapping is excluded, the result is not
604 fully case-insensitive, but dotless and dotted
605 I's are distinguished
606
607If there is no case folding for that character, C<undef> is returned.
608
609For more information about case mappings see
610http://www.unicode.org/unicode/reports/tr21/
611
612=cut
613
614my %CASEFOLD;
615
616sub _casefold {
617 unless (%CASEFOLD) {
618 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
619 while (<$CASEFOLDFH>) {
620 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
621 my $code = hex($1);
622 $CASEFOLD{$code} = { code => $1,
623 status => $2,
624 mapping => $3 };
625 }
626 }
627 close($CASEFOLDFH);
628 }
629 }
630}
631
632sub casefold {
633 my $arg = shift;
634 my $code = _getcode($arg);
74f8133e
JH
635 croak __PACKAGE__, "::casefold: unknown code '$arg'"
636 unless defined $code;
b08cd201
JH
637
638 _casefold() unless %CASEFOLD;
639
640 return $CASEFOLD{$code};
641}
642
643=head2 casespec
644
1189d1e4 645 use UnicodeCD 'casespec';
b08cd201
JH
646
647 my %casespec = casespec("09dc");
648
649The casespec() returns the potentially locale-dependent case mapping
650of the character specified by a B<code point argument>. The mapping
651may change the length of the string (which the basic Unicode case
652mappings as returned by charinfo() never do).
653
654If there is a case folding for that character, a reference to a hash
655with the following fields is returned:
656
657 key
658
659 code code point with at least four hexdigits
660 lower lowercase
661 title titlecase
662 upper uppercase
663 condition condition list (may be undef)
664
665The C<condition> is optional. Where present, it consists of one or
666more I<locales> or I<contexts>, separated by spaces (other than as
667used to separate elements, spaces are to be ignored). A condition
668list overrides the normal behavior if all of the listed conditions are
669true. Case distinctions in the condition list are not significant.
670Conditions preceded by "NON_" represent the negation of the condition
671
672A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
673followed by a "_" and a 2-letter ISO language code (possibly followed
674by a "_" and a variant code). You can find the lists of those codes,
675see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
676
677A I<context> is one of the following choices:
678
679 FINAL The letter is not followed by a letter of
680 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
681 MODERN The mapping is only used for modern text
e618509d 682 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
683
684For more information about case mappings see
685http://www.unicode.org/unicode/reports/tr21/
686
687=cut
688
689my %CASESPEC;
690
691sub _casespec {
692 unless (%CASESPEC) {
693 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
694 while (<$CASESPECFH>) {
695 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
696 my $code = hex($1);
697 $CASESPEC{$code} = { code => $1,
698 lower => $2,
699 title => $3,
700 upper => $4,
701 condition => $5 };
702 }
703 }
704 close($CASESPECFH);
705 }
706 }
707}
708
709sub casespec {
710 my $arg = shift;
711 my $code = _getcode($arg);
74f8133e
JH
712 croak __PACKAGE__, "::casespec: unknown code '$arg'"
713 unless defined $code;
b08cd201
JH
714
715 _casespec() unless %CASESPEC;
716
717 return $CASESPEC{$code};
718}
719
1189d1e4 720=head2 UnicodeCD::UnicodeVersion
10a6ecd2 721
1189d1e4 722UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
10a6ecd2
JH
723Database, in other words, the version of the Unicode standard the
724database implements.
725
726=cut
727
728my $UNICODEVERSION;
729
730sub UnicodeVersion {
731 unless (defined $UNICODEVERSION) {
732 openunicode(\$VERSIONFH, "version");
733 chomp($UNICODEVERSION = <$VERSIONFH>);
734 close($VERSIONFH);
735 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
736 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
737 }
738 return $UNICODEVERSION;
739}
3aa957f9
JH
740
741=head2 Implementation Note
32c16050 742
ad9cab37
JH
743The first use of charinfo() opens a read-only filehandle to the Unicode
744Character Database (the database is included in the Perl distribution).
745The filehandle is then kept open for further queries.
32c16050 746
561c79ed
JH
747=head1 AUTHOR
748
749Jarkko Hietaniemi
750
751=cut
752
7531;