This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
UnicodeCD::charinfo
[perl5.git] / lib / UnicodeCD.pm
CommitLineData
1189d1e4 1package UnicodeCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
10a6ecd2 6our $VERSION = '0.1';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
10a6ecd2
JH
11our @EXPORT_OK = qw(charinfo
12 charblock charscript
13 charblocks charscripts
b08cd201
JH
14 charinrange
15 compexcl
16 casefold casespec);
561c79ed
JH
17
18use Carp;
19
20=head1 NAME
21
1189d1e4 22UnicodeCD - Unicode character database
561c79ed
JH
23
24=head1 SYNOPSIS
25
1189d1e4 26 use UnicodeCD 'charinfo';
b08cd201 27 my $charinfo = charinfo($codepoint);
561c79ed 28
1189d1e4 29 use UnicodeCD 'charblock';
e882dd67
JH
30 my $charblock = charblock($codepoint);
31
1189d1e4 32 use UnicodeCD 'charscript';
e882dd67 33 my $charscript = charblock($codepoint);
561c79ed 34
e145285f
JH
35 use UnicodeCD 'charblocks';
36 my $charblocks = charblocks();
37
38 use UnicodeCD 'charscripts';
39 my %charscripts = charscripts();
40
41 use UnicodeCD qw(charscript charinrange);
42 my $range = charscript($script);
43 print "looks like $script\n" if charinrange($range, $codepoint);
44
45 use UnicodeCD 'compexcl';
46 my $compexcl = compexcl($codepoint);
47
48 my $unicode_version = UnicodeCD::UnicodeVersion();
49
561c79ed
JH
50=head1 DESCRIPTION
51
e145285f 52The UnicodeCD module offers a simple interface to the Unicode Character
561c79ed
JH
53Database.
54
55=cut
56
10a6ecd2
JH
57my $UNICODEFH;
58my $BLOCKSFH;
59my $SCRIPTSFH;
60my $VERSIONFH;
b08cd201
JH
61my $COMPEXCLFH;
62my $CASEFOLDFH;
63my $CASESPECFH;
561c79ed
JH
64
65sub openunicode {
66 my ($rfh, @path) = @_;
67 my $f;
68 unless (defined $$rfh) {
69 for my $d (@INC) {
70 use File::Spec;
71 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 72 last if open($$rfh, $f);
e882dd67 73 undef $f;
561c79ed 74 }
e882dd67
JH
75 croak __PACKAGE__, ": failed to find ",
76 File::Spec->catfile(@path), " in @INC"
77 unless defined $f;
561c79ed
JH
78 }
79 return $f;
80}
81
82=head2 charinfo
83
1189d1e4 84 use UnicodeCD 'charinfo';
561c79ed 85
b08cd201 86 my $charinfo = charinfo(0x41);
561c79ed 87
b08cd201
JH
88charinfo() returns a reference to a hash that has the following fields
89as defined by the Unicode standard:
561c79ed
JH
90
91 key
92
93 code code point with at least four hexdigits
94 name name of the character IN UPPER CASE
95 category general category of the character
96 combining classes used in the Canonical Ordering Algorithm
97 bidi bidirectional category
98 decomposition character decomposition mapping
99 decimal if decimal digit this is the integer numeric value
100 digit if digit this is the numeric value
101 numeric if numeric is the integer or rational numeric value
102 mirrored if mirrored in bidirectional text
103 unicode10 Unicode 1.0 name if existed and different
104 comment ISO 10646 comment field
105 upper uppercase equivalent mapping
106 lower lowercase equivalent mapping
107 title titlecase equivalent mapping
e882dd67 108
561c79ed 109 block block the character belongs to (used in \p{In...})
e882dd67 110 script script the character belongs to
561c79ed 111
b08cd201 112If no match is found, a reference to an empty hash is returned.
561c79ed 113
32c16050
JH
114The C<block> property is the same as as returned by charinfo(). It is
115not defined in the Unicode Character Database proper (Chapter 4 of the
116Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 117of TUS3). Similarly for the C<script> property.
32c16050
JH
118
119Note that you cannot do (de)composition and casing based solely on the
120above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 121you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
122
123=cut
124
10a6ecd2
JH
125sub _getcode {
126 my $arg = shift;
127
128 if ($arg =~ /^\d+$/) {
129 return $arg;
130 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
131 return hex($1);
132 }
133
134 return;
135}
136
a6fa416b
ST
137sub han_charname {
138 my $arg = shift;
139 my $code = _getcode($arg);
140 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
141 unless defined $code;
142 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
143 unless 0x3400 <= $code && $code <= 0x4DB5
144 || 0x4E00 <= $code && $code <= 0x9FA5
145 || 0x20000 <= $code && $code <= 0x2A6D6;
146 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
147}
148
149my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
150 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
151 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
152 );
153
154my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
155 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
156 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
157 "YU", "EU", "YI", "I",
158 );
159
160my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
161 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
162 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
163 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
164 );
165
166my %HangulConst = (
167 SBase => 0xAC00,
168 LBase => 0x1100,
169 VBase => 0x1161,
170 TBase => 0x11A7,
171 LCount => 19, # scalar @JamoL
172 VCount => 21, # scalar @JamoV
173 TCount => 28, # scalar @JamoT
174 NCount => 588, # VCount * TCount
175 SCount => 11172, # LCount * NCount
176 Final => 0xD7A3, # SBase -1 + SCount
177 );
178
179sub hangul_charname {
180 my $arg = shift;
181 my $code = _getcode($arg);
182 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
183 unless defined $code;
184 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
185 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
186 my $SIndex = $code - $HangulConst{SBase};
187 my $LIndex = int( $SIndex / $HangulConst{NCount});
188 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
189 my $TIndex = $SIndex % $HangulConst{TCount};
190 return join('',
191 "HANGUL SYLLABLE ",
192 $JamoL[$LIndex],
193 $JamoV[$VIndex],
194 $JamoT[$TIndex],
195 );
196}
197
198sub hangul_decomp {
199 my $arg = shift;
200 my $code = _getcode($arg);
201 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
202 unless defined $code;
203 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
204 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
205 my $SIndex = $code - $HangulConst{SBase};
206 my $LIndex = int( $SIndex / $HangulConst{NCount});
207 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
208 my $TIndex = $SIndex % $HangulConst{TCount};
209
210 return join(" ",
211 sprintf("%04X", $HangulConst{LBase} + $LIndex),
212 sprintf("%04X", $HangulConst{VBase} + $VIndex),
213 $TIndex ?
214 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
215 );
216}
217
218my @CharinfoRanges = (
219# block name
220# [ first, last, coderef to name, coderef to decompose ],
221# CJK Ideographs Extension A
222 [ 0x3400, 0x4DB5, \&han_charname, undef ],
223# CJK Ideographs
224 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
225# Hangul Syllables
226 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
227# Non-Private Use High Surrogates
228 [ 0xD800, 0xDB7F, undef, undef ],
229# Private Use High Surrogates
230 [ 0xDB80, 0xDBFF, undef, undef ],
231# Low Surrogates
232 [ 0xDC00, 0xDFFF, undef, undef ],
233# The Private Use Area
234 [ 0xE000, 0xF8FF, undef, undef ],
235# CJK Ideographs Extension B
236 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
237# Plane 15 Private Use Area
238 [ 0xF0000, 0xFFFFD, undef, undef ],
239# Plane 16 Private Use Area
240 [ 0x100000, 0x10FFFD, undef, undef ],
241);
242
561c79ed 243sub charinfo {
10a6ecd2
JH
244 my $arg = shift;
245 my $code = _getcode($arg);
246 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
247 unless defined $code;
561c79ed 248 my $hexk = sprintf("%04X", $code);
a6fa416b
ST
249 my($rcode,$rname,$rdec);
250 foreach my $range (@CharinfoRanges){
251 if($range->[0] <= $code && $code <= $range->[1]){
252 $rcode = $hexk;
253 $rname = $range->[2] ? $range->[2]->($code) : '';
254 $rdec = $range->[3] ? $range->[3]->($code) : '';
255 $hexk = sprintf("%04X",$range->[0]); # replace by the first
256 last;
257 }
258 }
259 openunicode(\$UNICODEFH, "Unicode.sort"); # sorted
10a6ecd2 260 if (defined $UNICODEFH) {
561c79ed 261 use Search::Dict;
10a6ecd2
JH
262 if (look($UNICODEFH, "$hexk;") >= 0) {
263 my $line = <$UNICODEFH>;
561c79ed
JH
264 chomp $line;
265 my %prop;
266 @prop{qw(
267 code name category
268 combining bidi decomposition
269 decimal digit numeric
270 mirrored unicode10 comment
271 upper lower title
272 )} = split(/;/, $line, -1);
273 if ($prop{code} eq $hexk) {
a196fbfd
JH
274 $prop{block} = charblock($code);
275 $prop{script} = charscript($code);
a6fa416b
ST
276 if(defined $rname){
277 $prop{code} = $rcode;
278 $prop{name} = $rname;
279 $prop{decomposition} = $rdec;
280 }
b08cd201 281 return \%prop;
561c79ed
JH
282 }
283 }
284 }
285 return;
286}
287
e882dd67
JH
288sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
289 my ($table, $lo, $hi, $code) = @_;
290
291 return if $lo > $hi;
292
293 my $mid = int(($lo+$hi) / 2);
294
295 if ($table->[$mid]->[0] < $code) {
10a6ecd2 296 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
297 return $table->[$mid]->[2];
298 } else {
299 _search($table, $mid + 1, $hi, $code);
300 }
301 } elsif ($table->[$mid]->[0] > $code) {
302 _search($table, $lo, $mid - 1, $code);
303 } else {
304 return $table->[$mid]->[2];
305 }
306}
307
10a6ecd2
JH
308sub charinrange {
309 my ($range, $arg) = @_;
310 my $code = _getcode($arg);
311 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
312 unless defined $code;
313 _search($range, 0, $#$range, $code);
314}
315
354a27bf 316=head2 charblock
561c79ed 317
1189d1e4 318 use UnicodeCD 'charblock';
561c79ed
JH
319
320 my $charblock = charblock(0x41);
10a6ecd2
JH
321 my $charblock = charblock(1234);
322 my $charblock = charblock("0x263a");
323 my $charblock = charblock("U+263a");
324
325 my $ranges = charblock('Armenian');
326
327With a B<code point argument> charblock() returns the block the character
328belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 329positions within all blocks are defined.
10a6ecd2
JH
330
331If supplied with an argument that can't be a code point, charblock()
332tries to do the opposite and interpret the argument as a character
333block. The return value is a I<range>: an anonymous list that
334contains anonymous lists, which in turn contain I<start-of-range>,
335I<end-of-range> code point pairs. You can test whether a code point
336is in a range using the L</charinrange> function. If the argument is
337not a known charater block, C<undef> is returned.
561c79ed 338
561c79ed
JH
339=cut
340
341my @BLOCKS;
10a6ecd2 342my %BLOCKS;
561c79ed 343
10a6ecd2 344sub _charblocks {
561c79ed 345 unless (@BLOCKS) {
10a6ecd2
JH
346 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
347 while (<$BLOCKSFH>) {
2796c109 348 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
349 my ($lo, $hi) = (hex($1), hex($2));
350 my $subrange = [ $lo, $hi, $3 ];
351 push @BLOCKS, $subrange;
352 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
353 }
354 }
10a6ecd2 355 close($BLOCKSFH);
561c79ed
JH
356 }
357 }
10a6ecd2
JH
358}
359
360sub charblock {
361 my $arg = shift;
362
363 _charblocks() unless @BLOCKS;
364
365 my $code = _getcode($arg);
561c79ed 366
10a6ecd2
JH
367 if (defined $code) {
368 _search(\@BLOCKS, 0, $#BLOCKS, $code);
369 } else {
370 if (exists $BLOCKS{$arg}) {
371 return $BLOCKS{$arg};
372 } else {
373 return;
374 }
375 }
e882dd67
JH
376}
377
378=head2 charscript
379
1189d1e4 380 use UnicodeCD 'charscript';
e882dd67
JH
381
382 my $charscript = charscript(0x41);
10a6ecd2
JH
383 my $charscript = charscript(1234);
384 my $charscript = charscript("U+263a");
e882dd67 385
10a6ecd2
JH
386 my $ranges = charscript('Thai');
387
388With a B<code point argument> charscript() returns the script the
b08cd201 389character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2
JH
390
391If supplied with an argument that can't be a code point, charscript()
392tries to do the opposite and interpret the argument as a character
393script. The return value is a I<range>: an anonymous list that
394contains anonymous lists, which in turn contain I<start-of-range>,
395I<end-of-range> code point pairs. You can test whether a code point
396is in a range using the L</charinrange> function. If the argument is
397not a known charater script, C<undef> is returned.
e882dd67 398
e882dd67
JH
399=cut
400
401my @SCRIPTS;
10a6ecd2 402my %SCRIPTS;
e882dd67 403
10a6ecd2 404sub _charscripts {
e882dd67 405 unless (@SCRIPTS) {
10a6ecd2
JH
406 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
407 while (<$SCRIPTSFH>) {
e882dd67 408 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
409 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
410 my $script = lc($3);
411 $script =~ s/\b(\w)/uc($1)/ge;
412 my $subrange = [ $lo, $hi, $script ];
413 push @SCRIPTS, $subrange;
414 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
415 }
416 }
10a6ecd2 417 close($SCRIPTSFH);
e882dd67
JH
418 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
419 }
420 }
10a6ecd2
JH
421}
422
423sub charscript {
424 my $arg = shift;
425
426 _charscripts() unless @SCRIPTS;
e882dd67 427
10a6ecd2
JH
428 my $code = _getcode($arg);
429
430 if (defined $code) {
431 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
432 } else {
433 if (exists $SCRIPTS{$arg}) {
434 return $SCRIPTS{$arg};
435 } else {
436 return;
437 }
438 }
439}
440
441=head2 charblocks
442
1189d1e4 443 use UnicodeCD 'charblocks';
10a6ecd2 444
b08cd201 445 my $charblocks = charblocks();
10a6ecd2 446
b08cd201
JH
447charblocks() returns a reference to a hash with the known block names
448as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2
JH
449
450=cut
451
452sub charblocks {
b08cd201
JH
453 _charblocks() unless %BLOCKS;
454 return \%BLOCKS;
10a6ecd2
JH
455}
456
457=head2 charscripts
458
1189d1e4 459 use UnicodeCD 'charscripts';
10a6ecd2
JH
460
461 my %charscripts = charscripts();
462
463charscripts() returns a hash with the known script names as the keys,
464and the code point ranges (see L</charscript>) as the values.
465
466=cut
467
468sub charscripts {
b08cd201
JH
469 _charscripts() unless %SCRIPTS;
470 return \%SCRIPTS;
561c79ed
JH
471}
472
10a6ecd2 473=head2 Blocks versus Scripts
ad9cab37 474
10a6ecd2
JH
475The difference between a block and a script is that scripts are closer
476to the linguistic notion of a set of characters required to present
477languages, while block is more of an artifact of the Unicode character
478numbering and separation into blocks of 256 characters.
3aa957f9
JH
479
480For example the Latin B<script> is spread over several B<blocks>, such
481as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
482C<Latin Extended-B>. On the other hand, the Latin script does not
483contain all the characters of the C<Basic Latin> block (also known as
484the ASCII): it includes only the letters, not for example the digits
485or the punctuation.
ad9cab37 486
3aa957f9 487For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
488
489For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
490
3aa957f9
JH
491=head2 Matching Scripts and Blocks
492
493Both scripts and blocks can be matched using the regular expression
494construct C<\p{In...}> and its negation C<\P{In...}>.
495
496The name of the script or the block comes after the C<In>, for example
497C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2
JH
498removed from the names for the C<\p{In...}>, for example
499C<LatinExtendedA> instead of C<Latin Extended-A>.
500
501There are a few cases where there exists both a script and a block by
502the same name, in these cases the block version has C<Block> appended:
503C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
504
b08cd201
JH
505=head2 Code Point Arguments
506
507A <code point argument> is either a decimal or a hexadecimal scalar,
508or "U+" followed by hexadecimals.
509
10a6ecd2
JH
510=head2 charinrange
511
512In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
513can also test whether a code point is in the I<range> as returned by
514L</charblock> and L</charscript> or as the values of the hash returned
e618509d 515by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 516
1189d1e4 517 use UnicodeCD qw(charscript charinrange);
10a6ecd2
JH
518
519 $range = charscript('Hiragana');
e145285f 520 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
521
522=cut
523
b08cd201
JH
524=head2 compexcl
525
1189d1e4 526 use UnicodeCD 'compexcl';
b08cd201
JH
527
528 my $compexcl = compexcl("09dc");
529
530The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
531character should not be produced during a precomposition) of the
532character specified by a B<code point argument>.
b08cd201
JH
533
534If there is a composition exclusion for the character, true is
535returned. Otherwise, false is returned.
536
537=cut
538
539my %COMPEXCL;
540
541sub _compexcl {
542 unless (%COMPEXCL) {
543 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
544 while (<$COMPEXCLFH>) {
545 if (/^([0-9A-F]+) \# /) {
546 my $code = hex($1);
547 $COMPEXCL{$code} = undef;
548 }
549 }
550 close($COMPEXCLFH);
551 }
552 }
553}
554
555sub compexcl {
556 my $arg = shift;
557 my $code = _getcode($arg);
558
559 _compexcl() unless %COMPEXCL;
560
561 return exists $COMPEXCL{$code};
562}
563
564=head2 casefold
565
1189d1e4 566 use UnicodeCD 'casefold';
b08cd201
JH
567
568 my %casefold = casefold("09dc");
569
570The casefold() returns the locale-independent case folding of the
571character specified by a B<code point argument>.
572
573If there is a case folding for that character, a reference to a hash
574with the following fields is returned:
575
576 key
577
578 code code point with at least four hexdigits
579 status "C", "F", "S", or "I"
580 mapping one or more codes separated by spaces
581
582The meaning of the I<status> is as follows:
583
584 C common case folding, common mappings shared
585 by both simple and full mappings
586 F full case folding, mappings that cause strings
587 to grow in length. Multiple characters are separated
588 by spaces
589 S simple case folding, mappings to single characters
590 where different from F
591 I special case for dotted uppercase I and
592 dotless lowercase i
593 - If this mapping is included, the result is
594 case-insensitive, but dotless and dotted I's
595 are not distinguished
596 - If this mapping is excluded, the result is not
597 fully case-insensitive, but dotless and dotted
598 I's are distinguished
599
600If there is no case folding for that character, C<undef> is returned.
601
602For more information about case mappings see
603http://www.unicode.org/unicode/reports/tr21/
604
605=cut
606
607my %CASEFOLD;
608
609sub _casefold {
610 unless (%CASEFOLD) {
611 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
612 while (<$CASEFOLDFH>) {
613 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
614 my $code = hex($1);
615 $CASEFOLD{$code} = { code => $1,
616 status => $2,
617 mapping => $3 };
618 }
619 }
620 close($CASEFOLDFH);
621 }
622 }
623}
624
625sub casefold {
626 my $arg = shift;
627 my $code = _getcode($arg);
628
629 _casefold() unless %CASEFOLD;
630
631 return $CASEFOLD{$code};
632}
633
634=head2 casespec
635
1189d1e4 636 use UnicodeCD 'casespec';
b08cd201
JH
637
638 my %casespec = casespec("09dc");
639
640The casespec() returns the potentially locale-dependent case mapping
641of the character specified by a B<code point argument>. The mapping
642may change the length of the string (which the basic Unicode case
643mappings as returned by charinfo() never do).
644
645If there is a case folding for that character, a reference to a hash
646with the following fields is returned:
647
648 key
649
650 code code point with at least four hexdigits
651 lower lowercase
652 title titlecase
653 upper uppercase
654 condition condition list (may be undef)
655
656The C<condition> is optional. Where present, it consists of one or
657more I<locales> or I<contexts>, separated by spaces (other than as
658used to separate elements, spaces are to be ignored). A condition
659list overrides the normal behavior if all of the listed conditions are
660true. Case distinctions in the condition list are not significant.
661Conditions preceded by "NON_" represent the negation of the condition
662
663A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
664followed by a "_" and a 2-letter ISO language code (possibly followed
665by a "_" and a variant code). You can find the lists of those codes,
666see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
667
668A I<context> is one of the following choices:
669
670 FINAL The letter is not followed by a letter of
671 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
672 MODERN The mapping is only used for modern text
e618509d 673 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
674
675For more information about case mappings see
676http://www.unicode.org/unicode/reports/tr21/
677
678=cut
679
680my %CASESPEC;
681
682sub _casespec {
683 unless (%CASESPEC) {
684 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
685 while (<$CASESPECFH>) {
686 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
687 my $code = hex($1);
688 $CASESPEC{$code} = { code => $1,
689 lower => $2,
690 title => $3,
691 upper => $4,
692 condition => $5 };
693 }
694 }
695 close($CASESPECFH);
696 }
697 }
698}
699
700sub casespec {
701 my $arg = shift;
702 my $code = _getcode($arg);
703
704 _casespec() unless %CASESPEC;
705
706 return $CASESPEC{$code};
707}
708
1189d1e4 709=head2 UnicodeCD::UnicodeVersion
10a6ecd2 710
1189d1e4 711UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
10a6ecd2
JH
712Database, in other words, the version of the Unicode standard the
713database implements.
714
715=cut
716
717my $UNICODEVERSION;
718
719sub UnicodeVersion {
720 unless (defined $UNICODEVERSION) {
721 openunicode(\$VERSIONFH, "version");
722 chomp($UNICODEVERSION = <$VERSIONFH>);
723 close($VERSIONFH);
724 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
725 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
726 }
727 return $UNICODEVERSION;
728}
3aa957f9
JH
729
730=head2 Implementation Note
32c16050 731
ad9cab37
JH
732The first use of charinfo() opens a read-only filehandle to the Unicode
733Character Database (the database is included in the Perl distribution).
734The filehandle is then kept open for further queries.
32c16050 735
561c79ed
JH
736=head1 AUTHOR
737
738Jarkko Hietaniemi
739
740=cut
741
7421;