This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Now also allow transforming the read lines before comparing them.
[perl5.git] / lib / UnicodeCD.pm
CommitLineData
1189d1e4 1package UnicodeCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2
JH
12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201
JH
15 charinrange
16 compexcl
17 casefold casespec);
561c79ed
JH
18
19use Carp;
20
21=head1 NAME
22
1189d1e4 23UnicodeCD - Unicode character database
561c79ed
JH
24
25=head1 SYNOPSIS
26
1189d1e4 27 use UnicodeCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
1189d1e4 30 use UnicodeCD 'charblock';
e882dd67
JH
31 my $charblock = charblock($codepoint);
32
1189d1e4 33 use UnicodeCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
e145285f
JH
36 use UnicodeCD 'charblocks';
37 my $charblocks = charblocks();
38
39 use UnicodeCD 'charscripts';
40 my %charscripts = charscripts();
41
42 use UnicodeCD qw(charscript charinrange);
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
46 use UnicodeCD 'compexcl';
47 my $compexcl = compexcl($codepoint);
48
49 my $unicode_version = UnicodeCD::UnicodeVersion();
50
561c79ed
JH
51=head1 DESCRIPTION
52
e145285f 53The UnicodeCD module offers a simple interface to the Unicode Character
561c79ed
JH
54Database.
55
56=cut
57
10a6ecd2
JH
58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201
JH
62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed
JH
65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
72 $f = File::Spec->catfile($d, "unicode", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67
JH
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed
JH
79 }
80 return $f;
81}
82
83=head2 charinfo
84
1189d1e4 85 use UnicodeCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201
JH
89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed
JH
91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
e882dd67 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
32c16050
JH
115The C<block> property is the same as as returned by charinfo(). It is
116not defined in the Unicode Character Database proper (Chapter 4 of the
117Unicode 3.0 Standard) but instead in an auxiliary database (Chapter 14
e882dd67 118of TUS3). Similarly for the C<script> property.
32c16050
JH
119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
123
124=cut
125
10a6ecd2
JH
126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
a6fa416b
ST
138sub han_charname {
139 my $arg = shift;
140 my $code = _getcode($arg);
74f8133e 141 croak __PACKAGE__, "::han_charname: unknown code '$arg'"
a6fa416b
ST
142 unless defined $code;
143 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
144 unless 0x3400 <= $code && $code <= 0x4DB5
145 || 0x4E00 <= $code && $code <= 0x9FA5
146 || 0x20000 <= $code && $code <= 0x2A6D6;
147 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
148}
149
150my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
151 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
152 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
153 );
154
155my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
156 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
157 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
158 "YU", "EU", "YI", "I",
159 );
160
161my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
162 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
163 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
164 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
165 );
166
167my %HangulConst = (
168 SBase => 0xAC00,
169 LBase => 0x1100,
170 VBase => 0x1161,
171 TBase => 0x11A7,
172 LCount => 19, # scalar @JamoL
173 VCount => 21, # scalar @JamoV
174 TCount => 28, # scalar @JamoT
175 NCount => 588, # VCount * TCount
176 SCount => 11172, # LCount * NCount
177 Final => 0xD7A3, # SBase -1 + SCount
178 );
179
180sub hangul_charname {
181 my $arg = shift;
182 my $code = _getcode($arg);
74f8133e 183 croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
a6fa416b
ST
184 unless defined $code;
185 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
186 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
187 my $SIndex = $code - $HangulConst{SBase};
188 my $LIndex = int( $SIndex / $HangulConst{NCount});
189 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
190 my $TIndex = $SIndex % $HangulConst{TCount};
191 return join('',
192 "HANGUL SYLLABLE ",
193 $JamoL[$LIndex],
194 $JamoV[$VIndex],
195 $JamoT[$TIndex],
196 );
197}
198
199sub hangul_decomp {
200 my $arg = shift;
201 my $code = _getcode($arg);
74f8133e 202 croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
a6fa416b
ST
203 unless defined $code;
204 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
205 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
206 my $SIndex = $code - $HangulConst{SBase};
207 my $LIndex = int( $SIndex / $HangulConst{NCount});
208 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
209 my $TIndex = $SIndex % $HangulConst{TCount};
210
211 return join(" ",
212 sprintf("%04X", $HangulConst{LBase} + $LIndex),
213 sprintf("%04X", $HangulConst{VBase} + $VIndex),
214 $TIndex ?
215 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
216 );
217}
218
219my @CharinfoRanges = (
220# block name
221# [ first, last, coderef to name, coderef to decompose ],
222# CJK Ideographs Extension A
223 [ 0x3400, 0x4DB5, \&han_charname, undef ],
224# CJK Ideographs
225 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
226# Hangul Syllables
227 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
228# Non-Private Use High Surrogates
229 [ 0xD800, 0xDB7F, undef, undef ],
230# Private Use High Surrogates
231 [ 0xDB80, 0xDBFF, undef, undef ],
232# Low Surrogates
233 [ 0xDC00, 0xDFFF, undef, undef ],
234# The Private Use Area
235 [ 0xE000, 0xF8FF, undef, undef ],
236# CJK Ideographs Extension B
237 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
238# Plane 15 Private Use Area
239 [ 0xF0000, 0xFFFFD, undef, undef ],
240# Plane 16 Private Use Area
241 [ 0x100000, 0x10FFFD, undef, undef ],
242);
243
561c79ed 244sub charinfo {
10a6ecd2
JH
245 my $arg = shift;
246 my $code = _getcode($arg);
247 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
248 unless defined $code;
561c79ed 249 my $hexk = sprintf("%04X", $code);
a6fa416b
ST
250 my($rcode,$rname,$rdec);
251 foreach my $range (@CharinfoRanges){
74f8133e 252 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b
ST
253 $rcode = $hexk;
254 $rname = $range->[2] ? $range->[2]->($code) : '';
255 $rdec = $range->[3] ? $range->[3]->($code) : '';
74f8133e 256 $hexk = sprintf("%04X", $range->[0]); # replace by the first
a6fa416b
ST
257 last;
258 }
259 }
74f8133e 260 openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2 261 if (defined $UNICODEFH) {
561c79ed 262 use Search::Dict;
39ff46bf 263 if (look($UNICODEFH, "$hexk;")) {
10a6ecd2 264 my $line = <$UNICODEFH>;
561c79ed
JH
265 chomp $line;
266 my %prop;
267 @prop{qw(
268 code name category
269 combining bidi decomposition
270 decimal digit numeric
271 mirrored unicode10 comment
272 upper lower title
273 )} = split(/;/, $line, -1);
274 if ($prop{code} eq $hexk) {
a196fbfd
JH
275 $prop{block} = charblock($code);
276 $prop{script} = charscript($code);
a6fa416b
ST
277 if(defined $rname){
278 $prop{code} = $rcode;
279 $prop{name} = $rname;
280 $prop{decomposition} = $rdec;
281 }
b08cd201 282 return \%prop;
561c79ed
JH
283 }
284 }
285 }
286 return;
287}
288
e882dd67
JH
289sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
290 my ($table, $lo, $hi, $code) = @_;
291
292 return if $lo > $hi;
293
294 my $mid = int(($lo+$hi) / 2);
295
296 if ($table->[$mid]->[0] < $code) {
10a6ecd2 297 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
298 return $table->[$mid]->[2];
299 } else {
300 _search($table, $mid + 1, $hi, $code);
301 }
302 } elsif ($table->[$mid]->[0] > $code) {
303 _search($table, $lo, $mid - 1, $code);
304 } else {
305 return $table->[$mid]->[2];
306 }
307}
308
10a6ecd2
JH
309sub charinrange {
310 my ($range, $arg) = @_;
311 my $code = _getcode($arg);
312 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
313 unless defined $code;
314 _search($range, 0, $#$range, $code);
315}
316
354a27bf 317=head2 charblock
561c79ed 318
1189d1e4 319 use UnicodeCD 'charblock';
561c79ed
JH
320
321 my $charblock = charblock(0x41);
10a6ecd2
JH
322 my $charblock = charblock(1234);
323 my $charblock = charblock("0x263a");
324 my $charblock = charblock("U+263a");
325
326 my $ranges = charblock('Armenian');
327
328With a B<code point argument> charblock() returns the block the character
329belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 330positions within all blocks are defined.
10a6ecd2
JH
331
332If supplied with an argument that can't be a code point, charblock()
333tries to do the opposite and interpret the argument as a character
334block. The return value is a I<range>: an anonymous list that
335contains anonymous lists, which in turn contain I<start-of-range>,
336I<end-of-range> code point pairs. You can test whether a code point
337is in a range using the L</charinrange> function. If the argument is
338not a known charater block, C<undef> is returned.
561c79ed 339
561c79ed
JH
340=cut
341
342my @BLOCKS;
10a6ecd2 343my %BLOCKS;
561c79ed 344
10a6ecd2 345sub _charblocks {
561c79ed 346 unless (@BLOCKS) {
10a6ecd2
JH
347 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
348 while (<$BLOCKSFH>) {
2796c109 349 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
350 my ($lo, $hi) = (hex($1), hex($2));
351 my $subrange = [ $lo, $hi, $3 ];
352 push @BLOCKS, $subrange;
353 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
354 }
355 }
10a6ecd2 356 close($BLOCKSFH);
561c79ed
JH
357 }
358 }
10a6ecd2
JH
359}
360
361sub charblock {
362 my $arg = shift;
363
364 _charblocks() unless @BLOCKS;
365
366 my $code = _getcode($arg);
561c79ed 367
10a6ecd2
JH
368 if (defined $code) {
369 _search(\@BLOCKS, 0, $#BLOCKS, $code);
370 } else {
371 if (exists $BLOCKS{$arg}) {
372 return $BLOCKS{$arg};
373 } else {
374 return;
375 }
376 }
e882dd67
JH
377}
378
379=head2 charscript
380
1189d1e4 381 use UnicodeCD 'charscript';
e882dd67
JH
382
383 my $charscript = charscript(0x41);
10a6ecd2
JH
384 my $charscript = charscript(1234);
385 my $charscript = charscript("U+263a");
e882dd67 386
10a6ecd2
JH
387 my $ranges = charscript('Thai');
388
389With a B<code point argument> charscript() returns the script the
b08cd201 390character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2
JH
391
392If supplied with an argument that can't be a code point, charscript()
393tries to do the opposite and interpret the argument as a character
394script. The return value is a I<range>: an anonymous list that
395contains anonymous lists, which in turn contain I<start-of-range>,
396I<end-of-range> code point pairs. You can test whether a code point
397is in a range using the L</charinrange> function. If the argument is
398not a known charater script, C<undef> is returned.
e882dd67 399
e882dd67
JH
400=cut
401
402my @SCRIPTS;
10a6ecd2 403my %SCRIPTS;
e882dd67 404
10a6ecd2 405sub _charscripts {
e882dd67 406 unless (@SCRIPTS) {
10a6ecd2
JH
407 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
408 while (<$SCRIPTSFH>) {
e882dd67 409 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
410 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
411 my $script = lc($3);
412 $script =~ s/\b(\w)/uc($1)/ge;
413 my $subrange = [ $lo, $hi, $script ];
414 push @SCRIPTS, $subrange;
415 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
416 }
417 }
10a6ecd2 418 close($SCRIPTSFH);
e882dd67
JH
419 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
420 }
421 }
10a6ecd2
JH
422}
423
424sub charscript {
425 my $arg = shift;
426
427 _charscripts() unless @SCRIPTS;
e882dd67 428
10a6ecd2
JH
429 my $code = _getcode($arg);
430
431 if (defined $code) {
432 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
433 } else {
434 if (exists $SCRIPTS{$arg}) {
435 return $SCRIPTS{$arg};
436 } else {
437 return;
438 }
439 }
440}
441
442=head2 charblocks
443
1189d1e4 444 use UnicodeCD 'charblocks';
10a6ecd2 445
b08cd201 446 my $charblocks = charblocks();
10a6ecd2 447
b08cd201
JH
448charblocks() returns a reference to a hash with the known block names
449as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2
JH
450
451=cut
452
453sub charblocks {
b08cd201
JH
454 _charblocks() unless %BLOCKS;
455 return \%BLOCKS;
10a6ecd2
JH
456}
457
458=head2 charscripts
459
1189d1e4 460 use UnicodeCD 'charscripts';
10a6ecd2
JH
461
462 my %charscripts = charscripts();
463
464charscripts() returns a hash with the known script names as the keys,
465and the code point ranges (see L</charscript>) as the values.
466
467=cut
468
469sub charscripts {
b08cd201
JH
470 _charscripts() unless %SCRIPTS;
471 return \%SCRIPTS;
561c79ed
JH
472}
473
10a6ecd2 474=head2 Blocks versus Scripts
ad9cab37 475
10a6ecd2
JH
476The difference between a block and a script is that scripts are closer
477to the linguistic notion of a set of characters required to present
478languages, while block is more of an artifact of the Unicode character
479numbering and separation into blocks of 256 characters.
3aa957f9
JH
480
481For example the Latin B<script> is spread over several B<blocks>, such
482as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
483C<Latin Extended-B>. On the other hand, the Latin script does not
484contain all the characters of the C<Basic Latin> block (also known as
485the ASCII): it includes only the letters, not for example the digits
486or the punctuation.
ad9cab37 487
3aa957f9 488For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
489
490For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
491
3aa957f9
JH
492=head2 Matching Scripts and Blocks
493
494Both scripts and blocks can be matched using the regular expression
495construct C<\p{In...}> and its negation C<\P{In...}>.
496
497The name of the script or the block comes after the C<In>, for example
498C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2
JH
499removed from the names for the C<\p{In...}>, for example
500C<LatinExtendedA> instead of C<Latin Extended-A>.
501
502There are a few cases where there exists both a script and a block by
503the same name, in these cases the block version has C<Block> appended:
504C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
505
b08cd201
JH
506=head2 Code Point Arguments
507
508A <code point argument> is either a decimal or a hexadecimal scalar,
509or "U+" followed by hexadecimals.
510
10a6ecd2
JH
511=head2 charinrange
512
513In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
514can also test whether a code point is in the I<range> as returned by
515L</charblock> and L</charscript> or as the values of the hash returned
e618509d 516by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 517
1189d1e4 518 use UnicodeCD qw(charscript charinrange);
10a6ecd2
JH
519
520 $range = charscript('Hiragana');
e145285f 521 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
522
523=cut
524
b08cd201
JH
525=head2 compexcl
526
1189d1e4 527 use UnicodeCD 'compexcl';
b08cd201
JH
528
529 my $compexcl = compexcl("09dc");
530
531The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
532character should not be produced during a precomposition) of the
533character specified by a B<code point argument>.
b08cd201
JH
534
535If there is a composition exclusion for the character, true is
536returned. Otherwise, false is returned.
537
538=cut
539
540my %COMPEXCL;
541
542sub _compexcl {
543 unless (%COMPEXCL) {
544 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
545 while (<$COMPEXCLFH>) {
546 if (/^([0-9A-F]+) \# /) {
547 my $code = hex($1);
548 $COMPEXCL{$code} = undef;
549 }
550 }
551 close($COMPEXCLFH);
552 }
553 }
554}
555
556sub compexcl {
557 my $arg = shift;
558 my $code = _getcode($arg);
74f8133e
JH
559 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
560 unless defined $code;
b08cd201
JH
561
562 _compexcl() unless %COMPEXCL;
563
564 return exists $COMPEXCL{$code};
565}
566
567=head2 casefold
568
1189d1e4 569 use UnicodeCD 'casefold';
b08cd201
JH
570
571 my %casefold = casefold("09dc");
572
573The casefold() returns the locale-independent case folding of the
574character specified by a B<code point argument>.
575
576If there is a case folding for that character, a reference to a hash
577with the following fields is returned:
578
579 key
580
581 code code point with at least four hexdigits
582 status "C", "F", "S", or "I"
583 mapping one or more codes separated by spaces
584
585The meaning of the I<status> is as follows:
586
587 C common case folding, common mappings shared
588 by both simple and full mappings
589 F full case folding, mappings that cause strings
590 to grow in length. Multiple characters are separated
591 by spaces
592 S simple case folding, mappings to single characters
593 where different from F
594 I special case for dotted uppercase I and
595 dotless lowercase i
596 - If this mapping is included, the result is
597 case-insensitive, but dotless and dotted I's
598 are not distinguished
599 - If this mapping is excluded, the result is not
600 fully case-insensitive, but dotless and dotted
601 I's are distinguished
602
603If there is no case folding for that character, C<undef> is returned.
604
605For more information about case mappings see
606http://www.unicode.org/unicode/reports/tr21/
607
608=cut
609
610my %CASEFOLD;
611
612sub _casefold {
613 unless (%CASEFOLD) {
614 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
615 while (<$CASEFOLDFH>) {
616 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
617 my $code = hex($1);
618 $CASEFOLD{$code} = { code => $1,
619 status => $2,
620 mapping => $3 };
621 }
622 }
623 close($CASEFOLDFH);
624 }
625 }
626}
627
628sub casefold {
629 my $arg = shift;
630 my $code = _getcode($arg);
74f8133e
JH
631 croak __PACKAGE__, "::casefold: unknown code '$arg'"
632 unless defined $code;
b08cd201
JH
633
634 _casefold() unless %CASEFOLD;
635
636 return $CASEFOLD{$code};
637}
638
639=head2 casespec
640
1189d1e4 641 use UnicodeCD 'casespec';
b08cd201
JH
642
643 my %casespec = casespec("09dc");
644
645The casespec() returns the potentially locale-dependent case mapping
646of the character specified by a B<code point argument>. The mapping
647may change the length of the string (which the basic Unicode case
648mappings as returned by charinfo() never do).
649
650If there is a case folding for that character, a reference to a hash
651with the following fields is returned:
652
653 key
654
655 code code point with at least four hexdigits
656 lower lowercase
657 title titlecase
658 upper uppercase
659 condition condition list (may be undef)
660
661The C<condition> is optional. Where present, it consists of one or
662more I<locales> or I<contexts>, separated by spaces (other than as
663used to separate elements, spaces are to be ignored). A condition
664list overrides the normal behavior if all of the listed conditions are
665true. Case distinctions in the condition list are not significant.
666Conditions preceded by "NON_" represent the negation of the condition
667
668A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
669followed by a "_" and a 2-letter ISO language code (possibly followed
670by a "_" and a variant code). You can find the lists of those codes,
671see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
672
673A I<context> is one of the following choices:
674
675 FINAL The letter is not followed by a letter of
676 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
677 MODERN The mapping is only used for modern text
e618509d 678 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
679
680For more information about case mappings see
681http://www.unicode.org/unicode/reports/tr21/
682
683=cut
684
685my %CASESPEC;
686
687sub _casespec {
688 unless (%CASESPEC) {
689 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
690 while (<$CASESPECFH>) {
691 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
692 my $code = hex($1);
693 $CASESPEC{$code} = { code => $1,
694 lower => $2,
695 title => $3,
696 upper => $4,
697 condition => $5 };
698 }
699 }
700 close($CASESPECFH);
701 }
702 }
703}
704
705sub casespec {
706 my $arg = shift;
707 my $code = _getcode($arg);
74f8133e
JH
708 croak __PACKAGE__, "::casespec: unknown code '$arg'"
709 unless defined $code;
b08cd201
JH
710
711 _casespec() unless %CASESPEC;
712
713 return $CASESPEC{$code};
714}
715
1189d1e4 716=head2 UnicodeCD::UnicodeVersion
10a6ecd2 717
1189d1e4 718UnicodeCD::UnicodeVersion() returns the version of the Unicode Character
10a6ecd2
JH
719Database, in other words, the version of the Unicode standard the
720database implements.
721
722=cut
723
724my $UNICODEVERSION;
725
726sub UnicodeVersion {
727 unless (defined $UNICODEVERSION) {
728 openunicode(\$VERSIONFH, "version");
729 chomp($UNICODEVERSION = <$VERSIONFH>);
730 close($VERSIONFH);
731 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
732 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
733 }
734 return $UNICODEVERSION;
735}
3aa957f9
JH
736
737=head2 Implementation Note
32c16050 738
ad9cab37
JH
739The first use of charinfo() opens a read-only filehandle to the Unicode
740Character Database (the database is included in the Perl distribution).
741The filehandle is then kept open for further queries.
32c16050 742
561c79ed
JH
743=head1 AUTHOR
744
745Jarkko Hietaniemi
746
747=cut
748
7491;