This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Test miscounting.
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2
JH
12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201
JH
15 charinrange
16 compexcl
17 casefold casespec);
561c79ed
JH
18
19use Carp;
20
21=head1 NAME
22
55d7b906 23Unicode::UCD - Unicode character database
561c79ed
JH
24
25=head1 SYNOPSIS
26
55d7b906 27 use Unicode::UCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
55d7b906 30 use Unicode::UCD 'charblock';
e882dd67
JH
31 my $charblock = charblock($codepoint);
32
55d7b906 33 use Unicode::UCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
55d7b906 36 use Unicode::UCD 'charblocks';
e145285f
JH
37 my $charblocks = charblocks();
38
55d7b906 39 use Unicode::UCD 'charscripts';
e145285f
JH
40 my %charscripts = charscripts();
41
55d7b906 42 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
55d7b906 46 use Unicode::UCD 'compexcl';
e145285f
JH
47 my $compexcl = compexcl($codepoint);
48
55d7b906 49 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 50
561c79ed
JH
51=head1 DESCRIPTION
52
8b731da2
JH
53The Unicode::UCD module offers a simple interface to the Unicode
54Character Database.
561c79ed
JH
55
56=cut
57
10a6ecd2
JH
58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201
JH
62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed
JH
65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
55d7b906 72 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67
JH
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed
JH
79 }
80 return $f;
81}
82
83=head2 charinfo
84
55d7b906 85 use Unicode::UCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201
JH
89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed
JH
91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
e882dd67 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
32c16050
JH
115The C<block> property is the same as as returned by charinfo(). It is
116not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2
JH
117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050
JH
119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
123
124=cut
125
10a6ecd2
JH
126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
9087a70b
ST
138use Lingua::KO::Hangul::Util;
139
140sub hangul_decomp { # internal: called from charinfo
141 my @tmp = decomposeHangul(shift);
142 return
143 @tmp == 2 ? sprintf("%04X %04X", @tmp) :
144 @tmp == 3 ? sprintf("%04X %04X %04X", @tmp) :
145 undef;
a6fa416b
ST
146}
147
9087a70b
ST
148sub han_charname { # internal: called from charinfo
149 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b
ST
150}
151
152my @CharinfoRanges = (
153# block name
154# [ first, last, coderef to name, coderef to decompose ],
155# CJK Ideographs Extension A
156 [ 0x3400, 0x4DB5, \&han_charname, undef ],
157# CJK Ideographs
158 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
159# Hangul Syllables
9087a70b 160 [ 0xAC00, 0xD7A3, \&getHangulName, \&hangul_decomp ],
a6fa416b
ST
161# Non-Private Use High Surrogates
162 [ 0xD800, 0xDB7F, undef, undef ],
163# Private Use High Surrogates
164 [ 0xDB80, 0xDBFF, undef, undef ],
165# Low Surrogates
166 [ 0xDC00, 0xDFFF, undef, undef ],
167# The Private Use Area
168 [ 0xE000, 0xF8FF, undef, undef ],
169# CJK Ideographs Extension B
170 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
171# Plane 15 Private Use Area
172 [ 0xF0000, 0xFFFFD, undef, undef ],
173# Plane 16 Private Use Area
174 [ 0x100000, 0x10FFFD, undef, undef ],
175);
176
561c79ed 177sub charinfo {
10a6ecd2
JH
178 my $arg = shift;
179 my $code = _getcode($arg);
180 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
181 unless defined $code;
e63dbbf9 182 my $hexk = sprintf("%06X", $code);
a6fa416b
ST
183 my($rcode,$rname,$rdec);
184 foreach my $range (@CharinfoRanges){
74f8133e 185 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 186 $rcode = $hexk;
e63dbbf9
JH
187 $rcode =~ s/^0+//;
188 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
ST
189 $rname = $range->[2] ? $range->[2]->($code) : '';
190 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 191 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
ST
192 last;
193 }
194 }
74f8133e 195 openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2 196 if (defined $UNICODEFH) {
e63dbbf9
JH
197 use Search::Dict 1.02;
198 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 199 my $line = <$UNICODEFH>;
561c79ed
JH
200 chomp $line;
201 my %prop;
202 @prop{qw(
203 code name category
204 combining bidi decomposition
205 decimal digit numeric
206 mirrored unicode10 comment
207 upper lower title
208 )} = split(/;/, $line, -1);
e63dbbf9
JH
209 $hexk =~ s/^0+//;
210 $hexk = sprintf("%04X", hex($hexk));
561c79ed 211 if ($prop{code} eq $hexk) {
a196fbfd
JH
212 $prop{block} = charblock($code);
213 $prop{script} = charscript($code);
a6fa416b
ST
214 if(defined $rname){
215 $prop{code} = $rcode;
216 $prop{name} = $rname;
217 $prop{decomposition} = $rdec;
218 }
b08cd201 219 return \%prop;
561c79ed
JH
220 }
221 }
222 }
223 return;
224}
225
e882dd67
JH
226sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
227 my ($table, $lo, $hi, $code) = @_;
228
229 return if $lo > $hi;
230
231 my $mid = int(($lo+$hi) / 2);
232
233 if ($table->[$mid]->[0] < $code) {
10a6ecd2 234 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
235 return $table->[$mid]->[2];
236 } else {
237 _search($table, $mid + 1, $hi, $code);
238 }
239 } elsif ($table->[$mid]->[0] > $code) {
240 _search($table, $lo, $mid - 1, $code);
241 } else {
242 return $table->[$mid]->[2];
243 }
244}
245
10a6ecd2
JH
246sub charinrange {
247 my ($range, $arg) = @_;
248 my $code = _getcode($arg);
249 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
250 unless defined $code;
251 _search($range, 0, $#$range, $code);
252}
253
354a27bf 254=head2 charblock
561c79ed 255
55d7b906 256 use Unicode::UCD 'charblock';
561c79ed
JH
257
258 my $charblock = charblock(0x41);
10a6ecd2
JH
259 my $charblock = charblock(1234);
260 my $charblock = charblock("0x263a");
261 my $charblock = charblock("U+263a");
262
78bf21c2 263 my $range = charblock('Armenian');
10a6ecd2 264
78bf21c2 265With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 266belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 267positions within all blocks are defined.
10a6ecd2 268
78bf21c2
JH
269See also L</Blocks versus Scripts>.
270
10a6ecd2
JH
271If supplied with an argument that can't be a code point, charblock()
272tries to do the opposite and interpret the argument as a character
273block. The return value is a I<range>: an anonymous list that
274contains anonymous lists, which in turn contain I<start-of-range>,
275I<end-of-range> code point pairs. You can test whether a code point
276is in a range using the L</charinrange> function. If the argument is
277not a known charater block, C<undef> is returned.
561c79ed 278
561c79ed
JH
279=cut
280
281my @BLOCKS;
10a6ecd2 282my %BLOCKS;
561c79ed 283
10a6ecd2 284sub _charblocks {
561c79ed 285 unless (@BLOCKS) {
10a6ecd2
JH
286 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
287 while (<$BLOCKSFH>) {
2796c109 288 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
289 my ($lo, $hi) = (hex($1), hex($2));
290 my $subrange = [ $lo, $hi, $3 ];
291 push @BLOCKS, $subrange;
292 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
293 }
294 }
10a6ecd2 295 close($BLOCKSFH);
561c79ed
JH
296 }
297 }
10a6ecd2
JH
298}
299
300sub charblock {
301 my $arg = shift;
302
303 _charblocks() unless @BLOCKS;
304
305 my $code = _getcode($arg);
561c79ed 306
10a6ecd2
JH
307 if (defined $code) {
308 _search(\@BLOCKS, 0, $#BLOCKS, $code);
309 } else {
310 if (exists $BLOCKS{$arg}) {
311 return $BLOCKS{$arg};
312 } else {
313 return;
314 }
315 }
e882dd67
JH
316}
317
318=head2 charscript
319
55d7b906 320 use Unicode::UCD 'charscript';
e882dd67
JH
321
322 my $charscript = charscript(0x41);
10a6ecd2
JH
323 my $charscript = charscript(1234);
324 my $charscript = charscript("U+263a");
e882dd67 325
78bf21c2 326 my $range = charscript('Thai');
10a6ecd2 327
78bf21c2 328With a B<code point argument> charscript() returns the I<script> the
b08cd201 329character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 330
78bf21c2
JH
331See also L</Blocks versus Scripts>.
332
10a6ecd2
JH
333If supplied with an argument that can't be a code point, charscript()
334tries to do the opposite and interpret the argument as a character
335script. The return value is a I<range>: an anonymous list that
336contains anonymous lists, which in turn contain I<start-of-range>,
337I<end-of-range> code point pairs. You can test whether a code point
338is in a range using the L</charinrange> function. If the argument is
339not a known charater script, C<undef> is returned.
e882dd67 340
e882dd67
JH
341=cut
342
343my @SCRIPTS;
10a6ecd2 344my %SCRIPTS;
e882dd67 345
10a6ecd2 346sub _charscripts {
e882dd67 347 unless (@SCRIPTS) {
10a6ecd2
JH
348 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
349 while (<$SCRIPTSFH>) {
e882dd67 350 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
351 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
352 my $script = lc($3);
353 $script =~ s/\b(\w)/uc($1)/ge;
354 my $subrange = [ $lo, $hi, $script ];
355 push @SCRIPTS, $subrange;
356 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
357 }
358 }
10a6ecd2 359 close($SCRIPTSFH);
e882dd67
JH
360 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
361 }
362 }
10a6ecd2
JH
363}
364
365sub charscript {
366 my $arg = shift;
367
368 _charscripts() unless @SCRIPTS;
e882dd67 369
10a6ecd2
JH
370 my $code = _getcode($arg);
371
372 if (defined $code) {
373 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
374 } else {
375 if (exists $SCRIPTS{$arg}) {
376 return $SCRIPTS{$arg};
377 } else {
378 return;
379 }
380 }
381}
382
383=head2 charblocks
384
55d7b906 385 use Unicode::UCD 'charblocks';
10a6ecd2 386
b08cd201 387 my $charblocks = charblocks();
10a6ecd2 388
b08cd201
JH
389charblocks() returns a reference to a hash with the known block names
390as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 391
78bf21c2
JH
392See also L</Blocks versus Scripts>.
393
10a6ecd2
JH
394=cut
395
396sub charblocks {
b08cd201
JH
397 _charblocks() unless %BLOCKS;
398 return \%BLOCKS;
10a6ecd2
JH
399}
400
401=head2 charscripts
402
55d7b906 403 use Unicode::UCD 'charscripts';
10a6ecd2
JH
404
405 my %charscripts = charscripts();
406
407charscripts() returns a hash with the known script names as the keys,
408and the code point ranges (see L</charscript>) as the values.
409
78bf21c2
JH
410See also L</Blocks versus Scripts>.
411
10a6ecd2
JH
412=cut
413
414sub charscripts {
b08cd201
JH
415 _charscripts() unless %SCRIPTS;
416 return \%SCRIPTS;
561c79ed
JH
417}
418
10a6ecd2 419=head2 Blocks versus Scripts
ad9cab37 420
10a6ecd2
JH
421The difference between a block and a script is that scripts are closer
422to the linguistic notion of a set of characters required to present
423languages, while block is more of an artifact of the Unicode character
424numbering and separation into blocks of 256 characters.
3aa957f9
JH
425
426For example the Latin B<script> is spread over several B<blocks>, such
427as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
428C<Latin Extended-B>. On the other hand, the Latin script does not
429contain all the characters of the C<Basic Latin> block (also known as
430the ASCII): it includes only the letters, not for example the digits
431or the punctuation.
ad9cab37 432
3aa957f9 433For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
434
435For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
436
3aa957f9
JH
437=head2 Matching Scripts and Blocks
438
439Both scripts and blocks can be matched using the regular expression
440construct C<\p{In...}> and its negation C<\P{In...}>.
441
442The name of the script or the block comes after the C<In>, for example
443C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2
JH
444removed from the names for the C<\p{In...}>, for example
445C<LatinExtendedA> instead of C<Latin Extended-A>.
446
78bf21c2
JH
447There are a few cases where there is both a script and a block by the
448same name, in these cases the block version has C<Block> appended to
449its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
450the block.
10a6ecd2 451
b08cd201
JH
452=head2 Code Point Arguments
453
78bf21c2
JH
454A <code point argument> is either a decimal or a hexadecimal scalar
455designating a Unicode character, or "U+" followed by hexadecimals
456designating a Unicode character. Note that Unicode is B<not> limited
457to 16 bits (the number of Unicode characters is open-ended, in theory
458unlimited): you may have more than 4 hexdigits.
b08cd201 459
10a6ecd2
JH
460=head2 charinrange
461
462In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
463can also test whether a code point is in the I<range> as returned by
464L</charblock> and L</charscript> or as the values of the hash returned
e618509d 465by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 466
55d7b906 467 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
468
469 $range = charscript('Hiragana');
e145285f 470 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
471
472=cut
473
b08cd201
JH
474=head2 compexcl
475
55d7b906 476 use Unicode::UCD 'compexcl';
b08cd201
JH
477
478 my $compexcl = compexcl("09dc");
479
480The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
481character should not be produced during a precomposition) of the
482character specified by a B<code point argument>.
b08cd201
JH
483
484If there is a composition exclusion for the character, true is
485returned. Otherwise, false is returned.
486
487=cut
488
489my %COMPEXCL;
490
491sub _compexcl {
492 unless (%COMPEXCL) {
493 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
494 while (<$COMPEXCLFH>) {
495 if (/^([0-9A-F]+) \# /) {
496 my $code = hex($1);
497 $COMPEXCL{$code} = undef;
498 }
499 }
500 close($COMPEXCLFH);
501 }
502 }
503}
504
505sub compexcl {
506 my $arg = shift;
507 my $code = _getcode($arg);
74f8133e
JH
508 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
509 unless defined $code;
b08cd201
JH
510
511 _compexcl() unless %COMPEXCL;
512
513 return exists $COMPEXCL{$code};
514}
515
516=head2 casefold
517
55d7b906 518 use Unicode::UCD 'casefold';
b08cd201
JH
519
520 my %casefold = casefold("09dc");
521
522The casefold() returns the locale-independent case folding of the
523character specified by a B<code point argument>.
524
525If there is a case folding for that character, a reference to a hash
526with the following fields is returned:
527
528 key
529
530 code code point with at least four hexdigits
531 status "C", "F", "S", or "I"
532 mapping one or more codes separated by spaces
533
534The meaning of the I<status> is as follows:
535
536 C common case folding, common mappings shared
537 by both simple and full mappings
538 F full case folding, mappings that cause strings
539 to grow in length. Multiple characters are separated
540 by spaces
541 S simple case folding, mappings to single characters
542 where different from F
543 I special case for dotted uppercase I and
544 dotless lowercase i
545 - If this mapping is included, the result is
546 case-insensitive, but dotless and dotted I's
547 are not distinguished
548 - If this mapping is excluded, the result is not
549 fully case-insensitive, but dotless and dotted
550 I's are distinguished
551
552If there is no case folding for that character, C<undef> is returned.
553
554For more information about case mappings see
555http://www.unicode.org/unicode/reports/tr21/
556
557=cut
558
559my %CASEFOLD;
560
561sub _casefold {
562 unless (%CASEFOLD) {
563 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
564 while (<$CASEFOLDFH>) {
565 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
566 my $code = hex($1);
567 $CASEFOLD{$code} = { code => $1,
568 status => $2,
569 mapping => $3 };
570 }
571 }
572 close($CASEFOLDFH);
573 }
574 }
575}
576
577sub casefold {
578 my $arg = shift;
579 my $code = _getcode($arg);
74f8133e
JH
580 croak __PACKAGE__, "::casefold: unknown code '$arg'"
581 unless defined $code;
b08cd201
JH
582
583 _casefold() unless %CASEFOLD;
584
585 return $CASEFOLD{$code};
586}
587
588=head2 casespec
589
55d7b906 590 use Unicode::UCD 'casespec';
b08cd201
JH
591
592 my %casespec = casespec("09dc");
593
594The casespec() returns the potentially locale-dependent case mapping
595of the character specified by a B<code point argument>. The mapping
596may change the length of the string (which the basic Unicode case
597mappings as returned by charinfo() never do).
598
599If there is a case folding for that character, a reference to a hash
600with the following fields is returned:
601
602 key
603
604 code code point with at least four hexdigits
605 lower lowercase
606 title titlecase
607 upper uppercase
608 condition condition list (may be undef)
609
610The C<condition> is optional. Where present, it consists of one or
611more I<locales> or I<contexts>, separated by spaces (other than as
612used to separate elements, spaces are to be ignored). A condition
613list overrides the normal behavior if all of the listed conditions are
614true. Case distinctions in the condition list are not significant.
615Conditions preceded by "NON_" represent the negation of the condition
616
f499c386
JH
617Note that when there are multiple case folding definitions for a
618single code point because of different locales, the value returned by
619casespec() is a hash reference which has the locales as the keys and
620hash references as described above as the values.
621
b08cd201 622A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
623followed by a "_" and a 2-letter ISO language code (possibly followed
624by a "_" and a variant code). You can find the lists of those codes,
625see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
626
627A I<context> is one of the following choices:
628
629 FINAL The letter is not followed by a letter of
630 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
631 MODERN The mapping is only used for modern text
e618509d 632 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
633
634For more information about case mappings see
635http://www.unicode.org/unicode/reports/tr21/
636
637=cut
638
639my %CASESPEC;
640
641sub _casespec {
642 unless (%CASESPEC) {
643 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
644 while (<$CASESPECFH>) {
645 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
646 my ($hexcode, $lower, $title, $upper, $condition) =
647 ($1, $2, $3, $4, $5);
648 my $code = hex($hexcode);
649 if (exists $CASESPEC{$code}) {
650 if (exists $CASESPEC{$code}->{code}) {
651 my ($oldlower,
652 $oldtitle,
653 $oldupper,
654 $oldcondition) =
655 @{$CASESPEC{$code}}{qw(lower
656 title
657 upper
658 condition)};
659 my ($oldlocale) =
660 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
661 if (defined $oldlocale) {
662 delete $CASESPEC{$code};
663 $CASESPEC{$code}->{$oldlocale} =
664 { code => $hexcode,
665 lower => $oldlower,
666 title => $oldtitle,
667 upper => $oldupper,
668 condition => $oldcondition };
669 } else {
670 warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
671 }
672 }
673 my ($locale) =
674 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
675 $CASESPEC{$code}->{$locale} =
676 { code => $hexcode,
677 lower => $lower,
678 title => $title,
679 upper => $upper,
680 condition => $condition };
681 } else {
682 $CASESPEC{$code} =
683 { code => $hexcode,
684 lower => $lower,
685 title => $title,
686 upper => $upper,
687 condition => $condition };
688 }
b08cd201
JH
689 }
690 }
691 close($CASESPECFH);
692 }
693 }
694}
695
696sub casespec {
697 my $arg = shift;
698 my $code = _getcode($arg);
74f8133e
JH
699 croak __PACKAGE__, "::casespec: unknown code '$arg'"
700 unless defined $code;
b08cd201
JH
701
702 _casespec() unless %CASESPEC;
703
704 return $CASESPEC{$code};
705}
706
55d7b906 707=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 708
55d7b906
JH
709Unicode::UCD::UnicodeVersion() returns the version of the Unicode
710Character Database, in other words, the version of the Unicode
78bf21c2
JH
711standard the database implements. The version is a string
712of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
713
714=cut
715
716my $UNICODEVERSION;
717
718sub UnicodeVersion {
719 unless (defined $UNICODEVERSION) {
720 openunicode(\$VERSIONFH, "version");
721 chomp($UNICODEVERSION = <$VERSIONFH>);
722 close($VERSIONFH);
723 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
724 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
725 }
726 return $UNICODEVERSION;
727}
3aa957f9
JH
728
729=head2 Implementation Note
32c16050 730
ad9cab37
JH
731The first use of charinfo() opens a read-only filehandle to the Unicode
732Character Database (the database is included in the Perl distribution).
78bf21c2
JH
733The filehandle is then kept open for further queries. In other words,
734if you are wondering where one of your filehandles went, that's where.
32c16050 735
8b731da2
JH
736=head1 BUGS
737
738Does not yet support EBCDIC platforms.
739
561c79ed
JH
740=head1 AUTHOR
741
742Jarkko Hietaniemi
743
744=cut
745
7461;