This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Change #11828 wasn't complete, this updates to intest path
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
5
74f8133e 6our $VERSION = '0.2';
561c79ed
JH
7
8require Exporter;
9
10our @ISA = qw(Exporter);
74f8133e 11
10a6ecd2
JH
12our @EXPORT_OK = qw(charinfo
13 charblock charscript
14 charblocks charscripts
b08cd201
JH
15 charinrange
16 compexcl
17 casefold casespec);
561c79ed
JH
18
19use Carp;
20
21=head1 NAME
22
55d7b906 23Unicode::UCD - Unicode character database
561c79ed
JH
24
25=head1 SYNOPSIS
26
55d7b906 27 use Unicode::UCD 'charinfo';
b08cd201 28 my $charinfo = charinfo($codepoint);
561c79ed 29
55d7b906 30 use Unicode::UCD 'charblock';
e882dd67
JH
31 my $charblock = charblock($codepoint);
32
55d7b906 33 use Unicode::UCD 'charscript';
e882dd67 34 my $charscript = charblock($codepoint);
561c79ed 35
55d7b906 36 use Unicode::UCD 'charblocks';
e145285f
JH
37 my $charblocks = charblocks();
38
55d7b906 39 use Unicode::UCD 'charscripts';
e145285f
JH
40 my %charscripts = charscripts();
41
55d7b906 42 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
43 my $range = charscript($script);
44 print "looks like $script\n" if charinrange($range, $codepoint);
45
55d7b906 46 use Unicode::UCD 'compexcl';
e145285f
JH
47 my $compexcl = compexcl($codepoint);
48
55d7b906 49 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 50
561c79ed
JH
51=head1 DESCRIPTION
52
55d7b906 53The Unicode::UCD module offers a simple interface to the Unicode Character
561c79ed
JH
54Database.
55
56=cut
57
10a6ecd2
JH
58my $UNICODEFH;
59my $BLOCKSFH;
60my $SCRIPTSFH;
61my $VERSIONFH;
b08cd201
JH
62my $COMPEXCLFH;
63my $CASEFOLDFH;
64my $CASESPECFH;
561c79ed
JH
65
66sub openunicode {
67 my ($rfh, @path) = @_;
68 my $f;
69 unless (defined $$rfh) {
70 for my $d (@INC) {
71 use File::Spec;
55d7b906 72 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 73 last if open($$rfh, $f);
e882dd67 74 undef $f;
561c79ed 75 }
e882dd67
JH
76 croak __PACKAGE__, ": failed to find ",
77 File::Spec->catfile(@path), " in @INC"
78 unless defined $f;
561c79ed
JH
79 }
80 return $f;
81}
82
83=head2 charinfo
84
55d7b906 85 use Unicode::UCD 'charinfo';
561c79ed 86
b08cd201 87 my $charinfo = charinfo(0x41);
561c79ed 88
b08cd201
JH
89charinfo() returns a reference to a hash that has the following fields
90as defined by the Unicode standard:
561c79ed
JH
91
92 key
93
94 code code point with at least four hexdigits
95 name name of the character IN UPPER CASE
96 category general category of the character
97 combining classes used in the Canonical Ordering Algorithm
98 bidi bidirectional category
99 decomposition character decomposition mapping
100 decimal if decimal digit this is the integer numeric value
101 digit if digit this is the numeric value
102 numeric if numeric is the integer or rational numeric value
103 mirrored if mirrored in bidirectional text
104 unicode10 Unicode 1.0 name if existed and different
105 comment ISO 10646 comment field
106 upper uppercase equivalent mapping
107 lower lowercase equivalent mapping
108 title titlecase equivalent mapping
e882dd67 109
561c79ed 110 block block the character belongs to (used in \p{In...})
e882dd67 111 script script the character belongs to
561c79ed 112
b08cd201 113If no match is found, a reference to an empty hash is returned.
561c79ed 114
32c16050
JH
115The C<block> property is the same as as returned by charinfo(). It is
116not defined in the Unicode Character Database proper (Chapter 4 of the
78bf21c2
JH
117Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
118(Chapter 14 of TUS3). Similarly for the C<script> property.
32c16050
JH
119
120Note that you cannot do (de)composition and casing based solely on the
121above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
b08cd201 122you will need also the compexcl(), casefold(), and casespec() functions.
561c79ed
JH
123
124=cut
125
10a6ecd2
JH
126sub _getcode {
127 my $arg = shift;
128
129 if ($arg =~ /^\d+$/) {
130 return $arg;
131 } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
132 return hex($1);
133 }
134
135 return;
136}
137
a6fa416b
ST
138sub han_charname {
139 my $arg = shift;
140 my $code = _getcode($arg);
74f8133e 141 croak __PACKAGE__, "::han_charname: unknown code '$arg'"
a6fa416b
ST
142 unless defined $code;
143 croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
144 unless 0x3400 <= $code && $code <= 0x4DB5
145 || 0x4E00 <= $code && $code <= 0x9FA5
146 || 0x20000 <= $code && $code <= 0x2A6D6;
147 sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
148}
149
150my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
151 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
152 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
153 );
154
155my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
156 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
157 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
158 "YU", "EU", "YI", "I",
159 );
160
161my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
162 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
163 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
164 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
165 );
166
167my %HangulConst = (
168 SBase => 0xAC00,
169 LBase => 0x1100,
170 VBase => 0x1161,
171 TBase => 0x11A7,
172 LCount => 19, # scalar @JamoL
173 VCount => 21, # scalar @JamoV
174 TCount => 28, # scalar @JamoT
175 NCount => 588, # VCount * TCount
176 SCount => 11172, # LCount * NCount
177 Final => 0xD7A3, # SBase -1 + SCount
178 );
179
180sub hangul_charname {
181 my $arg = shift;
182 my $code = _getcode($arg);
74f8133e 183 croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
a6fa416b
ST
184 unless defined $code;
185 croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
186 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
187 my $SIndex = $code - $HangulConst{SBase};
188 my $LIndex = int( $SIndex / $HangulConst{NCount});
189 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
190 my $TIndex = $SIndex % $HangulConst{TCount};
191 return join('',
192 "HANGUL SYLLABLE ",
193 $JamoL[$LIndex],
194 $JamoV[$VIndex],
195 $JamoT[$TIndex],
196 );
197}
198
199sub hangul_decomp {
200 my $arg = shift;
201 my $code = _getcode($arg);
74f8133e 202 croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
a6fa416b
ST
203 unless defined $code;
204 croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
205 unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
206 my $SIndex = $code - $HangulConst{SBase};
207 my $LIndex = int( $SIndex / $HangulConst{NCount});
208 my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
209 my $TIndex = $SIndex % $HangulConst{TCount};
210
211 return join(" ",
212 sprintf("%04X", $HangulConst{LBase} + $LIndex),
213 sprintf("%04X", $HangulConst{VBase} + $VIndex),
214 $TIndex ?
215 sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
216 );
217}
218
219my @CharinfoRanges = (
220# block name
221# [ first, last, coderef to name, coderef to decompose ],
222# CJK Ideographs Extension A
223 [ 0x3400, 0x4DB5, \&han_charname, undef ],
224# CJK Ideographs
225 [ 0x4E00, 0x9FA5, \&han_charname, undef ],
226# Hangul Syllables
227 [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
228# Non-Private Use High Surrogates
229 [ 0xD800, 0xDB7F, undef, undef ],
230# Private Use High Surrogates
231 [ 0xDB80, 0xDBFF, undef, undef ],
232# Low Surrogates
233 [ 0xDC00, 0xDFFF, undef, undef ],
234# The Private Use Area
235 [ 0xE000, 0xF8FF, undef, undef ],
236# CJK Ideographs Extension B
237 [ 0x20000, 0x2A6D6, \&han_charname, undef ],
238# Plane 15 Private Use Area
239 [ 0xF0000, 0xFFFFD, undef, undef ],
240# Plane 16 Private Use Area
241 [ 0x100000, 0x10FFFD, undef, undef ],
242);
243
561c79ed 244sub charinfo {
10a6ecd2
JH
245 my $arg = shift;
246 my $code = _getcode($arg);
247 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
248 unless defined $code;
e63dbbf9 249 my $hexk = sprintf("%06X", $code);
a6fa416b
ST
250 my($rcode,$rname,$rdec);
251 foreach my $range (@CharinfoRanges){
74f8133e 252 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 253 $rcode = $hexk;
e63dbbf9
JH
254 $rcode =~ s/^0+//;
255 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
ST
256 $rname = $range->[2] ? $range->[2]->($code) : '';
257 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 258 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
ST
259 last;
260 }
261 }
74f8133e 262 openunicode(\$UNICODEFH, "Unicode.txt");
10a6ecd2 263 if (defined $UNICODEFH) {
e63dbbf9
JH
264 use Search::Dict 1.02;
265 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 266 my $line = <$UNICODEFH>;
561c79ed
JH
267 chomp $line;
268 my %prop;
269 @prop{qw(
270 code name category
271 combining bidi decomposition
272 decimal digit numeric
273 mirrored unicode10 comment
274 upper lower title
275 )} = split(/;/, $line, -1);
e63dbbf9
JH
276 $hexk =~ s/^0+//;
277 $hexk = sprintf("%04X", hex($hexk));
561c79ed 278 if ($prop{code} eq $hexk) {
a196fbfd
JH
279 $prop{block} = charblock($code);
280 $prop{script} = charscript($code);
a6fa416b
ST
281 if(defined $rname){
282 $prop{code} = $rcode;
283 $prop{name} = $rname;
284 $prop{decomposition} = $rdec;
285 }
b08cd201 286 return \%prop;
561c79ed
JH
287 }
288 }
289 }
290 return;
291}
292
e882dd67
JH
293sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
294 my ($table, $lo, $hi, $code) = @_;
295
296 return if $lo > $hi;
297
298 my $mid = int(($lo+$hi) / 2);
299
300 if ($table->[$mid]->[0] < $code) {
10a6ecd2 301 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
302 return $table->[$mid]->[2];
303 } else {
304 _search($table, $mid + 1, $hi, $code);
305 }
306 } elsif ($table->[$mid]->[0] > $code) {
307 _search($table, $lo, $mid - 1, $code);
308 } else {
309 return $table->[$mid]->[2];
310 }
311}
312
10a6ecd2
JH
313sub charinrange {
314 my ($range, $arg) = @_;
315 my $code = _getcode($arg);
316 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
317 unless defined $code;
318 _search($range, 0, $#$range, $code);
319}
320
354a27bf 321=head2 charblock
561c79ed 322
55d7b906 323 use Unicode::UCD 'charblock';
561c79ed
JH
324
325 my $charblock = charblock(0x41);
10a6ecd2
JH
326 my $charblock = charblock(1234);
327 my $charblock = charblock("0x263a");
328 my $charblock = charblock("U+263a");
329
78bf21c2 330 my $range = charblock('Armenian');
10a6ecd2 331
78bf21c2 332With a B<code point argument> charblock() returns the I<block> the character
10a6ecd2 333belongs to, e.g. C<Basic Latin>. Note that not all the character
b08cd201 334positions within all blocks are defined.
10a6ecd2 335
78bf21c2
JH
336See also L</Blocks versus Scripts>.
337
10a6ecd2
JH
338If supplied with an argument that can't be a code point, charblock()
339tries to do the opposite and interpret the argument as a character
340block. The return value is a I<range>: an anonymous list that
341contains anonymous lists, which in turn contain I<start-of-range>,
342I<end-of-range> code point pairs. You can test whether a code point
343is in a range using the L</charinrange> function. If the argument is
344not a known charater block, C<undef> is returned.
561c79ed 345
561c79ed
JH
346=cut
347
348my @BLOCKS;
10a6ecd2 349my %BLOCKS;
561c79ed 350
10a6ecd2 351sub _charblocks {
561c79ed 352 unless (@BLOCKS) {
10a6ecd2
JH
353 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
354 while (<$BLOCKSFH>) {
2796c109 355 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
356 my ($lo, $hi) = (hex($1), hex($2));
357 my $subrange = [ $lo, $hi, $3 ];
358 push @BLOCKS, $subrange;
359 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
360 }
361 }
10a6ecd2 362 close($BLOCKSFH);
561c79ed
JH
363 }
364 }
10a6ecd2
JH
365}
366
367sub charblock {
368 my $arg = shift;
369
370 _charblocks() unless @BLOCKS;
371
372 my $code = _getcode($arg);
561c79ed 373
10a6ecd2
JH
374 if (defined $code) {
375 _search(\@BLOCKS, 0, $#BLOCKS, $code);
376 } else {
377 if (exists $BLOCKS{$arg}) {
378 return $BLOCKS{$arg};
379 } else {
380 return;
381 }
382 }
e882dd67
JH
383}
384
385=head2 charscript
386
55d7b906 387 use Unicode::UCD 'charscript';
e882dd67
JH
388
389 my $charscript = charscript(0x41);
10a6ecd2
JH
390 my $charscript = charscript(1234);
391 my $charscript = charscript("U+263a");
e882dd67 392
78bf21c2 393 my $range = charscript('Thai');
10a6ecd2 394
78bf21c2 395With a B<code point argument> charscript() returns the I<script> the
b08cd201 396character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
10a6ecd2 397
78bf21c2
JH
398See also L</Blocks versus Scripts>.
399
10a6ecd2
JH
400If supplied with an argument that can't be a code point, charscript()
401tries to do the opposite and interpret the argument as a character
402script. The return value is a I<range>: an anonymous list that
403contains anonymous lists, which in turn contain I<start-of-range>,
404I<end-of-range> code point pairs. You can test whether a code point
405is in a range using the L</charinrange> function. If the argument is
406not a known charater script, C<undef> is returned.
e882dd67 407
e882dd67
JH
408=cut
409
410my @SCRIPTS;
10a6ecd2 411my %SCRIPTS;
e882dd67 412
10a6ecd2 413sub _charscripts {
e882dd67 414 unless (@SCRIPTS) {
10a6ecd2
JH
415 if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
416 while (<$SCRIPTSFH>) {
e882dd67 417 if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
10a6ecd2
JH
418 my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
419 my $script = lc($3);
420 $script =~ s/\b(\w)/uc($1)/ge;
421 my $subrange = [ $lo, $hi, $script ];
422 push @SCRIPTS, $subrange;
423 push @{$SCRIPTS{$script}}, $subrange;
e882dd67
JH
424 }
425 }
10a6ecd2 426 close($SCRIPTSFH);
e882dd67
JH
427 @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
428 }
429 }
10a6ecd2
JH
430}
431
432sub charscript {
433 my $arg = shift;
434
435 _charscripts() unless @SCRIPTS;
e882dd67 436
10a6ecd2
JH
437 my $code = _getcode($arg);
438
439 if (defined $code) {
440 _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
441 } else {
442 if (exists $SCRIPTS{$arg}) {
443 return $SCRIPTS{$arg};
444 } else {
445 return;
446 }
447 }
448}
449
450=head2 charblocks
451
55d7b906 452 use Unicode::UCD 'charblocks';
10a6ecd2 453
b08cd201 454 my $charblocks = charblocks();
10a6ecd2 455
b08cd201
JH
456charblocks() returns a reference to a hash with the known block names
457as the keys, and the code point ranges (see L</charblock>) as the values.
10a6ecd2 458
78bf21c2
JH
459See also L</Blocks versus Scripts>.
460
10a6ecd2
JH
461=cut
462
463sub charblocks {
b08cd201
JH
464 _charblocks() unless %BLOCKS;
465 return \%BLOCKS;
10a6ecd2
JH
466}
467
468=head2 charscripts
469
55d7b906 470 use Unicode::UCD 'charscripts';
10a6ecd2
JH
471
472 my %charscripts = charscripts();
473
474charscripts() returns a hash with the known script names as the keys,
475and the code point ranges (see L</charscript>) as the values.
476
78bf21c2
JH
477See also L</Blocks versus Scripts>.
478
10a6ecd2
JH
479=cut
480
481sub charscripts {
b08cd201
JH
482 _charscripts() unless %SCRIPTS;
483 return \%SCRIPTS;
561c79ed
JH
484}
485
10a6ecd2 486=head2 Blocks versus Scripts
ad9cab37 487
10a6ecd2
JH
488The difference between a block and a script is that scripts are closer
489to the linguistic notion of a set of characters required to present
490languages, while block is more of an artifact of the Unicode character
491numbering and separation into blocks of 256 characters.
3aa957f9
JH
492
493For example the Latin B<script> is spread over several B<blocks>, such
494as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
495C<Latin Extended-B>. On the other hand, the Latin script does not
496contain all the characters of the C<Basic Latin> block (also known as
497the ASCII): it includes only the letters, not for example the digits
498or the punctuation.
ad9cab37 499
3aa957f9 500For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
ad9cab37
JH
501
502For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
503
3aa957f9
JH
504=head2 Matching Scripts and Blocks
505
506Both scripts and blocks can be matched using the regular expression
507construct C<\p{In...}> and its negation C<\P{In...}>.
508
509The name of the script or the block comes after the C<In>, for example
510C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
10a6ecd2
JH
511removed from the names for the C<\p{In...}>, for example
512C<LatinExtendedA> instead of C<Latin Extended-A>.
513
78bf21c2
JH
514There are a few cases where there is both a script and a block by the
515same name, in these cases the block version has C<Block> appended to
516its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
517the block.
10a6ecd2 518
b08cd201
JH
519=head2 Code Point Arguments
520
78bf21c2
JH
521A <code point argument> is either a decimal or a hexadecimal scalar
522designating a Unicode character, or "U+" followed by hexadecimals
523designating a Unicode character. Note that Unicode is B<not> limited
524to 16 bits (the number of Unicode characters is open-ended, in theory
525unlimited): you may have more than 4 hexdigits.
b08cd201 526
10a6ecd2
JH
527=head2 charinrange
528
529In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
530can also test whether a code point is in the I<range> as returned by
531L</charblock> and L</charscript> or as the values of the hash returned
e618509d 532by L</charblocks> and L</charscripts> by using charinrange():
10a6ecd2 533
55d7b906 534 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
535
536 $range = charscript('Hiragana');
e145285f 537 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
538
539=cut
540
b08cd201
JH
541=head2 compexcl
542
55d7b906 543 use Unicode::UCD 'compexcl';
b08cd201
JH
544
545 my $compexcl = compexcl("09dc");
546
547The compexcl() returns the composition exclusion (that is, if the
9046a8ae
SC
548character should not be produced during a precomposition) of the
549character specified by a B<code point argument>.
b08cd201
JH
550
551If there is a composition exclusion for the character, true is
552returned. Otherwise, false is returned.
553
554=cut
555
556my %COMPEXCL;
557
558sub _compexcl {
559 unless (%COMPEXCL) {
560 if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
561 while (<$COMPEXCLFH>) {
562 if (/^([0-9A-F]+) \# /) {
563 my $code = hex($1);
564 $COMPEXCL{$code} = undef;
565 }
566 }
567 close($COMPEXCLFH);
568 }
569 }
570}
571
572sub compexcl {
573 my $arg = shift;
574 my $code = _getcode($arg);
74f8133e
JH
575 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
576 unless defined $code;
b08cd201
JH
577
578 _compexcl() unless %COMPEXCL;
579
580 return exists $COMPEXCL{$code};
581}
582
583=head2 casefold
584
55d7b906 585 use Unicode::UCD 'casefold';
b08cd201
JH
586
587 my %casefold = casefold("09dc");
588
589The casefold() returns the locale-independent case folding of the
590character specified by a B<code point argument>.
591
592If there is a case folding for that character, a reference to a hash
593with the following fields is returned:
594
595 key
596
597 code code point with at least four hexdigits
598 status "C", "F", "S", or "I"
599 mapping one or more codes separated by spaces
600
601The meaning of the I<status> is as follows:
602
603 C common case folding, common mappings shared
604 by both simple and full mappings
605 F full case folding, mappings that cause strings
606 to grow in length. Multiple characters are separated
607 by spaces
608 S simple case folding, mappings to single characters
609 where different from F
610 I special case for dotted uppercase I and
611 dotless lowercase i
612 - If this mapping is included, the result is
613 case-insensitive, but dotless and dotted I's
614 are not distinguished
615 - If this mapping is excluded, the result is not
616 fully case-insensitive, but dotless and dotted
617 I's are distinguished
618
619If there is no case folding for that character, C<undef> is returned.
620
621For more information about case mappings see
622http://www.unicode.org/unicode/reports/tr21/
623
624=cut
625
626my %CASEFOLD;
627
628sub _casefold {
629 unless (%CASEFOLD) {
630 if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
631 while (<$CASEFOLDFH>) {
632 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
633 my $code = hex($1);
634 $CASEFOLD{$code} = { code => $1,
635 status => $2,
636 mapping => $3 };
637 }
638 }
639 close($CASEFOLDFH);
640 }
641 }
642}
643
644sub casefold {
645 my $arg = shift;
646 my $code = _getcode($arg);
74f8133e
JH
647 croak __PACKAGE__, "::casefold: unknown code '$arg'"
648 unless defined $code;
b08cd201
JH
649
650 _casefold() unless %CASEFOLD;
651
652 return $CASEFOLD{$code};
653}
654
655=head2 casespec
656
55d7b906 657 use Unicode::UCD 'casespec';
b08cd201
JH
658
659 my %casespec = casespec("09dc");
660
661The casespec() returns the potentially locale-dependent case mapping
662of the character specified by a B<code point argument>. The mapping
663may change the length of the string (which the basic Unicode case
664mappings as returned by charinfo() never do).
665
666If there is a case folding for that character, a reference to a hash
667with the following fields is returned:
668
669 key
670
671 code code point with at least four hexdigits
672 lower lowercase
673 title titlecase
674 upper uppercase
675 condition condition list (may be undef)
676
677The C<condition> is optional. Where present, it consists of one or
678more I<locales> or I<contexts>, separated by spaces (other than as
679used to separate elements, spaces are to be ignored). A condition
680list overrides the normal behavior if all of the listed conditions are
681true. Case distinctions in the condition list are not significant.
682Conditions preceded by "NON_" represent the negation of the condition
683
684A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
e618509d
JH
685followed by a "_" and a 2-letter ISO language code (possibly followed
686by a "_" and a variant code). You can find the lists of those codes,
687see L<Locale::Country> and L<Locale::Language>.
b08cd201
JH
688
689A I<context> is one of the following choices:
690
691 FINAL The letter is not followed by a letter of
692 general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
693 MODERN The mapping is only used for modern text
e618509d 694 AFTER_i The last base character was "i" (U+0069)
b08cd201
JH
695
696For more information about case mappings see
697http://www.unicode.org/unicode/reports/tr21/
698
699=cut
700
701my %CASESPEC;
702
703sub _casespec {
704 unless (%CASESPEC) {
705 if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
706 while (<$CASESPECFH>) {
707 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
708 my $code = hex($1);
709 $CASESPEC{$code} = { code => $1,
710 lower => $2,
711 title => $3,
712 upper => $4,
713 condition => $5 };
714 }
715 }
716 close($CASESPECFH);
717 }
718 }
719}
720
721sub casespec {
722 my $arg = shift;
723 my $code = _getcode($arg);
74f8133e
JH
724 croak __PACKAGE__, "::casespec: unknown code '$arg'"
725 unless defined $code;
b08cd201
JH
726
727 _casespec() unless %CASESPEC;
728
729 return $CASESPEC{$code};
730}
731
55d7b906 732=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 733
55d7b906
JH
734Unicode::UCD::UnicodeVersion() returns the version of the Unicode
735Character Database, in other words, the version of the Unicode
78bf21c2
JH
736standard the database implements. The version is a string
737of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
738
739=cut
740
741my $UNICODEVERSION;
742
743sub UnicodeVersion {
744 unless (defined $UNICODEVERSION) {
745 openunicode(\$VERSIONFH, "version");
746 chomp($UNICODEVERSION = <$VERSIONFH>);
747 close($VERSIONFH);
748 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
749 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
750 }
751 return $UNICODEVERSION;
752}
3aa957f9
JH
753
754=head2 Implementation Note
32c16050 755
ad9cab37
JH
756The first use of charinfo() opens a read-only filehandle to the Unicode
757Character Database (the database is included in the Perl distribution).
78bf21c2
JH
758The filehandle is then kept open for further queries. In other words,
759if you are wondering where one of your filehandles went, that's where.
32c16050 760
561c79ed
JH
761=head1 AUTHOR
762
763Jarkko Hietaniemi
764
765=cut
766
7671;