Commit | Line | Data |
---|---|---|
55d7b906 | 1 | package Unicode::UCD; |
561c79ed JH |
2 | |
3 | use strict; | |
4 | use warnings; | |
5 | ||
ea508aee | 6 | our $VERSION = '0.25'; |
561c79ed | 7 | |
741297c1 JH |
8 | use Storable qw(dclone); |
9 | ||
561c79ed JH |
10 | require Exporter; |
11 | ||
12 | our @ISA = qw(Exporter); | |
74f8133e | 13 | |
10a6ecd2 JH |
14 | our @EXPORT_OK = qw(charinfo |
15 | charblock charscript | |
16 | charblocks charscripts | |
b08cd201 | 17 | charinrange |
ea508aee | 18 | general_categories bidi_types |
b08cd201 | 19 | compexcl |
a2bd7410 JH |
20 | casefold casespec |
21 | namedseq); | |
561c79ed JH |
22 | |
23 | use Carp; | |
24 | ||
25 | =head1 NAME | |
26 | ||
55d7b906 | 27 | Unicode::UCD - Unicode character database |
561c79ed JH |
28 | |
29 | =head1 SYNOPSIS | |
30 | ||
55d7b906 | 31 | use Unicode::UCD 'charinfo'; |
b08cd201 | 32 | my $charinfo = charinfo($codepoint); |
561c79ed | 33 | |
55d7b906 | 34 | use Unicode::UCD 'charblock'; |
e882dd67 JH |
35 | my $charblock = charblock($codepoint); |
36 | ||
55d7b906 | 37 | use Unicode::UCD 'charscript'; |
65044554 | 38 | my $charscript = charscript($codepoint); |
561c79ed | 39 | |
55d7b906 | 40 | use Unicode::UCD 'charblocks'; |
e145285f JH |
41 | my $charblocks = charblocks(); |
42 | ||
55d7b906 | 43 | use Unicode::UCD 'charscripts'; |
ea508aee | 44 | my $charscripts = charscripts(); |
e145285f | 45 | |
55d7b906 | 46 | use Unicode::UCD qw(charscript charinrange); |
e145285f JH |
47 | my $range = charscript($script); |
48 | print "looks like $script\n" if charinrange($range, $codepoint); | |
49 | ||
ea508aee JH |
50 | use Unicode::UCD qw(general_categories bidi_types); |
51 | my $categories = general_categories(); | |
52 | my $types = bidi_types(); | |
53 | ||
55d7b906 | 54 | use Unicode::UCD 'compexcl'; |
e145285f JH |
55 | my $compexcl = compexcl($codepoint); |
56 | ||
a2bd7410 JH |
57 | use Unicode::UCD 'namedseq'; |
58 | my $namedseq = namedseq($named_sequence_name); | |
59 | ||
55d7b906 | 60 | my $unicode_version = Unicode::UCD::UnicodeVersion(); |
e145285f | 61 | |
561c79ed JH |
62 | =head1 DESCRIPTION |
63 | ||
8b731da2 JH |
64 | The Unicode::UCD module offers a simple interface to the Unicode |
65 | Character Database. | |
561c79ed JH |
66 | |
67 | =cut | |
68 | ||
10a6ecd2 JH |
69 | my $UNICODEFH; |
70 | my $BLOCKSFH; | |
71 | my $SCRIPTSFH; | |
72 | my $VERSIONFH; | |
b08cd201 JH |
73 | my $COMPEXCLFH; |
74 | my $CASEFOLDFH; | |
75 | my $CASESPECFH; | |
a2bd7410 | 76 | my $NAMEDSEQFH; |
561c79ed JH |
77 | |
78 | sub openunicode { | |
79 | my ($rfh, @path) = @_; | |
80 | my $f; | |
81 | unless (defined $$rfh) { | |
82 | for my $d (@INC) { | |
83 | use File::Spec; | |
55d7b906 | 84 | $f = File::Spec->catfile($d, "unicore", @path); |
32c16050 | 85 | last if open($$rfh, $f); |
e882dd67 | 86 | undef $f; |
561c79ed | 87 | } |
e882dd67 JH |
88 | croak __PACKAGE__, ": failed to find ", |
89 | File::Spec->catfile(@path), " in @INC" | |
90 | unless defined $f; | |
561c79ed JH |
91 | } |
92 | return $f; | |
93 | } | |
94 | ||
95 | =head2 charinfo | |
96 | ||
55d7b906 | 97 | use Unicode::UCD 'charinfo'; |
561c79ed | 98 | |
b08cd201 | 99 | my $charinfo = charinfo(0x41); |
561c79ed | 100 | |
b08cd201 JH |
101 | charinfo() returns a reference to a hash that has the following fields |
102 | as defined by the Unicode standard: | |
561c79ed JH |
103 | |
104 | key | |
105 | ||
106 | code code point with at least four hexdigits | |
107 | name name of the character IN UPPER CASE | |
108 | category general category of the character | |
109 | combining classes used in the Canonical Ordering Algorithm | |
ea508aee | 110 | bidi bidirectional type |
561c79ed JH |
111 | decomposition character decomposition mapping |
112 | decimal if decimal digit this is the integer numeric value | |
113 | digit if digit this is the numeric value | |
114 | numeric if numeric is the integer or rational numeric value | |
115 | mirrored if mirrored in bidirectional text | |
116 | unicode10 Unicode 1.0 name if existed and different | |
117 | comment ISO 10646 comment field | |
118 | upper uppercase equivalent mapping | |
119 | lower lowercase equivalent mapping | |
120 | title titlecase equivalent mapping | |
e882dd67 | 121 | |
561c79ed | 122 | block block the character belongs to (used in \p{In...}) |
eb0cc9e3 | 123 | script script the character belongs to |
561c79ed | 124 | |
b08cd201 | 125 | If no match is found, a reference to an empty hash is returned. |
561c79ed | 126 | |
d1be9408 | 127 | The C<block> property is the same as returned by charinfo(). It is |
32c16050 | 128 | not defined in the Unicode Character Database proper (Chapter 4 of the |
78bf21c2 JH |
129 | Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database |
130 | (Chapter 14 of TUS3). Similarly for the C<script> property. | |
32c16050 JH |
131 | |
132 | Note that you cannot do (de)composition and casing based solely on the | |
133 | above C<decomposition> and C<lower>, C<upper>, C<title>, properties, | |
b08cd201 | 134 | you will need also the compexcl(), casefold(), and casespec() functions. |
561c79ed JH |
135 | |
136 | =cut | |
137 | ||
0616d9cf | 138 | # NB: This function is duplicated in charnames.pm |
10a6ecd2 JH |
139 | sub _getcode { |
140 | my $arg = shift; | |
141 | ||
dc0a4417 | 142 | if ($arg =~ /^[1-9]\d*$/) { |
10a6ecd2 | 143 | return $arg; |
dc0a4417 | 144 | } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) { |
10a6ecd2 JH |
145 | return hex($1); |
146 | } | |
147 | ||
148 | return; | |
149 | } | |
150 | ||
ac5ea531 JH |
151 | # Lingua::KO::Hangul::Util not part of the standard distribution |
152 | # but it will be used if available. | |
153 | ||
154 | eval { require Lingua::KO::Hangul::Util }; | |
155 | my $hasHangulUtil = ! $@; | |
156 | if ($hasHangulUtil) { | |
157 | Lingua::KO::Hangul::Util->import(); | |
158 | } | |
9087a70b TS |
159 | |
160 | sub hangul_decomp { # internal: called from charinfo | |
ac5ea531 JH |
161 | if ($hasHangulUtil) { |
162 | my @tmp = decomposeHangul(shift); | |
163 | return sprintf("%04X %04X", @tmp) if @tmp == 2; | |
164 | return sprintf("%04X %04X %04X", @tmp) if @tmp == 3; | |
165 | } | |
166 | return; | |
167 | } | |
168 | ||
169 | sub hangul_charname { # internal: called from charinfo | |
170 | return sprintf("HANGUL SYLLABLE-%04X", shift); | |
a6fa416b TS |
171 | } |
172 | ||
9087a70b TS |
173 | sub han_charname { # internal: called from charinfo |
174 | return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift); | |
a6fa416b TS |
175 | } |
176 | ||
177 | my @CharinfoRanges = ( | |
178 | # block name | |
179 | # [ first, last, coderef to name, coderef to decompose ], | |
180 | # CJK Ideographs Extension A | |
181 | [ 0x3400, 0x4DB5, \&han_charname, undef ], | |
182 | # CJK Ideographs | |
183 | [ 0x4E00, 0x9FA5, \&han_charname, undef ], | |
184 | # Hangul Syllables | |
ac5ea531 | 185 | [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ], |
a6fa416b TS |
186 | # Non-Private Use High Surrogates |
187 | [ 0xD800, 0xDB7F, undef, undef ], | |
188 | # Private Use High Surrogates | |
189 | [ 0xDB80, 0xDBFF, undef, undef ], | |
190 | # Low Surrogates | |
191 | [ 0xDC00, 0xDFFF, undef, undef ], | |
192 | # The Private Use Area | |
193 | [ 0xE000, 0xF8FF, undef, undef ], | |
194 | # CJK Ideographs Extension B | |
195 | [ 0x20000, 0x2A6D6, \&han_charname, undef ], | |
196 | # Plane 15 Private Use Area | |
197 | [ 0xF0000, 0xFFFFD, undef, undef ], | |
198 | # Plane 16 Private Use Area | |
199 | [ 0x100000, 0x10FFFD, undef, undef ], | |
200 | ); | |
201 | ||
561c79ed | 202 | sub charinfo { |
10a6ecd2 JH |
203 | my $arg = shift; |
204 | my $code = _getcode($arg); | |
205 | croak __PACKAGE__, "::charinfo: unknown code '$arg'" | |
206 | unless defined $code; | |
e63dbbf9 | 207 | my $hexk = sprintf("%06X", $code); |
a6fa416b TS |
208 | my($rcode,$rname,$rdec); |
209 | foreach my $range (@CharinfoRanges){ | |
74f8133e | 210 | if ($range->[0] <= $code && $code <= $range->[1]) { |
a6fa416b | 211 | $rcode = $hexk; |
e63dbbf9 JH |
212 | $rcode =~ s/^0+//; |
213 | $rcode = sprintf("%04X", hex($rcode)); | |
a6fa416b TS |
214 | $rname = $range->[2] ? $range->[2]->($code) : ''; |
215 | $rdec = $range->[3] ? $range->[3]->($code) : ''; | |
e63dbbf9 | 216 | $hexk = sprintf("%06X", $range->[0]); # replace by the first |
a6fa416b TS |
217 | last; |
218 | } | |
219 | } | |
551b6b6f | 220 | openunicode(\$UNICODEFH, "UnicodeData.txt"); |
10a6ecd2 | 221 | if (defined $UNICODEFH) { |
e63dbbf9 JH |
222 | use Search::Dict 1.02; |
223 | if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) { | |
10a6ecd2 | 224 | my $line = <$UNICODEFH>; |
c5a29f40 | 225 | return unless defined $line; |
561c79ed JH |
226 | chomp $line; |
227 | my %prop; | |
228 | @prop{qw( | |
229 | code name category | |
230 | combining bidi decomposition | |
231 | decimal digit numeric | |
232 | mirrored unicode10 comment | |
233 | upper lower title | |
234 | )} = split(/;/, $line, -1); | |
e63dbbf9 JH |
235 | $hexk =~ s/^0+//; |
236 | $hexk = sprintf("%04X", hex($hexk)); | |
561c79ed | 237 | if ($prop{code} eq $hexk) { |
a196fbfd JH |
238 | $prop{block} = charblock($code); |
239 | $prop{script} = charscript($code); | |
a6fa416b TS |
240 | if(defined $rname){ |
241 | $prop{code} = $rcode; | |
242 | $prop{name} = $rname; | |
243 | $prop{decomposition} = $rdec; | |
244 | } | |
b08cd201 | 245 | return \%prop; |
561c79ed JH |
246 | } |
247 | } | |
248 | } | |
249 | return; | |
250 | } | |
251 | ||
e882dd67 JH |
252 | sub _search { # Binary search in a [[lo,hi,prop],[...],...] table. |
253 | my ($table, $lo, $hi, $code) = @_; | |
254 | ||
255 | return if $lo > $hi; | |
256 | ||
257 | my $mid = int(($lo+$hi) / 2); | |
258 | ||
259 | if ($table->[$mid]->[0] < $code) { | |
10a6ecd2 | 260 | if ($table->[$mid]->[1] >= $code) { |
e882dd67 JH |
261 | return $table->[$mid]->[2]; |
262 | } else { | |
263 | _search($table, $mid + 1, $hi, $code); | |
264 | } | |
265 | } elsif ($table->[$mid]->[0] > $code) { | |
266 | _search($table, $lo, $mid - 1, $code); | |
267 | } else { | |
268 | return $table->[$mid]->[2]; | |
269 | } | |
270 | } | |
271 | ||
10a6ecd2 JH |
272 | sub charinrange { |
273 | my ($range, $arg) = @_; | |
274 | my $code = _getcode($arg); | |
275 | croak __PACKAGE__, "::charinrange: unknown code '$arg'" | |
276 | unless defined $code; | |
277 | _search($range, 0, $#$range, $code); | |
278 | } | |
279 | ||
354a27bf | 280 | =head2 charblock |
561c79ed | 281 | |
55d7b906 | 282 | use Unicode::UCD 'charblock'; |
561c79ed JH |
283 | |
284 | my $charblock = charblock(0x41); | |
10a6ecd2 JH |
285 | my $charblock = charblock(1234); |
286 | my $charblock = charblock("0x263a"); | |
287 | my $charblock = charblock("U+263a"); | |
288 | ||
78bf21c2 | 289 | my $range = charblock('Armenian'); |
10a6ecd2 | 290 | |
78bf21c2 | 291 | With a B<code point argument> charblock() returns the I<block> the character |
10a6ecd2 | 292 | belongs to, e.g. C<Basic Latin>. Note that not all the character |
b08cd201 | 293 | positions within all blocks are defined. |
10a6ecd2 | 294 | |
78bf21c2 JH |
295 | See also L</Blocks versus Scripts>. |
296 | ||
eb0cc9e3 JH |
297 | If supplied with an argument that can't be a code point, charblock() tries |
298 | to do the opposite and interpret the argument as a character block. The | |
299 | return value is a I<range>: an anonymous list of lists that contain | |
a2bd7410 JH |
300 | I<start-of-range>, I<end-of-range> code point pairs. You can test whether |
301 | a code point is in a range using the L</charinrange> function. If the | |
3c4b39be | 302 | argument is not a known character block, C<undef> is returned. |
561c79ed | 303 | |
561c79ed JH |
304 | =cut |
305 | ||
306 | my @BLOCKS; | |
10a6ecd2 | 307 | my %BLOCKS; |
561c79ed | 308 | |
10a6ecd2 | 309 | sub _charblocks { |
561c79ed | 310 | unless (@BLOCKS) { |
10a6ecd2 | 311 | if (openunicode(\$BLOCKSFH, "Blocks.txt")) { |
6c8d78fb | 312 | local $_; |
10a6ecd2 | 313 | while (<$BLOCKSFH>) { |
2796c109 | 314 | if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) { |
10a6ecd2 JH |
315 | my ($lo, $hi) = (hex($1), hex($2)); |
316 | my $subrange = [ $lo, $hi, $3 ]; | |
317 | push @BLOCKS, $subrange; | |
318 | push @{$BLOCKS{$3}}, $subrange; | |
561c79ed JH |
319 | } |
320 | } | |
10a6ecd2 | 321 | close($BLOCKSFH); |
561c79ed JH |
322 | } |
323 | } | |
10a6ecd2 JH |
324 | } |
325 | ||
326 | sub charblock { | |
327 | my $arg = shift; | |
328 | ||
329 | _charblocks() unless @BLOCKS; | |
330 | ||
331 | my $code = _getcode($arg); | |
561c79ed | 332 | |
10a6ecd2 JH |
333 | if (defined $code) { |
334 | _search(\@BLOCKS, 0, $#BLOCKS, $code); | |
335 | } else { | |
336 | if (exists $BLOCKS{$arg}) { | |
741297c1 | 337 | return dclone $BLOCKS{$arg}; |
10a6ecd2 JH |
338 | } else { |
339 | return; | |
340 | } | |
341 | } | |
e882dd67 JH |
342 | } |
343 | ||
344 | =head2 charscript | |
345 | ||
55d7b906 | 346 | use Unicode::UCD 'charscript'; |
e882dd67 JH |
347 | |
348 | my $charscript = charscript(0x41); | |
10a6ecd2 JH |
349 | my $charscript = charscript(1234); |
350 | my $charscript = charscript("U+263a"); | |
e882dd67 | 351 | |
78bf21c2 | 352 | my $range = charscript('Thai'); |
10a6ecd2 | 353 | |
78bf21c2 | 354 | With a B<code point argument> charscript() returns the I<script> the |
b08cd201 | 355 | character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. |
10a6ecd2 | 356 | |
78bf21c2 JH |
357 | See also L</Blocks versus Scripts>. |
358 | ||
eb0cc9e3 JH |
359 | If supplied with an argument that can't be a code point, charscript() tries |
360 | to do the opposite and interpret the argument as a character script. The | |
361 | return value is a I<range>: an anonymous list of lists that contain | |
362 | I<start-of-range>, I<end-of-range> code point pairs. You can test whether a | |
363 | code point is in a range using the L</charinrange> function. If the | |
3c4b39be | 364 | argument is not a known character script, C<undef> is returned. |
e882dd67 | 365 | |
e882dd67 JH |
366 | =cut |
367 | ||
368 | my @SCRIPTS; | |
10a6ecd2 | 369 | my %SCRIPTS; |
e882dd67 | 370 | |
10a6ecd2 | 371 | sub _charscripts { |
e882dd67 | 372 | unless (@SCRIPTS) { |
10a6ecd2 | 373 | if (openunicode(\$SCRIPTSFH, "Scripts.txt")) { |
6c8d78fb | 374 | local $_; |
10a6ecd2 | 375 | while (<$SCRIPTSFH>) { |
e882dd67 | 376 | if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) { |
10a6ecd2 JH |
377 | my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1)); |
378 | my $script = lc($3); | |
379 | $script =~ s/\b(\w)/uc($1)/ge; | |
380 | my $subrange = [ $lo, $hi, $script ]; | |
381 | push @SCRIPTS, $subrange; | |
382 | push @{$SCRIPTS{$script}}, $subrange; | |
e882dd67 JH |
383 | } |
384 | } | |
10a6ecd2 | 385 | close($SCRIPTSFH); |
e882dd67 JH |
386 | @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS; |
387 | } | |
388 | } | |
10a6ecd2 JH |
389 | } |
390 | ||
391 | sub charscript { | |
392 | my $arg = shift; | |
393 | ||
394 | _charscripts() unless @SCRIPTS; | |
e882dd67 | 395 | |
10a6ecd2 JH |
396 | my $code = _getcode($arg); |
397 | ||
398 | if (defined $code) { | |
399 | _search(\@SCRIPTS, 0, $#SCRIPTS, $code); | |
400 | } else { | |
401 | if (exists $SCRIPTS{$arg}) { | |
741297c1 | 402 | return dclone $SCRIPTS{$arg}; |
10a6ecd2 JH |
403 | } else { |
404 | return; | |
405 | } | |
406 | } | |
407 | } | |
408 | ||
409 | =head2 charblocks | |
410 | ||
55d7b906 | 411 | use Unicode::UCD 'charblocks'; |
10a6ecd2 | 412 | |
b08cd201 | 413 | my $charblocks = charblocks(); |
10a6ecd2 | 414 | |
b08cd201 JH |
415 | charblocks() returns a reference to a hash with the known block names |
416 | as the keys, and the code point ranges (see L</charblock>) as the values. | |
10a6ecd2 | 417 | |
78bf21c2 JH |
418 | See also L</Blocks versus Scripts>. |
419 | ||
10a6ecd2 JH |
420 | =cut |
421 | ||
422 | sub charblocks { | |
b08cd201 | 423 | _charblocks() unless %BLOCKS; |
741297c1 | 424 | return dclone \%BLOCKS; |
10a6ecd2 JH |
425 | } |
426 | ||
427 | =head2 charscripts | |
428 | ||
55d7b906 | 429 | use Unicode::UCD 'charscripts'; |
10a6ecd2 | 430 | |
ea508aee | 431 | my $charscripts = charscripts(); |
10a6ecd2 | 432 | |
ea508aee JH |
433 | charscripts() returns a reference to a hash with the known script |
434 | names as the keys, and the code point ranges (see L</charscript>) as | |
435 | the values. | |
10a6ecd2 | 436 | |
78bf21c2 JH |
437 | See also L</Blocks versus Scripts>. |
438 | ||
10a6ecd2 JH |
439 | =cut |
440 | ||
441 | sub charscripts { | |
b08cd201 | 442 | _charscripts() unless %SCRIPTS; |
741297c1 | 443 | return dclone \%SCRIPTS; |
561c79ed JH |
444 | } |
445 | ||
10a6ecd2 | 446 | =head2 Blocks versus Scripts |
ad9cab37 | 447 | |
10a6ecd2 JH |
448 | The difference between a block and a script is that scripts are closer |
449 | to the linguistic notion of a set of characters required to present | |
450 | languages, while block is more of an artifact of the Unicode character | |
eb0cc9e3 | 451 | numbering and separation into blocks of (mostly) 256 characters. |
3aa957f9 JH |
452 | |
453 | For example the Latin B<script> is spread over several B<blocks>, such | |
454 | as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and | |
455 | C<Latin Extended-B>. On the other hand, the Latin script does not | |
456 | contain all the characters of the C<Basic Latin> block (also known as | |
eb0cc9e3 | 457 | the ASCII): it includes only the letters, and not, for example, the digits |
3aa957f9 | 458 | or the punctuation. |
ad9cab37 | 459 | |
3aa957f9 | 460 | For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt |
ad9cab37 JH |
461 | |
462 | For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/ | |
463 | ||
3aa957f9 JH |
464 | =head2 Matching Scripts and Blocks |
465 | ||
eb0cc9e3 JH |
466 | Scripts are matched with the regular-expression construct |
467 | C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script), | |
468 | while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches | |
469 | any of the 256 code points in the Tibetan block). | |
10a6ecd2 | 470 | |
b08cd201 JH |
471 | =head2 Code Point Arguments |
472 | ||
92e830a9 JH |
473 | A I<code point argument> is either a decimal or a hexadecimal scalar |
474 | designating a Unicode character, or C<U+> followed by hexadecimals | |
dc0a4417 JH |
475 | designating a Unicode character. In other words, if you want a code |
476 | point to be interpreted as a hexadecimal number, you must prefix it | |
43adb1d9 | 477 | with either C<0x> or C<U+>, because a string like e.g. C<123> will |
dc0a4417 JH |
478 | be interpreted as a decimal code point. Also note that Unicode is |
479 | B<not> limited to 16 bits (the number of Unicode characters is | |
480 | open-ended, in theory unlimited): you may have more than 4 hexdigits. | |
b08cd201 | 481 | |
10a6ecd2 JH |
482 | =head2 charinrange |
483 | ||
484 | In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you | |
485 | can also test whether a code point is in the I<range> as returned by | |
486 | L</charblock> and L</charscript> or as the values of the hash returned | |
e618509d | 487 | by L</charblocks> and L</charscripts> by using charinrange(): |
10a6ecd2 | 488 | |
55d7b906 | 489 | use Unicode::UCD qw(charscript charinrange); |
10a6ecd2 JH |
490 | |
491 | $range = charscript('Hiragana'); | |
e145285f | 492 | print "looks like hiragana\n" if charinrange($range, $codepoint); |
10a6ecd2 JH |
493 | |
494 | =cut | |
495 | ||
ea508aee JH |
496 | my %GENERAL_CATEGORIES = |
497 | ( | |
498 | 'L' => 'Letter', | |
499 | 'LC' => 'CasedLetter', | |
500 | 'Lu' => 'UppercaseLetter', | |
501 | 'Ll' => 'LowercaseLetter', | |
502 | 'Lt' => 'TitlecaseLetter', | |
503 | 'Lm' => 'ModifierLetter', | |
504 | 'Lo' => 'OtherLetter', | |
505 | 'M' => 'Mark', | |
506 | 'Mn' => 'NonspacingMark', | |
507 | 'Mc' => 'SpacingMark', | |
508 | 'Me' => 'EnclosingMark', | |
509 | 'N' => 'Number', | |
510 | 'Nd' => 'DecimalNumber', | |
511 | 'Nl' => 'LetterNumber', | |
512 | 'No' => 'OtherNumber', | |
513 | 'P' => 'Punctuation', | |
514 | 'Pc' => 'ConnectorPunctuation', | |
515 | 'Pd' => 'DashPunctuation', | |
516 | 'Ps' => 'OpenPunctuation', | |
517 | 'Pe' => 'ClosePunctuation', | |
518 | 'Pi' => 'InitialPunctuation', | |
519 | 'Pf' => 'FinalPunctuation', | |
520 | 'Po' => 'OtherPunctuation', | |
521 | 'S' => 'Symbol', | |
522 | 'Sm' => 'MathSymbol', | |
523 | 'Sc' => 'CurrencySymbol', | |
524 | 'Sk' => 'ModifierSymbol', | |
525 | 'So' => 'OtherSymbol', | |
526 | 'Z' => 'Separator', | |
527 | 'Zs' => 'SpaceSeparator', | |
528 | 'Zl' => 'LineSeparator', | |
529 | 'Zp' => 'ParagraphSeparator', | |
530 | 'C' => 'Other', | |
531 | 'Cc' => 'Control', | |
532 | 'Cf' => 'Format', | |
533 | 'Cs' => 'Surrogate', | |
534 | 'Co' => 'PrivateUse', | |
535 | 'Cn' => 'Unassigned', | |
536 | ); | |
537 | ||
538 | sub general_categories { | |
539 | return dclone \%GENERAL_CATEGORIES; | |
540 | } | |
541 | ||
542 | =head2 general_categories | |
543 | ||
544 | use Unicode::UCD 'general_categories'; | |
545 | ||
546 | my $categories = general_categories(); | |
547 | ||
548 | The general_categories() returns a reference to a hash which has short | |
549 | general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long | |
550 | names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>, | |
551 | C<Symbol>) as values. The hash is reversible in case you need to go | |
552 | from the long names to the short names. The general category is the | |
553 | one returned from charinfo() under the C<category> key. | |
554 | ||
555 | =cut | |
556 | ||
557 | my %BIDI_TYPES = | |
558 | ( | |
559 | 'L' => 'Left-to-Right', | |
560 | 'LRE' => 'Left-to-Right Embedding', | |
561 | 'LRO' => 'Left-to-Right Override', | |
562 | 'R' => 'Right-to-Left', | |
563 | 'AL' => 'Right-to-Left Arabic', | |
564 | 'RLE' => 'Right-to-Left Embedding', | |
565 | 'RLO' => 'Right-to-Left Override', | |
566 | 'PDF' => 'Pop Directional Format', | |
567 | 'EN' => 'European Number', | |
568 | 'ES' => 'European Number Separator', | |
569 | 'ET' => 'European Number Terminator', | |
570 | 'AN' => 'Arabic Number', | |
571 | 'CS' => 'Common Number Separator', | |
572 | 'NSM' => 'Non-Spacing Mark', | |
573 | 'BN' => 'Boundary Neutral', | |
574 | 'B' => 'Paragraph Separator', | |
575 | 'S' => 'Segment Separator', | |
576 | 'WS' => 'Whitespace', | |
577 | 'ON' => 'Other Neutrals', | |
578 | ); | |
579 | ||
580 | sub bidi_types { | |
581 | return dclone \%BIDI_TYPES; | |
582 | } | |
583 | ||
584 | =head2 bidi_types | |
585 | ||
586 | use Unicode::UCD 'bidi_types'; | |
587 | ||
588 | my $categories = bidi_types(); | |
589 | ||
590 | The bidi_types() returns a reference to a hash which has the short | |
591 | bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long | |
592 | names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The | |
593 | hash is reversible in case you need to go from the long names to the | |
594 | short names. The bidi type is the one returned from charinfo() | |
595 | under the C<bidi> key. For the exact meaning of the various bidi classes | |
596 | the Unicode TR9 is recommended reading: | |
597 | http://www.unicode.org/reports/tr9/tr9-17.html | |
598 | (as of Unicode 5.0.0) | |
599 | ||
600 | =cut | |
601 | ||
b08cd201 JH |
602 | =head2 compexcl |
603 | ||
55d7b906 | 604 | use Unicode::UCD 'compexcl'; |
b08cd201 JH |
605 | |
606 | my $compexcl = compexcl("09dc"); | |
607 | ||
608 | The compexcl() returns the composition exclusion (that is, if the | |
9046a8ae SC |
609 | character should not be produced during a precomposition) of the |
610 | character specified by a B<code point argument>. | |
b08cd201 JH |
611 | |
612 | If there is a composition exclusion for the character, true is | |
613 | returned. Otherwise, false is returned. | |
614 | ||
615 | =cut | |
616 | ||
617 | my %COMPEXCL; | |
618 | ||
619 | sub _compexcl { | |
620 | unless (%COMPEXCL) { | |
551b6b6f | 621 | if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) { |
6c8d78fb | 622 | local $_; |
b08cd201 | 623 | while (<$COMPEXCLFH>) { |
822ebcc8 | 624 | if (/^([0-9A-F]+)\s+\#\s+/) { |
b08cd201 JH |
625 | my $code = hex($1); |
626 | $COMPEXCL{$code} = undef; | |
627 | } | |
628 | } | |
629 | close($COMPEXCLFH); | |
630 | } | |
631 | } | |
632 | } | |
633 | ||
634 | sub compexcl { | |
635 | my $arg = shift; | |
636 | my $code = _getcode($arg); | |
74f8133e JH |
637 | croak __PACKAGE__, "::compexcl: unknown code '$arg'" |
638 | unless defined $code; | |
b08cd201 JH |
639 | |
640 | _compexcl() unless %COMPEXCL; | |
641 | ||
642 | return exists $COMPEXCL{$code}; | |
643 | } | |
644 | ||
645 | =head2 casefold | |
646 | ||
55d7b906 | 647 | use Unicode::UCD 'casefold'; |
b08cd201 | 648 | |
82c0b05b | 649 | my $casefold = casefold("00DF"); |
b08cd201 JH |
650 | |
651 | The casefold() returns the locale-independent case folding of the | |
652 | character specified by a B<code point argument>. | |
653 | ||
654 | If there is a case folding for that character, a reference to a hash | |
655 | with the following fields is returned: | |
656 | ||
657 | key | |
658 | ||
659 | code code point with at least four hexdigits | |
660 | status "C", "F", "S", or "I" | |
661 | mapping one or more codes separated by spaces | |
662 | ||
663 | The meaning of the I<status> is as follows: | |
664 | ||
665 | C common case folding, common mappings shared | |
666 | by both simple and full mappings | |
667 | F full case folding, mappings that cause strings | |
668 | to grow in length. Multiple characters are separated | |
669 | by spaces | |
670 | S simple case folding, mappings to single characters | |
671 | where different from F | |
672 | I special case for dotted uppercase I and | |
673 | dotless lowercase i | |
674 | - If this mapping is included, the result is | |
675 | case-insensitive, but dotless and dotted I's | |
676 | are not distinguished | |
677 | - If this mapping is excluded, the result is not | |
678 | fully case-insensitive, but dotless and dotted | |
679 | I's are distinguished | |
680 | ||
681 | If there is no case folding for that character, C<undef> is returned. | |
682 | ||
683 | For more information about case mappings see | |
684 | http://www.unicode.org/unicode/reports/tr21/ | |
685 | ||
686 | =cut | |
687 | ||
688 | my %CASEFOLD; | |
689 | ||
690 | sub _casefold { | |
691 | unless (%CASEFOLD) { | |
551b6b6f | 692 | if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) { |
6c8d78fb | 693 | local $_; |
b08cd201 JH |
694 | while (<$CASEFOLDFH>) { |
695 | if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) { | |
696 | my $code = hex($1); | |
697 | $CASEFOLD{$code} = { code => $1, | |
698 | status => $2, | |
699 | mapping => $3 }; | |
700 | } | |
701 | } | |
702 | close($CASEFOLDFH); | |
703 | } | |
704 | } | |
705 | } | |
706 | ||
707 | sub casefold { | |
708 | my $arg = shift; | |
709 | my $code = _getcode($arg); | |
74f8133e JH |
710 | croak __PACKAGE__, "::casefold: unknown code '$arg'" |
711 | unless defined $code; | |
b08cd201 JH |
712 | |
713 | _casefold() unless %CASEFOLD; | |
714 | ||
715 | return $CASEFOLD{$code}; | |
716 | } | |
717 | ||
718 | =head2 casespec | |
719 | ||
55d7b906 | 720 | use Unicode::UCD 'casespec'; |
b08cd201 | 721 | |
82c0b05b | 722 | my $casespec = casespec("FB00"); |
b08cd201 JH |
723 | |
724 | The casespec() returns the potentially locale-dependent case mapping | |
725 | of the character specified by a B<code point argument>. The mapping | |
726 | may change the length of the string (which the basic Unicode case | |
727 | mappings as returned by charinfo() never do). | |
728 | ||
729 | If there is a case folding for that character, a reference to a hash | |
730 | with the following fields is returned: | |
731 | ||
732 | key | |
733 | ||
734 | code code point with at least four hexdigits | |
735 | lower lowercase | |
736 | title titlecase | |
737 | upper uppercase | |
738 | condition condition list (may be undef) | |
739 | ||
740 | The C<condition> is optional. Where present, it consists of one or | |
741 | more I<locales> or I<contexts>, separated by spaces (other than as | |
742 | used to separate elements, spaces are to be ignored). A condition | |
743 | list overrides the normal behavior if all of the listed conditions are | |
744 | true. Case distinctions in the condition list are not significant. | |
82c0b05b | 745 | Conditions preceded by "NON_" represent the negation of the condition. |
b08cd201 | 746 | |
f499c386 JH |
747 | Note that when there are multiple case folding definitions for a |
748 | single code point because of different locales, the value returned by | |
749 | casespec() is a hash reference which has the locales as the keys and | |
750 | hash references as described above as the values. | |
751 | ||
b08cd201 | 752 | A I<locale> is defined as a 2-letter ISO 3166 country code, possibly |
e618509d JH |
753 | followed by a "_" and a 2-letter ISO language code (possibly followed |
754 | by a "_" and a variant code). You can find the lists of those codes, | |
755 | see L<Locale::Country> and L<Locale::Language>. | |
b08cd201 JH |
756 | |
757 | A I<context> is one of the following choices: | |
758 | ||
759 | FINAL The letter is not followed by a letter of | |
760 | general category L (e.g. Ll, Lt, Lu, Lm, or Lo) | |
761 | MODERN The mapping is only used for modern text | |
e618509d | 762 | AFTER_i The last base character was "i" (U+0069) |
b08cd201 JH |
763 | |
764 | For more information about case mappings see | |
765 | http://www.unicode.org/unicode/reports/tr21/ | |
766 | ||
767 | =cut | |
768 | ||
769 | my %CASESPEC; | |
770 | ||
771 | sub _casespec { | |
772 | unless (%CASESPEC) { | |
551b6b6f | 773 | if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) { |
6c8d78fb | 774 | local $_; |
b08cd201 JH |
775 | while (<$CASESPECFH>) { |
776 | if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) { | |
f499c386 JH |
777 | my ($hexcode, $lower, $title, $upper, $condition) = |
778 | ($1, $2, $3, $4, $5); | |
779 | my $code = hex($hexcode); | |
780 | if (exists $CASESPEC{$code}) { | |
781 | if (exists $CASESPEC{$code}->{code}) { | |
782 | my ($oldlower, | |
783 | $oldtitle, | |
784 | $oldupper, | |
785 | $oldcondition) = | |
786 | @{$CASESPEC{$code}}{qw(lower | |
787 | title | |
788 | upper | |
789 | condition)}; | |
822ebcc8 JH |
790 | if (defined $oldcondition) { |
791 | my ($oldlocale) = | |
f499c386 | 792 | ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/); |
f499c386 JH |
793 | delete $CASESPEC{$code}; |
794 | $CASESPEC{$code}->{$oldlocale} = | |
795 | { code => $hexcode, | |
796 | lower => $oldlower, | |
797 | title => $oldtitle, | |
798 | upper => $oldupper, | |
799 | condition => $oldcondition }; | |
f499c386 JH |
800 | } |
801 | } | |
802 | my ($locale) = | |
803 | ($condition =~ /^([a-z][a-z](?:_\S+)?)/); | |
804 | $CASESPEC{$code}->{$locale} = | |
805 | { code => $hexcode, | |
806 | lower => $lower, | |
807 | title => $title, | |
808 | upper => $upper, | |
809 | condition => $condition }; | |
810 | } else { | |
811 | $CASESPEC{$code} = | |
812 | { code => $hexcode, | |
813 | lower => $lower, | |
814 | title => $title, | |
815 | upper => $upper, | |
816 | condition => $condition }; | |
817 | } | |
b08cd201 JH |
818 | } |
819 | } | |
820 | close($CASESPECFH); | |
821 | } | |
822 | } | |
823 | } | |
824 | ||
825 | sub casespec { | |
826 | my $arg = shift; | |
827 | my $code = _getcode($arg); | |
74f8133e JH |
828 | croak __PACKAGE__, "::casespec: unknown code '$arg'" |
829 | unless defined $code; | |
b08cd201 JH |
830 | |
831 | _casespec() unless %CASESPEC; | |
832 | ||
741297c1 | 833 | return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; |
b08cd201 JH |
834 | } |
835 | ||
a2bd7410 JH |
836 | =head2 namedseq() |
837 | ||
838 | use Unicode::UCD 'namedseq'; | |
839 | ||
840 | my $namedseq = namedseq("KATAKANA LETTER AINU P"); | |
841 | my @namedseq = namedseq("KATAKANA LETTER AINU P"); | |
842 | my %namedseq = namedseq(); | |
843 | ||
844 | If used with a single argument in a scalar context, returns the string | |
845 | consisting of the code points of the named sequence, or C<undef> if no | |
846 | named sequence by that name exists. If used with a single argument in | |
847 | a list context, returns list of the code points. If used with no | |
848 | arguments in a list context, returns a hash with the names of the | |
849 | named sequences as the keys and the named sequences as strings as | |
850 | the values. Otherwise, returns C<undef> or empty list depending | |
851 | on the context. | |
852 | ||
853 | (New from Unicode 4.1.0) | |
854 | ||
855 | =cut | |
856 | ||
857 | my %NAMEDSEQ; | |
858 | ||
859 | sub _namedseq { | |
860 | unless (%NAMEDSEQ) { | |
861 | if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { | |
862 | local $_; | |
863 | while (<$NAMEDSEQFH>) { | |
864 | if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { | |
865 | my ($n, $s) = ($1, $2); | |
866 | my @s = map { chr(hex($_)) } split(' ', $s); | |
867 | $NAMEDSEQ{$n} = join("", @s); | |
868 | } | |
869 | } | |
870 | close($NAMEDSEQFH); | |
871 | } | |
872 | } | |
873 | } | |
874 | ||
875 | sub namedseq { | |
876 | _namedseq() unless %NAMEDSEQ; | |
877 | my $wantarray = wantarray(); | |
878 | if (defined $wantarray) { | |
879 | if ($wantarray) { | |
880 | if (@_ == 0) { | |
881 | return %NAMEDSEQ; | |
882 | } elsif (@_ == 1) { | |
883 | my $s = $NAMEDSEQ{ $_[0] }; | |
884 | return defined $s ? map { ord($_) } split('', $s) : (); | |
885 | } | |
886 | } elsif (@_ == 1) { | |
887 | return $NAMEDSEQ{ $_[0] }; | |
888 | } | |
889 | } | |
890 | return; | |
891 | } | |
892 | ||
55d7b906 | 893 | =head2 Unicode::UCD::UnicodeVersion |
10a6ecd2 | 894 | |
55d7b906 JH |
895 | Unicode::UCD::UnicodeVersion() returns the version of the Unicode |
896 | Character Database, in other words, the version of the Unicode | |
78bf21c2 JH |
897 | standard the database implements. The version is a string |
898 | of numbers delimited by dots (C<'.'>). | |
10a6ecd2 JH |
899 | |
900 | =cut | |
901 | ||
902 | my $UNICODEVERSION; | |
903 | ||
904 | sub UnicodeVersion { | |
905 | unless (defined $UNICODEVERSION) { | |
906 | openunicode(\$VERSIONFH, "version"); | |
907 | chomp($UNICODEVERSION = <$VERSIONFH>); | |
908 | close($VERSIONFH); | |
909 | croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'" | |
910 | unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/; | |
911 | } | |
912 | return $UNICODEVERSION; | |
913 | } | |
3aa957f9 JH |
914 | |
915 | =head2 Implementation Note | |
32c16050 | 916 | |
ad9cab37 JH |
917 | The first use of charinfo() opens a read-only filehandle to the Unicode |
918 | Character Database (the database is included in the Perl distribution). | |
78bf21c2 JH |
919 | The filehandle is then kept open for further queries. In other words, |
920 | if you are wondering where one of your filehandles went, that's where. | |
32c16050 | 921 | |
8b731da2 JH |
922 | =head1 BUGS |
923 | ||
924 | Does not yet support EBCDIC platforms. | |
925 | ||
561c79ed JH |
926 | =head1 AUTHOR |
927 | ||
928 | Jarkko Hietaniemi | |
929 | ||
930 | =cut | |
931 | ||
932 | 1; |