This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
UCD.pm: Use subclassed warnings
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
36c2430c 5no warnings 'surrogate'; # surrogates can be inputs to this
98ef7649 6use charnames ();
561c79ed 7
4a999261 8our $VERSION = '0.32';
561c79ed 9
741297c1
JH
10use Storable qw(dclone);
11
561c79ed
JH
12require Exporter;
13
14our @ISA = qw(Exporter);
74f8133e 15
10a6ecd2
JH
16our @EXPORT_OK = qw(charinfo
17 charblock charscript
18 charblocks charscripts
b08cd201 19 charinrange
ea508aee 20 general_categories bidi_types
b08cd201 21 compexcl
a2bd7410 22 casefold casespec
7319f91d
KW
23 namedseq
24 num
25 );
561c79ed
JH
26
27use Carp;
28
29=head1 NAME
30
55d7b906 31Unicode::UCD - Unicode character database
561c79ed
JH
32
33=head1 SYNOPSIS
34
55d7b906 35 use Unicode::UCD 'charinfo';
b08cd201 36 my $charinfo = charinfo($codepoint);
561c79ed 37
956cae9a
KW
38 use Unicode::UCD 'casefold';
39 my $casefold = casefold(0xFB00);
40
5d8e6e41
KW
41 use Unicode::UCD 'casespec';
42 my $casespec = casespec(0xFB00);
43
55d7b906 44 use Unicode::UCD 'charblock';
e882dd67
JH
45 my $charblock = charblock($codepoint);
46
55d7b906 47 use Unicode::UCD 'charscript';
65044554 48 my $charscript = charscript($codepoint);
561c79ed 49
55d7b906 50 use Unicode::UCD 'charblocks';
e145285f
JH
51 my $charblocks = charblocks();
52
55d7b906 53 use Unicode::UCD 'charscripts';
ea508aee 54 my $charscripts = charscripts();
e145285f 55
55d7b906 56 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
57 my $range = charscript($script);
58 print "looks like $script\n" if charinrange($range, $codepoint);
59
ea508aee
JH
60 use Unicode::UCD qw(general_categories bidi_types);
61 my $categories = general_categories();
62 my $types = bidi_types();
63
55d7b906 64 use Unicode::UCD 'compexcl';
e145285f
JH
65 my $compexcl = compexcl($codepoint);
66
a2bd7410
JH
67 use Unicode::UCD 'namedseq';
68 my $namedseq = namedseq($named_sequence_name);
69
55d7b906 70 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 71
7319f91d
KW
72 my $convert_to_numeric =
73 Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
74
561c79ed
JH
75=head1 DESCRIPTION
76
a452d459
KW
77The Unicode::UCD module offers a series of functions that
78provide a simple interface to the Unicode
8b731da2 79Character Database.
561c79ed 80
a452d459
KW
81=head2 code point argument
82
83Some of the functions are called with a I<code point argument>, which is either
84a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
85followed by hexadecimals designating a Unicode code point. In other words, if
86you want a code point to be interpreted as a hexadecimal number, you must
87prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
88interpreted as a decimal code point. Also note that Unicode is B<not> limited
89to 16 bits (the number of Unicode code points is open-ended, in theory
90unlimited): you may have more than 4 hexdigits.
561c79ed
JH
91=cut
92
10a6ecd2
JH
93my $UNICODEFH;
94my $BLOCKSFH;
10a6ecd2 95my $VERSIONFH;
b08cd201
JH
96my $CASEFOLDFH;
97my $CASESPECFH;
a2bd7410 98my $NAMEDSEQFH;
561c79ed
JH
99
100sub openunicode {
101 my ($rfh, @path) = @_;
102 my $f;
103 unless (defined $$rfh) {
104 for my $d (@INC) {
105 use File::Spec;
55d7b906 106 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 107 last if open($$rfh, $f);
e882dd67 108 undef $f;
561c79ed 109 }
e882dd67
JH
110 croak __PACKAGE__, ": failed to find ",
111 File::Spec->catfile(@path), " in @INC"
112 unless defined $f;
561c79ed
JH
113 }
114 return $f;
115}
116
a452d459 117=head2 B<charinfo()>
561c79ed 118
55d7b906 119 use Unicode::UCD 'charinfo';
561c79ed 120
b08cd201 121 my $charinfo = charinfo(0x41);
561c79ed 122
a452d459
KW
123This returns information about the input L</code point argument>
124as a reference to a hash of fields as defined by the Unicode
125standard. If the L</code point argument> is not assigned in the standard
126(i.e., has the general category C<Cn> meaning C<Unassigned>)
127or is a non-character (meaning it is guaranteed to never be assigned in
128the standard),
129B<undef> is returned.
130
131Fields that aren't applicable to the particular code point argument exist in the
132returned hash, and are empty.
133
134The keys in the hash with the meanings of their values are:
135
136=over
137
138=item B<code>
139
140the input L</code point argument> expressed in hexadecimal, with leading zeros
141added if necessary to make it contain at least four hexdigits
142
143=item B<name>
144
145name of I<code>, all IN UPPER CASE.
146Some control-type code points do not have names.
147This field will be empty for C<Surrogate> and C<Private Use> code points,
148and for the others without a name,
149it will contain a description enclosed in angle brackets, like
150C<E<lt>controlE<gt>>.
151
152
153=item B<category>
154
155The short name of the general category of I<code>.
156This will match one of the keys in the hash returned by L</general_categories()>.
157
158=item B<combining>
159
160the combining class number for I<code> used in the Canonical Ordering Algorithm.
161For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
162available at
163L<http://www.unicode.org/versions/Unicode5.1.0/>
164
165=item B<bidi>
166
167bidirectional type of I<code>.
168This will match one of the keys in the hash returned by L</bidi_types()>.
169
170=item B<decomposition>
171
172is empty if I<code> has no decomposition; or is one or more codes
173(separated by spaces) that taken in order represent a decomposition for
174I<code>. Each has at least four hexdigits.
175The codes may be preceded by a word enclosed in angle brackets then a space,
176like C<E<lt>compatE<gt> >, giving the type of decomposition
177
06bba7d5
KW
178This decomposition may be an intermediate one whose components are also
179decomposable. Use L<Unicode::Normalize> to get the final decomposition.
180
a452d459
KW
181=item B<decimal>
182
183if I<code> is a decimal digit this is its integer numeric value
184
185=item B<digit>
186
89e4a205
KW
187if I<code> represents some other digit-like number, this is its integer
188numeric value
a452d459
KW
189
190=item B<numeric>
191
192if I<code> represents a whole or rational number, this is its numeric value.
193Rational values are expressed as a string like C<1/4>.
194
195=item B<mirrored>
196
197C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
198
199=item B<unicode10>
200
201name of I<code> in the Unicode 1.0 standard if one
202existed for this code point and is different from the current name
203
204=item B<comment>
205
89e4a205 206As of Unicode 6.0, this is always empty.
a452d459
KW
207
208=item B<upper>
209
06bba7d5
KW
210is empty if there is no single code point uppercase mapping for I<code>
211(it's uppercase mapping is itself);
a452d459
KW
212otherwise it is that mapping expressed as at least four hexdigits.
213(L</casespec()> should be used in addition to B<charinfo()>
214for case mappings when the calling program can cope with multiple code point
215mappings.)
216
217=item B<lower>
218
06bba7d5
KW
219is empty if there is no single code point lowercase mapping for I<code>
220(it's lowercase mapping is itself);
a452d459
KW
221otherwise it is that mapping expressed as at least four hexdigits.
222(L</casespec()> should be used in addition to B<charinfo()>
223for case mappings when the calling program can cope with multiple code point
224mappings.)
225
226=item B<title>
227
06bba7d5
KW
228is empty if there is no single code point titlecase mapping for I<code>
229(it's titlecase mapping is itself);
a452d459
KW
230otherwise it is that mapping expressed as at least four hexdigits.
231(L</casespec()> should be used in addition to B<charinfo()>
232for case mappings when the calling program can cope with multiple code point
233mappings.)
234
235=item B<block>
236
89e4a205 237block I<code> belongs to (used in C<\p{Blk=...}>).
a452d459
KW
238See L</Blocks versus Scripts>.
239
240
241=item B<script>
242
243script I<code> belongs to.
244See L</Blocks versus Scripts>.
245
246=back
32c16050
JH
247
248Note that you cannot do (de)composition and casing based solely on the
a452d459
KW
249I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
250you will need also the L</compexcl()>, and L</casespec()> functions.
561c79ed
JH
251
252=cut
253
e10d7780 254# NB: This function is nearly duplicated in charnames.pm
10a6ecd2
JH
255sub _getcode {
256 my $arg = shift;
257
dc0a4417 258 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 259 return $arg;
dc0a4417 260 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
261 return hex($1);
262 }
263
264 return;
265}
266
ac5ea531
JH
267# Lingua::KO::Hangul::Util not part of the standard distribution
268# but it will be used if available.
269
270eval { require Lingua::KO::Hangul::Util };
271my $hasHangulUtil = ! $@;
272if ($hasHangulUtil) {
273 Lingua::KO::Hangul::Util->import();
274}
9087a70b
TS
275
276sub hangul_decomp { # internal: called from charinfo
ac5ea531
JH
277 if ($hasHangulUtil) {
278 my @tmp = decomposeHangul(shift);
279 return sprintf("%04X %04X", @tmp) if @tmp == 2;
280 return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
281 }
282 return;
283}
284
285sub hangul_charname { # internal: called from charinfo
286 return sprintf("HANGUL SYLLABLE-%04X", shift);
a6fa416b
TS
287}
288
9087a70b
TS
289sub han_charname { # internal: called from charinfo
290 return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
a6fa416b
TS
291}
292
5d8e6e41 293# Overwritten by data in file
324f9e44
RB
294my %first_last = (
295 'CJK Ideograph Extension A' => [ 0x3400, 0x4DB5 ],
296 'CJK Ideograph' => [ 0x4E00, 0x9FA5 ],
297 'CJK Ideograph Extension B' => [ 0x20000, 0x2A6D6 ],
298);
299
300get_charinfo_ranges();
301
302sub get_charinfo_ranges {
303 my @blocks = keys %first_last;
304
305 my $fh;
306 openunicode( \$fh, 'UnicodeData.txt' );
307 if( defined $fh ){
308 while( my $line = <$fh> ){
309 next unless $line =~ /(?:First|Last)/;
310 if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
311 my ($number,$block,$type);
312 ($number,$block) = split /;/, $line;
313 $block =~ s/<|>//g;
314 ($block,$type) = split /, /, $block;
315 my $index = $type eq 'First' ? 0 : 1;
316 $first_last{ $block }->[$index] = hex $number;
317 }
318 }
319 }
320}
321
a6fa416b
TS
322my @CharinfoRanges = (
323# block name
324# [ first, last, coderef to name, coderef to decompose ],
325# CJK Ideographs Extension A
324f9e44 326 [ @{ $first_last{'CJK Ideograph Extension A'} }, \&han_charname, undef ],
a6fa416b 327# CJK Ideographs
324f9e44 328 [ @{ $first_last{'CJK Ideograph'} }, \&han_charname, undef ],
a6fa416b 329# Hangul Syllables
ac5ea531 330 [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
a6fa416b
TS
331# Non-Private Use High Surrogates
332 [ 0xD800, 0xDB7F, undef, undef ],
333# Private Use High Surrogates
334 [ 0xDB80, 0xDBFF, undef, undef ],
335# Low Surrogates
336 [ 0xDC00, 0xDFFF, undef, undef ],
337# The Private Use Area
338 [ 0xE000, 0xF8FF, undef, undef ],
339# CJK Ideographs Extension B
324f9e44 340 [ @{ $first_last{'CJK Ideograph Extension B'} }, \&han_charname, undef ],
a6fa416b
TS
341# Plane 15 Private Use Area
342 [ 0xF0000, 0xFFFFD, undef, undef ],
343# Plane 16 Private Use Area
344 [ 0x100000, 0x10FFFD, undef, undef ],
345);
346
561c79ed 347sub charinfo {
10a6ecd2
JH
348 my $arg = shift;
349 my $code = _getcode($arg);
350 croak __PACKAGE__, "::charinfo: unknown code '$arg'"
351 unless defined $code;
e63dbbf9 352 my $hexk = sprintf("%06X", $code);
a6fa416b
TS
353 my($rcode,$rname,$rdec);
354 foreach my $range (@CharinfoRanges){
74f8133e 355 if ($range->[0] <= $code && $code <= $range->[1]) {
a6fa416b 356 $rcode = $hexk;
e63dbbf9
JH
357 $rcode =~ s/^0+//;
358 $rcode = sprintf("%04X", hex($rcode));
a6fa416b
TS
359 $rname = $range->[2] ? $range->[2]->($code) : '';
360 $rdec = $range->[3] ? $range->[3]->($code) : '';
e63dbbf9 361 $hexk = sprintf("%06X", $range->[0]); # replace by the first
a6fa416b
TS
362 last;
363 }
364 }
551b6b6f 365 openunicode(\$UNICODEFH, "UnicodeData.txt");
10a6ecd2 366 if (defined $UNICODEFH) {
e63dbbf9
JH
367 use Search::Dict 1.02;
368 if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
10a6ecd2 369 my $line = <$UNICODEFH>;
c5a29f40 370 return unless defined $line;
561c79ed
JH
371 chomp $line;
372 my %prop;
373 @prop{qw(
374 code name category
375 combining bidi decomposition
376 decimal digit numeric
377 mirrored unicode10 comment
378 upper lower title
379 )} = split(/;/, $line, -1);
e63dbbf9
JH
380 $hexk =~ s/^0+//;
381 $hexk = sprintf("%04X", hex($hexk));
561c79ed 382 if ($prop{code} eq $hexk) {
a196fbfd
JH
383 $prop{block} = charblock($code);
384 $prop{script} = charscript($code);
a6fa416b
TS
385 if(defined $rname){
386 $prop{code} = $rcode;
387 $prop{name} = $rname;
388 $prop{decomposition} = $rdec;
389 }
b08cd201 390 return \%prop;
561c79ed
JH
391 }
392 }
393 }
394 return;
395}
396
e882dd67
JH
397sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
398 my ($table, $lo, $hi, $code) = @_;
399
400 return if $lo > $hi;
401
402 my $mid = int(($lo+$hi) / 2);
403
404 if ($table->[$mid]->[0] < $code) {
10a6ecd2 405 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
406 return $table->[$mid]->[2];
407 } else {
408 _search($table, $mid + 1, $hi, $code);
409 }
410 } elsif ($table->[$mid]->[0] > $code) {
411 _search($table, $lo, $mid - 1, $code);
412 } else {
413 return $table->[$mid]->[2];
414 }
415}
416
3a12600d
KW
417sub _read_table {
418
419 # Returns the contents of the mktables generated table file located at $1
420 # in the form of an array of arrays. Each outer array denotes a range
421 # with [0] the start point of that range; [1] the end point; and [2] the
422 # value that every code point in the range has.
423 #
424 # This has the side effect of setting
425 # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
426 # table; and
427 # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
428 # not listed in the table.
429 # where $property is the Unicode property name, preceded by 'To' for map
430 # properties., e.g., 'ToSc'.
431 #
432 # Table entries look like one of:
433 # 0000 0040 Common # [65]
434 # 00AA Latin
435
436 my $table = shift;
437 my @return;
438 local $_;
439
440 for (split /^/m, do $table) {
441 my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
442 \s* ( \# .* )? # Optional comment
443 $ /x;
444 $end = $start if $end eq "";
445 push @return, [ hex $start, hex $end, $value ];
446 }
447 return @return;
448}
449
10a6ecd2
JH
450sub charinrange {
451 my ($range, $arg) = @_;
452 my $code = _getcode($arg);
453 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
454 unless defined $code;
455 _search($range, 0, $#$range, $code);
456}
457
a452d459 458=head2 B<charblock()>
561c79ed 459
55d7b906 460 use Unicode::UCD 'charblock';
561c79ed
JH
461
462 my $charblock = charblock(0x41);
10a6ecd2 463 my $charblock = charblock(1234);
a452d459 464 my $charblock = charblock(0x263a);
10a6ecd2
JH
465 my $charblock = charblock("U+263a");
466
78bf21c2 467 my $range = charblock('Armenian');
10a6ecd2 468
a452d459
KW
469With a L</code point argument> charblock() returns the I<block> the code point
470belongs to, e.g. C<Basic Latin>.
471If the code point is unassigned, this returns the block it would belong to if
472it were assigned (which it may in future versions of the Unicode Standard).
10a6ecd2 473
78bf21c2
JH
474See also L</Blocks versus Scripts>.
475
eb0cc9e3 476If supplied with an argument that can't be a code point, charblock() tries
a452d459 477to do the opposite and interpret the argument as a code point block. The
eb0cc9e3 478return value is a I<range>: an anonymous list of lists that contain
a2bd7410 479I<start-of-range>, I<end-of-range> code point pairs. You can test whether
a452d459
KW
480a code point is in a range using the L</charinrange()> function. If the
481argument is not a known code point block, B<undef> is returned.
561c79ed 482
561c79ed
JH
483=cut
484
485my @BLOCKS;
10a6ecd2 486my %BLOCKS;
561c79ed 487
10a6ecd2 488sub _charblocks {
06bba7d5
KW
489
490 # Can't read from the mktables table because it loses the hyphens in the
491 # original.
561c79ed 492 unless (@BLOCKS) {
10a6ecd2 493 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 494 local $_;
10a6ecd2 495 while (<$BLOCKSFH>) {
2796c109 496 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
497 my ($lo, $hi) = (hex($1), hex($2));
498 my $subrange = [ $lo, $hi, $3 ];
499 push @BLOCKS, $subrange;
500 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
501 }
502 }
10a6ecd2 503 close($BLOCKSFH);
561c79ed
JH
504 }
505 }
10a6ecd2
JH
506}
507
508sub charblock {
509 my $arg = shift;
510
511 _charblocks() unless @BLOCKS;
512
513 my $code = _getcode($arg);
561c79ed 514
10a6ecd2
JH
515 if (defined $code) {
516 _search(\@BLOCKS, 0, $#BLOCKS, $code);
517 } else {
518 if (exists $BLOCKS{$arg}) {
741297c1 519 return dclone $BLOCKS{$arg};
10a6ecd2
JH
520 } else {
521 return;
522 }
523 }
e882dd67
JH
524}
525
a452d459 526=head2 B<charscript()>
e882dd67 527
55d7b906 528 use Unicode::UCD 'charscript';
e882dd67
JH
529
530 my $charscript = charscript(0x41);
10a6ecd2
JH
531 my $charscript = charscript(1234);
532 my $charscript = charscript("U+263a");
e882dd67 533
78bf21c2 534 my $range = charscript('Thai');
10a6ecd2 535
a452d459
KW
536With a L</code point argument> charscript() returns the I<script> the
537code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
538If the code point is unassigned, it returns B<undef>
78bf21c2 539
eb0cc9e3 540If supplied with an argument that can't be a code point, charscript() tries
a452d459 541to do the opposite and interpret the argument as a code point script. The
eb0cc9e3
JH
542return value is a I<range>: an anonymous list of lists that contain
543I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a452d459
KW
544code point is in a range using the L</charinrange()> function. If the
545argument is not a known code point script, B<undef> is returned.
546
547See also L</Blocks versus Scripts>.
e882dd67 548
e882dd67
JH
549=cut
550
551my @SCRIPTS;
10a6ecd2 552my %SCRIPTS;
e882dd67 553
10a6ecd2 554sub _charscripts {
7bccef0b
KW
555 @SCRIPTS =_read_table("unicore/To/Sc.pl") unless @SCRIPTS;
556 foreach my $entry (@SCRIPTS) {
f3d50ac9 557 $entry->[2] =~ s/(_\w)/\L$1/g; # Preserve old-style casing
7bccef0b 558 push @{$SCRIPTS{$entry->[2]}}, $entry;
e882dd67 559 }
10a6ecd2
JH
560}
561
562sub charscript {
563 my $arg = shift;
564
565 _charscripts() unless @SCRIPTS;
e882dd67 566
10a6ecd2
JH
567 my $code = _getcode($arg);
568
569 if (defined $code) {
7bccef0b
KW
570 my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
571 return $result if defined $result;
572 #return $utf8::SwashInfo{'ToSc'}{'missing'};
573 } elsif (exists $SCRIPTS{$arg}) {
574 return dclone $SCRIPTS{$arg};
10a6ecd2 575 }
7bccef0b
KW
576
577 return;
10a6ecd2
JH
578}
579
a452d459 580=head2 B<charblocks()>
10a6ecd2 581
55d7b906 582 use Unicode::UCD 'charblocks';
10a6ecd2 583
b08cd201 584 my $charblocks = charblocks();
10a6ecd2 585
b08cd201 586charblocks() returns a reference to a hash with the known block names
a452d459 587as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 588
78bf21c2
JH
589See also L</Blocks versus Scripts>.
590
10a6ecd2
JH
591=cut
592
593sub charblocks {
b08cd201 594 _charblocks() unless %BLOCKS;
741297c1 595 return dclone \%BLOCKS;
10a6ecd2
JH
596}
597
a452d459 598=head2 B<charscripts()>
10a6ecd2 599
55d7b906 600 use Unicode::UCD 'charscripts';
10a6ecd2 601
ea508aee 602 my $charscripts = charscripts();
10a6ecd2 603
ea508aee 604charscripts() returns a reference to a hash with the known script
a452d459 605names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 606the values.
10a6ecd2 607
78bf21c2
JH
608See also L</Blocks versus Scripts>.
609
10a6ecd2
JH
610=cut
611
612sub charscripts {
b08cd201 613 _charscripts() unless %SCRIPTS;
741297c1 614 return dclone \%SCRIPTS;
561c79ed
JH
615}
616
a452d459 617=head2 B<charinrange()>
10a6ecd2
JH
618
619In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
620can also test whether a code point is in the I<range> as returned by
a452d459
KW
621L</charblock()> and L</charscript()> or as the values of the hash returned
622by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 623
55d7b906 624 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
625
626 $range = charscript('Hiragana');
e145285f 627 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
628
629=cut
630
ea508aee
JH
631my %GENERAL_CATEGORIES =
632 (
633 'L' => 'Letter',
634 'LC' => 'CasedLetter',
635 'Lu' => 'UppercaseLetter',
636 'Ll' => 'LowercaseLetter',
637 'Lt' => 'TitlecaseLetter',
638 'Lm' => 'ModifierLetter',
639 'Lo' => 'OtherLetter',
640 'M' => 'Mark',
641 'Mn' => 'NonspacingMark',
642 'Mc' => 'SpacingMark',
643 'Me' => 'EnclosingMark',
644 'N' => 'Number',
645 'Nd' => 'DecimalNumber',
646 'Nl' => 'LetterNumber',
647 'No' => 'OtherNumber',
648 'P' => 'Punctuation',
649 'Pc' => 'ConnectorPunctuation',
650 'Pd' => 'DashPunctuation',
651 'Ps' => 'OpenPunctuation',
652 'Pe' => 'ClosePunctuation',
653 'Pi' => 'InitialPunctuation',
654 'Pf' => 'FinalPunctuation',
655 'Po' => 'OtherPunctuation',
656 'S' => 'Symbol',
657 'Sm' => 'MathSymbol',
658 'Sc' => 'CurrencySymbol',
659 'Sk' => 'ModifierSymbol',
660 'So' => 'OtherSymbol',
661 'Z' => 'Separator',
662 'Zs' => 'SpaceSeparator',
663 'Zl' => 'LineSeparator',
664 'Zp' => 'ParagraphSeparator',
665 'C' => 'Other',
666 'Cc' => 'Control',
667 'Cf' => 'Format',
668 'Cs' => 'Surrogate',
669 'Co' => 'PrivateUse',
670 'Cn' => 'Unassigned',
671 );
672
673sub general_categories {
674 return dclone \%GENERAL_CATEGORIES;
675}
676
a452d459 677=head2 B<general_categories()>
ea508aee
JH
678
679 use Unicode::UCD 'general_categories';
680
681 my $categories = general_categories();
682
a452d459 683This returns a reference to a hash which has short
ea508aee
JH
684general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
685names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
686C<Symbol>) as values. The hash is reversible in case you need to go
687from the long names to the short names. The general category is the
a452d459
KW
688one returned from
689L</charinfo()> under the C<category> key.
ea508aee
JH
690
691=cut
692
693my %BIDI_TYPES =
694 (
695 'L' => 'Left-to-Right',
696 'LRE' => 'Left-to-Right Embedding',
697 'LRO' => 'Left-to-Right Override',
698 'R' => 'Right-to-Left',
699 'AL' => 'Right-to-Left Arabic',
700 'RLE' => 'Right-to-Left Embedding',
701 'RLO' => 'Right-to-Left Override',
702 'PDF' => 'Pop Directional Format',
703 'EN' => 'European Number',
704 'ES' => 'European Number Separator',
705 'ET' => 'European Number Terminator',
706 'AN' => 'Arabic Number',
707 'CS' => 'Common Number Separator',
708 'NSM' => 'Non-Spacing Mark',
709 'BN' => 'Boundary Neutral',
710 'B' => 'Paragraph Separator',
711 'S' => 'Segment Separator',
712 'WS' => 'Whitespace',
713 'ON' => 'Other Neutrals',
714 );
715
a452d459 716=head2 B<bidi_types()>
ea508aee
JH
717
718 use Unicode::UCD 'bidi_types';
719
720 my $categories = bidi_types();
721
a452d459 722This returns a reference to a hash which has the short
ea508aee
JH
723bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
724names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
725hash is reversible in case you need to go from the long names to the
a452d459
KW
726short names. The bidi type is the one returned from
727L</charinfo()>
ea508aee
JH
728under the C<bidi> key. For the exact meaning of the various bidi classes
729the Unicode TR9 is recommended reading:
a452d459 730L<http://www.unicode.org/reports/tr9/>
ea508aee
JH
731(as of Unicode 5.0.0)
732
733=cut
734
a452d459
KW
735sub bidi_types {
736 return dclone \%BIDI_TYPES;
737}
738
739=head2 B<compexcl()>
b08cd201 740
55d7b906 741 use Unicode::UCD 'compexcl';
b08cd201 742
a452d459 743 my $compexcl = compexcl(0x09dc);
b08cd201 744
71a442a8
KW
745This routine is included for backwards compatibility, but as of Perl 5.12, for
746most purposes it is probably more convenient to use one of the following
747instead:
748
749 my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
750 my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
751
752or even
753
754 my $compexcl = chr(0x09dc) =~ /\p{CE};
755 my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
756
757The first two forms return B<true> if the L</code point argument> should not
758be produced by composition normalization. The final two forms
759additionally require that this fact not otherwise be determinable from
760the Unicode data base for them to return B<true>.
761
762This routine behaves identically to the final two forms. That is,
763it does not return B<true> if the code point has a decomposition
a452d459
KW
764consisting of another single code point, nor if its decomposition starts
765with a code point whose combining class is non-zero. Code points that meet
766either of these conditions should also not be produced by composition
71a442a8
KW
767normalization, which is probably why you should use the
768C<Full_Composition_Exclusion> property instead, as shown above.
b08cd201 769
71a442a8 770The routine returns B<false> otherwise.
b08cd201
JH
771
772=cut
773
b08cd201
JH
774sub compexcl {
775 my $arg = shift;
776 my $code = _getcode($arg);
74f8133e
JH
777 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
778 unless defined $code;
b08cd201 779
36c2430c 780 no warnings "non_unicode"; # So works on non-Unicode code points
71a442a8 781 return chr($code) =~ /\p{Composition_Exclusion}/;
b08cd201
JH
782}
783
a452d459 784=head2 B<casefold()>
b08cd201 785
55d7b906 786 use Unicode::UCD 'casefold';
b08cd201 787
a452d459
KW
788 my $casefold = casefold(0xDF);
789 if (defined $casefold) {
790 my @full_fold_hex = split / /, $casefold->{'full'};
791 my $full_fold_string =
792 join "", map {chr(hex($_))} @full_fold_hex;
793 my @turkic_fold_hex =
794 split / /, ($casefold->{'turkic'} ne "")
795 ? $casefold->{'turkic'}
796 : $casefold->{'full'};
797 my $turkic_fold_string =
798 join "", map {chr(hex($_))} @turkic_fold_hex;
799 }
800 if (defined $casefold && $casefold->{'simple'} ne "") {
801 my $simple_fold_hex = $casefold->{'simple'};
802 my $simple_fold_string = chr(hex($simple_fold_hex));
803 }
b08cd201 804
a452d459
KW
805This returns the (almost) locale-independent case folding of the
806character specified by the L</code point argument>.
b08cd201 807
a452d459
KW
808If there is no case folding for that code point, B<undef> is returned.
809
810If there is a case folding for that code point, a reference to a hash
b08cd201
JH
811with the following fields is returned:
812
a452d459
KW
813=over
814
815=item B<code>
816
817the input L</code point argument> expressed in hexadecimal, with leading zeros
818added if necessary to make it contain at least four hexdigits
819
820=item B<full>
821
822one or more codes (separated by spaces) that taken in order give the
823code points for the case folding for I<code>.
824Each has at least four hexdigits.
825
826=item B<simple>
827
828is empty, or is exactly one code with at least four hexdigits which can be used
829as an alternative case folding when the calling program cannot cope with the
830fold being a sequence of multiple code points. If I<full> is just one code
831point, then I<simple> equals I<full>. If there is no single code point folding
832defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
833inferior, but still better-than-nothing alternative folding to I<full>.
834
835=item B<mapping>
836
837is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
838otherwise. It can be considered to be the simplest possible folding for
839I<code>. It is defined primarily for backwards compatibility.
840
841=item B<status>
b08cd201 842
a452d459
KW
843is C<C> (for C<common>) if the best possible fold is a single code point
844(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
845folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
846there only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty). Note
847that this
848describes the contents of I<mapping>. It is defined primarily for backwards
849compatibility.
b08cd201 850
a452d459
KW
851On versions 3.1 and earlier of Unicode, I<status> can also be
852C<I> which is the same as C<C> but is a special case for dotted uppercase I and
853dotless lowercase i:
b08cd201 854
a452d459 855=over
b08cd201 856
a452d459
KW
857=item B<*>
858
859If you use this C<I> mapping, the result is case-insensitive,
860but dotless and dotted I's are not distinguished
861
862=item B<*>
863
864If you exclude this C<I> mapping, the result is not fully case-insensitive, but
865dotless and dotted I's are distinguished
866
867=back
868
869=item B<turkic>
870
871contains any special folding for Turkic languages. For versions of Unicode
872starting with 3.2, this field is empty unless I<code> has a different folding
873in Turkic languages, in which case it is one or more codes (separated by
874spaces) that taken in order give the code points for the case folding for
875I<code> in those languages.
876Each code has at least four hexdigits.
877Note that this folding does not maintain canonical equivalence without
878additional processing.
879
880For versions of Unicode 3.1 and earlier, this field is empty unless there is a
881special folding for Turkic languages, in which case I<status> is C<I>, and
882I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
883
884=back
885
886Programs that want complete generality and the best folding results should use
887the folding contained in the I<full> field. But note that the fold for some
888code points will be a sequence of multiple code points.
889
890Programs that can't cope with the fold mapping being multiple code points can
891use the folding contained in the I<simple> field, with the loss of some
892generality. In Unicode 5.1, about 7% of the defined foldings have no single
893code point folding.
894
895The I<mapping> and I<status> fields are provided for backwards compatibility for
896existing programs. They contain the same values as in previous versions of
897this function.
898
899Locale is not completely independent. The I<turkic> field contains results to
900use when the locale is a Turkic language.
b08cd201
JH
901
902For more information about case mappings see
a452d459 903L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
904
905=cut
906
907my %CASEFOLD;
908
909sub _casefold {
910 unless (%CASEFOLD) {
551b6b6f 911 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 912 local $_;
b08cd201 913 while (<$CASEFOLDFH>) {
a452d459 914 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
b08cd201 915 my $code = hex($1);
a452d459
KW
916 $CASEFOLD{$code}{'code'} = $1;
917 $CASEFOLD{$code}{'turkic'} = "" unless
918 defined $CASEFOLD{$code}{'turkic'};
919 if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and
920 # earlier Unicodes
921 # Both entries there (I
922 # only checked 3.1) are
923 # the same as C, and
924 # there are no other
925 # entries for those
926 # codepoints, so treat
927 # as if C, but override
928 # the turkic one for
929 # 'I'.
930 $CASEFOLD{$code}{'status'} = $2;
931 $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
932 $CASEFOLD{$code}{'mapping'} = $3;
933 $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
934 } elsif ($2 eq 'F') {
935 $CASEFOLD{$code}{'full'} = $3;
936 unless (defined $CASEFOLD{$code}{'simple'}) {
937 $CASEFOLD{$code}{'simple'} = "";
938 $CASEFOLD{$code}{'mapping'} = $3;
939 $CASEFOLD{$code}{'status'} = $2;
940 }
941 } elsif ($2 eq 'S') {
942
943
944 # There can't be a simple without a full, and simple
945 # overrides all but full
946
947 $CASEFOLD{$code}{'simple'} = $3;
948 $CASEFOLD{$code}{'mapping'} = $3;
949 $CASEFOLD{$code}{'status'} = $2;
950 } elsif ($2 eq 'T') {
951 $CASEFOLD{$code}{'turkic'} = $3;
952 } # else can't happen because only [CIFST] are possible
b08cd201
JH
953 }
954 }
955 close($CASEFOLDFH);
956 }
957 }
958}
959
960sub casefold {
961 my $arg = shift;
962 my $code = _getcode($arg);
74f8133e
JH
963 croak __PACKAGE__, "::casefold: unknown code '$arg'"
964 unless defined $code;
b08cd201
JH
965
966 _casefold() unless %CASEFOLD;
967
968 return $CASEFOLD{$code};
969}
970
a452d459 971=head2 B<casespec()>
b08cd201 972
55d7b906 973 use Unicode::UCD 'casespec';
b08cd201 974
a452d459 975 my $casespec = casespec(0xFB00);
b08cd201 976
a452d459
KW
977This returns the potentially locale-dependent case mappings of the L</code point
978argument>. The mappings may be longer than a single code point (which the basic
979Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 980
a452d459
KW
981If there are no case mappings for the L</code point argument>, or if all three
982possible mappings (I<lower>, I<title> and I<upper>) result in single code
5d8e6e41
KW
983points and are locale independent and unconditional, B<undef> is returned
984(which means that the case mappings, if any, for the code point are those
985returned by L</charinfo()>).
a452d459
KW
986
987Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41
KW
988of such hashes, explained below) is returned with the following keys and their
989meanings:
a452d459
KW
990
991The keys in the bottom layer hash with the meanings of their values are:
992
993=over
994
995=item B<code>
996
997the input L</code point argument> expressed in hexadecimal, with leading zeros
998added if necessary to make it contain at least four hexdigits
999
1000=item B<lower>
1001
1002one or more codes (separated by spaces) that taken in order give the
1003code points for the lower case of I<code>.
1004Each has at least four hexdigits.
1005
1006=item B<title>
b08cd201 1007
a452d459
KW
1008one or more codes (separated by spaces) that taken in order give the
1009code points for the title case of I<code>.
1010Each has at least four hexdigits.
b08cd201 1011
d2da20e3 1012=item B<upper>
b08cd201 1013
a452d459
KW
1014one or more codes (separated by spaces) that taken in order give the
1015code points for the upper case of I<code>.
1016Each has at least four hexdigits.
1017
1018=item B<condition>
1019
1020the conditions for the mappings to be valid.
1021If B<undef>, the mappings are always valid.
1022When defined, this field is a list of conditions,
1023all of which must be true for the mappings to be valid.
1024The list consists of one or more
1025I<locales> (see below)
1026and/or I<contexts> (explained in the next paragraph),
1027separated by spaces.
1028(Other than as used to separate elements, spaces are to be ignored.)
1029Case distinctions in the condition list are not significant.
82c0b05b 1030Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 1031
a452d459
KW
1032A I<context> is one of those defined in the Unicode standard.
1033For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1034available at
5d8e6e41
KW
1035L<http://www.unicode.org/versions/Unicode5.1.0/>.
1036These are for context-sensitive casing.
f499c386 1037
a452d459
KW
1038=back
1039
5d8e6e41
KW
1040The hash described above is returned for locale-independent casing, where
1041at least one of the mappings has length longer than one. If B<undef> is
1042returned, the code point may have mappings, but if so, all are length one,
1043and are returned by L</charinfo()>.
1044Note that when this function does return a value, it will be for the complete
1045set of mappings for a code point, even those whose length is one.
1046
1047If there are additional casing rules that apply only in certain locales,
1048an additional key for each will be defined in the returned hash. Each such key
1049will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1050followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1051and a variant code). You can find the lists of all possible locales, see
1052L<Locale::Country> and L<Locale::Language>.
89e4a205 1053(In Unicode 6.0, the only locales returned by this function
a452d459 1054are C<lt>, C<tr>, and C<az>.)
b08cd201 1055
5d8e6e41
KW
1056Each locale key is a reference to a hash that has the form above, and gives
1057the casing rules for that particular locale, which take precedence over the
1058locale-independent ones when in that locale.
1059
1060If the only casing for a code point is locale-dependent, then the returned
1061hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1062will contain only locale keys.
1063
b08cd201 1064For more information about case mappings see
a452d459 1065L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1066
1067=cut
1068
1069my %CASESPEC;
1070
1071sub _casespec {
1072 unless (%CASESPEC) {
551b6b6f 1073 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 1074 local $_;
b08cd201
JH
1075 while (<$CASESPECFH>) {
1076 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
1077 my ($hexcode, $lower, $title, $upper, $condition) =
1078 ($1, $2, $3, $4, $5);
1079 my $code = hex($hexcode);
1080 if (exists $CASESPEC{$code}) {
1081 if (exists $CASESPEC{$code}->{code}) {
1082 my ($oldlower,
1083 $oldtitle,
1084 $oldupper,
1085 $oldcondition) =
1086 @{$CASESPEC{$code}}{qw(lower
1087 title
1088 upper
1089 condition)};
822ebcc8
JH
1090 if (defined $oldcondition) {
1091 my ($oldlocale) =
f499c386 1092 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
1093 delete $CASESPEC{$code};
1094 $CASESPEC{$code}->{$oldlocale} =
1095 { code => $hexcode,
1096 lower => $oldlower,
1097 title => $oldtitle,
1098 upper => $oldupper,
1099 condition => $oldcondition };
f499c386
JH
1100 }
1101 }
1102 my ($locale) =
1103 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1104 $CASESPEC{$code}->{$locale} =
1105 { code => $hexcode,
1106 lower => $lower,
1107 title => $title,
1108 upper => $upper,
1109 condition => $condition };
1110 } else {
1111 $CASESPEC{$code} =
1112 { code => $hexcode,
1113 lower => $lower,
1114 title => $title,
1115 upper => $upper,
1116 condition => $condition };
1117 }
b08cd201
JH
1118 }
1119 }
1120 close($CASESPECFH);
1121 }
1122 }
1123}
1124
1125sub casespec {
1126 my $arg = shift;
1127 my $code = _getcode($arg);
74f8133e
JH
1128 croak __PACKAGE__, "::casespec: unknown code '$arg'"
1129 unless defined $code;
b08cd201
JH
1130
1131 _casespec() unless %CASESPEC;
1132
741297c1 1133 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
1134}
1135
a452d459 1136=head2 B<namedseq()>
a2bd7410
JH
1137
1138 use Unicode::UCD 'namedseq';
1139
1140 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1141 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1142 my %namedseq = namedseq();
1143
1144If used with a single argument in a scalar context, returns the string
a452d459 1145consisting of the code points of the named sequence, or B<undef> if no
a2bd7410 1146named sequence by that name exists. If used with a single argument in
956cae9a
KW
1147a list context, it returns the list of the ordinals of the code points. If used
1148with no
a2bd7410
JH
1149arguments in a list context, returns a hash with the names of the
1150named sequences as the keys and the named sequences as strings as
a452d459 1151the values. Otherwise, it returns B<undef> or an empty list depending
a2bd7410
JH
1152on the context.
1153
a452d459
KW
1154This function only operates on officially approved (not provisional) named
1155sequences.
a2bd7410 1156
27f853a0
KW
1157Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1158sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1159LETTER AINU P")> will return the same string this function does, but will also
1160operate on character names that aren't named sequences, without you having to
1161know which are which. See L<charnames>.
1162
a2bd7410
JH
1163=cut
1164
1165my %NAMEDSEQ;
1166
1167sub _namedseq {
1168 unless (%NAMEDSEQ) {
98ef7649 1169 if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
a2bd7410
JH
1170 local $_;
1171 while (<$NAMEDSEQFH>) {
98ef7649
KW
1172 if (/^ [0-9A-F]+ \ /x) {
1173 chomp;
1174 my ($sequence, $name) = split /\t/;
1175 my @s = map { chr(hex($_)) } split(' ', $sequence);
1176 $NAMEDSEQ{$name} = join("", @s);
a2bd7410
JH
1177 }
1178 }
1179 close($NAMEDSEQFH);
1180 }
1181 }
1182}
1183
1184sub namedseq {
98ef7649
KW
1185
1186 # Use charnames::string_vianame() which now returns this information,
1187 # unless the caller wants the hash returned, in which case we read it in,
1188 # and thereafter use it instead of calling charnames, as it is faster.
1189
a2bd7410
JH
1190 my $wantarray = wantarray();
1191 if (defined $wantarray) {
1192 if ($wantarray) {
1193 if (@_ == 0) {
98ef7649 1194 _namedseq() unless %NAMEDSEQ;
a2bd7410
JH
1195 return %NAMEDSEQ;
1196 } elsif (@_ == 1) {
98ef7649
KW
1197 my $s;
1198 if (%NAMEDSEQ) {
1199 $s = $NAMEDSEQ{ $_[0] };
1200 }
1201 else {
1202 $s = charnames::string_vianame($_[0]);
1203 }
a2bd7410
JH
1204 return defined $s ? map { ord($_) } split('', $s) : ();
1205 }
1206 } elsif (@_ == 1) {
98ef7649
KW
1207 return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
1208 return charnames::string_vianame($_[0]);
a2bd7410
JH
1209 }
1210 }
1211 return;
1212}
1213
7319f91d
KW
1214my %NUMERIC;
1215
1216sub _numeric {
1217
1218 # Unicode 6.0 instituted the rule that only digits in a consecutive
1219 # block of 10 would be considered decimal digits. Before that, the only
1220 # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
1221 # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
1222 # ONE. The code could be modified to handle that, but not bothering, as
1223 # in TUS 6.0, U+19DA was changed to Nt=Di.
1224 if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
1225 croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
1226 }
98025745
KW
1227 my @numbers = _read_table("unicore/To/Nv.pl");
1228 foreach my $entry (@numbers) {
1229 my ($start, $end, $value) = @$entry;
1230
1231 # If value contains a slash, convert to decimal
1232 if ((my @rational = split /\//, $value) == 2) {
1233 my $real = $rational[0] / $rational[1];
1234 $value = $real;
1235 }
1236
1237 for my $i ($start .. $end) {
1238 $NUMERIC{$i} = $value;
7319f91d 1239 }
7319f91d 1240 }
2dc5eb26
KW
1241
1242 # Decided unsafe to use these that aren't officially part of the Unicode
1243 # standard.
1244 #use Math::Trig;
1245 #my $pi = acos(-1.0);
98025745 1246 #$NUMERIC{0x03C0} = $pi;
7319f91d
KW
1247
1248 # Euler's constant, not to be confused with Euler's number
98025745 1249 #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
7319f91d
KW
1250
1251 # Euler's number
98025745 1252 #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
2dc5eb26 1253
7319f91d
KW
1254 return;
1255}
1256
1257=pod
1258
1259=head2 num
1260
1261C<num> returns the numeric value of the input Unicode string; or C<undef> if it
1262doesn't think the entire string has a completely valid, safe numeric value.
1263
1264If the string is just one character in length, the Unicode numeric value
1265is returned if it has one, or C<undef> otherwise. Note that this need
1266not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
2dc5eb26
KW
1267example returns -0.5.
1268
1269=cut
7319f91d 1270
2dc5eb26
KW
1271#A few characters to which Unicode doesn't officially
1272#assign a numeric value are considered numeric by C<num>.
1273#These are:
1274
1275# EULER CONSTANT 0.5772... (this is NOT Euler's number)
1276# SCRIPT SMALL E 2.71828... (this IS Euler's number)
1277# GREEK SMALL LETTER PI 3.14159...
1278
1279=pod
7319f91d
KW
1280
1281If the string is more than one character, C<undef> is returned unless
1282all its characters are decimal digits (that is they would match C<\d+>),
1283from the same script. For example if you have an ASCII '0' and a Bengali
1284'3', mixed together, they aren't considered a valid number, and C<undef>
1285is returned. A further restriction is that the digits all have to be of
1286the same form. A half-width digit mixed with a full-width one will
1287return C<undef>. The Arabic script has two sets of digits; C<num> will
1288return C<undef> unless all the digits in the string come from the same
1289set.
1290
1291C<num> errs on the side of safety, and there may be valid strings of
1292decimal digits that it doesn't recognize. Note that Unicode defines
1293a number of "digit" characters that aren't "decimal digit" characters.
a278d14b 1294"Decimal digits" have the property that they have a positional value, i.e.,
7319f91d
KW
1295there is a units position, a 10's position, a 100's, etc, AND they are
1296arranged in Unicode in blocks of 10 contiguous code points. The Chinese
1297digits, for example, are not in such a contiguous block, and so Unicode
1298doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1299match them. A single-character string containing one of these digits will
1300have its decimal value returned by C<num>, but any longer string containing
1301only these digits will return C<undef>.
1302
a278d14b
KW
1303Strings of multiple sub- and superscripts are not recognized as numbers. You
1304can use either of the compatibility decompositions in Unicode::Normalize to
7319f91d
KW
1305change these into digits, and then call C<num> on the result.
1306
1307=cut
1308
1309# To handle sub, superscripts, this could if called in list context,
1310# consider those, and return the <decomposition> type in the second
1311# array element.
1312
1313sub num {
1314 my $string = $_[0];
1315
1316 _numeric unless %NUMERIC;
1317
1318 my $length = length($string);
98025745 1319 return $NUMERIC{ord($string)} if $length == 1;
7319f91d
KW
1320 return if $string =~ /\D/;
1321 my $first_ord = ord(substr($string, 0, 1));
98025745 1322 my $value = $NUMERIC{$first_ord};
7319f91d
KW
1323 my $zero_ord = $first_ord - $value;
1324
1325 for my $i (1 .. $length -1) {
1326 my $ord = ord(substr($string, $i, 1));
1327 my $digit = $ord - $zero_ord;
1328 return unless $digit >= 0 && $digit <= 9;
1329 $value = $value * 10 + $digit;
1330 }
1331 return $value;
1332}
1333
1334
1335
55d7b906 1336=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 1337
a452d459
KW
1338This returns the version of the Unicode Character Database, in other words, the
1339version of the Unicode standard the database implements. The version is a
1340string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
1341
1342=cut
1343
1344my $UNICODEVERSION;
1345
1346sub UnicodeVersion {
1347 unless (defined $UNICODEVERSION) {
1348 openunicode(\$VERSIONFH, "version");
1349 chomp($UNICODEVERSION = <$VERSIONFH>);
1350 close($VERSIONFH);
1351 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1352 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1353 }
1354 return $UNICODEVERSION;
1355}
3aa957f9 1356
a452d459
KW
1357=head2 B<Blocks versus Scripts>
1358
1359The difference between a block and a script is that scripts are closer
1360to the linguistic notion of a set of code points required to present
1361languages, while block is more of an artifact of the Unicode code point
1362numbering and separation into blocks of (mostly) 256 code points.
1363
1364For example the Latin B<script> is spread over several B<blocks>, such
1365as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1366C<Latin Extended-B>. On the other hand, the Latin script does not
1367contain all the characters of the C<Basic Latin> block (also known as
1368ASCII): it includes only the letters, and not, for example, the digits
1369or the punctuation.
1370
1371For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1372
1373For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1374
1375=head2 B<Matching Scripts and Blocks>
1376
1377Scripts are matched with the regular-expression construct
1378C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
1379while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
1380any of the 256 code points in the Tibetan block).
1381
1382
3aa957f9 1383=head2 Implementation Note
32c16050 1384
ad9cab37
JH
1385The first use of charinfo() opens a read-only filehandle to the Unicode
1386Character Database (the database is included in the Perl distribution).
78bf21c2
JH
1387The filehandle is then kept open for further queries. In other words,
1388if you are wondering where one of your filehandles went, that's where.
32c16050 1389
8b731da2
JH
1390=head1 BUGS
1391
1392Does not yet support EBCDIC platforms.
1393
561c79ed
JH
1394=head1 AUTHOR
1395
1396Jarkko Hietaniemi
1397
1398=cut
1399
14001;