This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Add %suppressed_properties to UCD.pl
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
36c2430c 5no warnings 'surrogate'; # surrogates can be inputs to this
98ef7649 6use charnames ();
94c91ffc 7use Unicode::Normalize qw(getCombinClass NFD);
561c79ed 8
83fd1222 9our $VERSION = '0.36';
561c79ed 10
741297c1
JH
11use Storable qw(dclone);
12
561c79ed
JH
13require Exporter;
14
15our @ISA = qw(Exporter);
74f8133e 16
10a6ecd2
JH
17our @EXPORT_OK = qw(charinfo
18 charblock charscript
19 charblocks charscripts
b08cd201 20 charinrange
ea508aee 21 general_categories bidi_types
b08cd201 22 compexcl
a2bd7410 23 casefold casespec
7319f91d
KW
24 namedseq
25 num
26 );
561c79ed
JH
27
28use Carp;
29
30=head1 NAME
31
55d7b906 32Unicode::UCD - Unicode character database
561c79ed
JH
33
34=head1 SYNOPSIS
35
55d7b906 36 use Unicode::UCD 'charinfo';
b08cd201 37 my $charinfo = charinfo($codepoint);
561c79ed 38
956cae9a
KW
39 use Unicode::UCD 'casefold';
40 my $casefold = casefold(0xFB00);
41
5d8e6e41
KW
42 use Unicode::UCD 'casespec';
43 my $casespec = casespec(0xFB00);
44
55d7b906 45 use Unicode::UCD 'charblock';
e882dd67
JH
46 my $charblock = charblock($codepoint);
47
55d7b906 48 use Unicode::UCD 'charscript';
65044554 49 my $charscript = charscript($codepoint);
561c79ed 50
55d7b906 51 use Unicode::UCD 'charblocks';
e145285f
JH
52 my $charblocks = charblocks();
53
55d7b906 54 use Unicode::UCD 'charscripts';
ea508aee 55 my $charscripts = charscripts();
e145285f 56
55d7b906 57 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
58 my $range = charscript($script);
59 print "looks like $script\n" if charinrange($range, $codepoint);
60
ea508aee
JH
61 use Unicode::UCD qw(general_categories bidi_types);
62 my $categories = general_categories();
63 my $types = bidi_types();
64
55d7b906 65 use Unicode::UCD 'compexcl';
e145285f
JH
66 my $compexcl = compexcl($codepoint);
67
a2bd7410
JH
68 use Unicode::UCD 'namedseq';
69 my $namedseq = namedseq($named_sequence_name);
70
55d7b906 71 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 72
7319f91d 73 my $convert_to_numeric =
62a8c8c2 74 Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
7319f91d 75
561c79ed
JH
76=head1 DESCRIPTION
77
a452d459
KW
78The Unicode::UCD module offers a series of functions that
79provide a simple interface to the Unicode
8b731da2 80Character Database.
561c79ed 81
a452d459
KW
82=head2 code point argument
83
84Some of the functions are called with a I<code point argument>, which is either
85a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
86followed by hexadecimals designating a Unicode code point. In other words, if
87you want a code point to be interpreted as a hexadecimal number, you must
88prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
f200dd12
KW
89interpreted as a decimal code point. Note that the largest code point in
90Unicode is U+10FFFF.
561c79ed
JH
91=cut
92
10a6ecd2 93my $BLOCKSFH;
10a6ecd2 94my $VERSIONFH;
b08cd201
JH
95my $CASEFOLDFH;
96my $CASESPECFH;
a2bd7410 97my $NAMEDSEQFH;
561c79ed
JH
98
99sub openunicode {
100 my ($rfh, @path) = @_;
101 my $f;
102 unless (defined $$rfh) {
103 for my $d (@INC) {
104 use File::Spec;
55d7b906 105 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 106 last if open($$rfh, $f);
e882dd67 107 undef $f;
561c79ed 108 }
e882dd67
JH
109 croak __PACKAGE__, ": failed to find ",
110 File::Spec->catfile(@path), " in @INC"
111 unless defined $f;
561c79ed
JH
112 }
113 return $f;
114}
115
a452d459 116=head2 B<charinfo()>
561c79ed 117
55d7b906 118 use Unicode::UCD 'charinfo';
561c79ed 119
b08cd201 120 my $charinfo = charinfo(0x41);
561c79ed 121
a452d459
KW
122This returns information about the input L</code point argument>
123as a reference to a hash of fields as defined by the Unicode
124standard. If the L</code point argument> is not assigned in the standard
125(i.e., has the general category C<Cn> meaning C<Unassigned>)
126or is a non-character (meaning it is guaranteed to never be assigned in
127the standard),
a18e976f 128C<undef> is returned.
a452d459
KW
129
130Fields that aren't applicable to the particular code point argument exist in the
131returned hash, and are empty.
132
133The keys in the hash with the meanings of their values are:
134
135=over
136
137=item B<code>
138
139the input L</code point argument> expressed in hexadecimal, with leading zeros
140added if necessary to make it contain at least four hexdigits
141
142=item B<name>
143
144name of I<code>, all IN UPPER CASE.
145Some control-type code points do not have names.
146This field will be empty for C<Surrogate> and C<Private Use> code points,
147and for the others without a name,
148it will contain a description enclosed in angle brackets, like
149C<E<lt>controlE<gt>>.
150
151
152=item B<category>
153
154The short name of the general category of I<code>.
155This will match one of the keys in the hash returned by L</general_categories()>.
156
157=item B<combining>
158
159the combining class number for I<code> used in the Canonical Ordering Algorithm.
160For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
161available at
162L<http://www.unicode.org/versions/Unicode5.1.0/>
163
164=item B<bidi>
165
166bidirectional type of I<code>.
167This will match one of the keys in the hash returned by L</bidi_types()>.
168
169=item B<decomposition>
170
171is empty if I<code> has no decomposition; or is one or more codes
a18e976f 172(separated by spaces) that, taken in order, represent a decomposition for
a452d459
KW
173I<code>. Each has at least four hexdigits.
174The codes may be preceded by a word enclosed in angle brackets then a space,
175like C<E<lt>compatE<gt> >, giving the type of decomposition
176
06bba7d5
KW
177This decomposition may be an intermediate one whose components are also
178decomposable. Use L<Unicode::Normalize> to get the final decomposition.
179
a452d459
KW
180=item B<decimal>
181
182if I<code> is a decimal digit this is its integer numeric value
183
184=item B<digit>
185
89e4a205
KW
186if I<code> represents some other digit-like number, this is its integer
187numeric value
a452d459
KW
188
189=item B<numeric>
190
191if I<code> represents a whole or rational number, this is its numeric value.
192Rational values are expressed as a string like C<1/4>.
193
194=item B<mirrored>
195
196C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
197
198=item B<unicode10>
199
200name of I<code> in the Unicode 1.0 standard if one
201existed for this code point and is different from the current name
202
203=item B<comment>
204
89e4a205 205As of Unicode 6.0, this is always empty.
a452d459
KW
206
207=item B<upper>
208
06bba7d5 209is empty if there is no single code point uppercase mapping for I<code>
4f66642e 210(its uppercase mapping is itself);
a452d459
KW
211otherwise it is that mapping expressed as at least four hexdigits.
212(L</casespec()> should be used in addition to B<charinfo()>
213for case mappings when the calling program can cope with multiple code point
214mappings.)
215
216=item B<lower>
217
06bba7d5 218is empty if there is no single code point lowercase mapping for I<code>
4f66642e 219(its lowercase mapping is itself);
a452d459
KW
220otherwise it is that mapping expressed as at least four hexdigits.
221(L</casespec()> should be used in addition to B<charinfo()>
222for case mappings when the calling program can cope with multiple code point
223mappings.)
224
225=item B<title>
226
06bba7d5 227is empty if there is no single code point titlecase mapping for I<code>
4f66642e 228(its titlecase mapping is itself);
a452d459
KW
229otherwise it is that mapping expressed as at least four hexdigits.
230(L</casespec()> should be used in addition to B<charinfo()>
231for case mappings when the calling program can cope with multiple code point
232mappings.)
233
234=item B<block>
235
a18e976f 236the block I<code> belongs to (used in C<\p{Blk=...}>).
a452d459
KW
237See L</Blocks versus Scripts>.
238
239
240=item B<script>
241
a18e976f 242the script I<code> belongs to.
a452d459
KW
243See L</Blocks versus Scripts>.
244
245=back
32c16050
JH
246
247Note that you cannot do (de)composition and casing based solely on the
a452d459
KW
248I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
249you will need also the L</compexcl()>, and L</casespec()> functions.
561c79ed
JH
250
251=cut
252
e10d7780 253# NB: This function is nearly duplicated in charnames.pm
10a6ecd2
JH
254sub _getcode {
255 my $arg = shift;
256
dc0a4417 257 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 258 return $arg;
dc0a4417 259 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
260 return hex($1);
261 }
262
263 return;
264}
265
05dbc6f8
KW
266# Populated by _num. Converts real number back to input rational
267my %real_to_rational;
268
269# To store the contents of files found on disk.
270my @BIDIS;
271my @CATEGORIES;
272my @DECOMPOSITIONS;
273my @NUMERIC_TYPES;
5c3b35c9
KW
274my %SIMPLE_LOWER;
275my %SIMPLE_TITLE;
276my %SIMPLE_UPPER;
277my %UNICODE_1_NAMES;
05dbc6f8
KW
278
279sub _charinfo_case {
280
281 # Returns the value to set into one of the case fields in the charinfo
282 # structure.
283 # $char is the character,
284 # $cased is the case-changed character
285 # $file is the file in lib/unicore/To/$file that contains the data
286 # needed for this, in the form that _search() understands.
5c3b35c9 287 # $hash_ref points to the hash holding the contents of $file. It will
05dbc6f8
KW
288 # be populated if empty.
289 # By using the 'uc', etc. functions, we avoid loading more files into
290 # memory except for those rare cases where the simple casing (which has
291 # been what charinfo() has always returned, is different than the full
292 # casing.
5c3b35c9 293 my ($char, $cased, $file, $hash_ref) = @_;
05dbc6f8
KW
294
295 return "" if $cased eq $char;
296
297 return sprintf("%04X", ord $cased) if length($cased) == 1;
298
5c3b35c9
KW
299 %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
300 return $hash_ref->{ord $char} // "";
a6fa416b
TS
301}
302
05dbc6f8 303sub charinfo {
a6fa416b 304
05dbc6f8
KW
305 # This function has traditionally mimicked what is in UnicodeData.txt,
306 # warts and all. This is a re-write that avoids UnicodeData.txt so that
307 # it can be removed to save disk space. Instead, this assembles
308 # information gotten by other methods that get data from various other
309 # files. It uses charnames to get the character name; and various
310 # mktables tables.
324f9e44 311
05dbc6f8 312 use feature 'unicode_strings';
a6fa416b 313
10a6ecd2
JH
314 my $arg = shift;
315 my $code = _getcode($arg);
05dbc6f8
KW
316 croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
317
318 # Non-unicode implies undef.
319 return if $code > 0x10FFFF;
320
321 my %prop;
322 my $char = chr($code);
323
324 @CATEGORIES =_read_table("unicore/To/Gc.pl") unless @CATEGORIES;
325 $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
326 // $utf8::SwashInfo{'ToGc'}{'missing'};
327
328 return if $prop{'category'} eq 'Cn'; # Unassigned code points are undef
329
330 $prop{'code'} = sprintf "%04X", $code;
331 $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
332 : (charnames::viacode($code) // "");
333
334 $prop{'combining'} = getCombinClass($code);
335
336 @BIDIS =_read_table("unicore/To/Bc.pl") unless @BIDIS;
337 $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
338 // $utf8::SwashInfo{'ToBc'}{'missing'};
339
340 # For most code points, we can just read in "unicore/Decomposition.pl", as
341 # its contents are exactly what should be output. But that file doesn't
342 # contain the data for the Hangul syllable decompositions, which can be
94c91ffc
KW
343 # algorithmically computed, and NFD() does that, so we call NFD() for
344 # those. We can't use NFD() for everything, as it does a complete
05dbc6f8 345 # recursive decomposition, and what this function has always done is to
94c91ffc
KW
346 # return what's in UnicodeData.txt which doesn't show that recursiveness.
347 # Fortunately, the NFD() of the Hanguls doesn't have any recursion
348 # issues.
349 # Having no decomposition implies an empty field; otherwise, all but
350 # "Canonical" imply a compatible decomposition, and the type is prefixed
351 # to that, as it is in UnicodeData.txt
05dbc6f8
KW
352 if ($char =~ /\p{Block=Hangul_Syllables}/) {
353 # The code points of the decomposition are output in standard Unicode
354 # hex format, separated by blanks.
355 $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
94c91ffc 356 unpack "U*", NFD($char);
a6fa416b 357 }
05dbc6f8
KW
358 else {
359 @DECOMPOSITIONS = _read_table("unicore/Decomposition.pl")
360 unless @DECOMPOSITIONS;
361 $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
362 $code) // "";
561c79ed 363 }
05dbc6f8
KW
364
365 # Can use num() to get the numeric values, if any.
366 if (! defined (my $value = num($char))) {
367 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
368 }
369 else {
370 if ($char =~ /\d/) {
371 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
372 }
373 else {
374
375 # For non-decimal-digits, we have to read in the Numeric type
376 # to distinguish them. It is not just a matter of integer vs.
377 # rational, as some whole number values are not considered digits,
378 # e.g., TAMIL NUMBER TEN.
379 $prop{'decimal'} = "";
380
381 @NUMERIC_TYPES =_read_table("unicore/To/Nt.pl")
382 unless @NUMERIC_TYPES;
383 if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
384 eq 'Digit')
385 {
386 $prop{'digit'} = $prop{'numeric'} = $value;
387 }
388 else {
389 $prop{'digit'} = "";
390 $prop{'numeric'} = $real_to_rational{$value} // $value;
391 }
392 }
393 }
394
395 $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
396
5c3b35c9
KW
397 %UNICODE_1_NAMES =_read_table("unicore/To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
398 $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
05dbc6f8
KW
399
400 # This is true starting in 6.0, but, num() also requires 6.0, so
401 # don't need to test for version again here.
402 $prop{'comment'} = "";
403
5c3b35c9
KW
404 $prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \%SIMPLE_UPPER);
405 $prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \%SIMPLE_LOWER);
05dbc6f8 406 $prop{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
5c3b35c9 407 \%SIMPLE_TITLE);
05dbc6f8
KW
408
409 $prop{block} = charblock($code);
410 $prop{script} = charscript($code);
411 return \%prop;
561c79ed
JH
412}
413
e882dd67
JH
414sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
415 my ($table, $lo, $hi, $code) = @_;
416
417 return if $lo > $hi;
418
419 my $mid = int(($lo+$hi) / 2);
420
421 if ($table->[$mid]->[0] < $code) {
10a6ecd2 422 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
423 return $table->[$mid]->[2];
424 } else {
425 _search($table, $mid + 1, $hi, $code);
426 }
427 } elsif ($table->[$mid]->[0] > $code) {
428 _search($table, $lo, $mid - 1, $code);
429 } else {
430 return $table->[$mid]->[2];
431 }
432}
433
cb366075 434sub _read_table ($;$) {
3a12600d
KW
435
436 # Returns the contents of the mktables generated table file located at $1
cb366075
KW
437 # in the form of either an array of arrays or a hash, depending on if the
438 # optional second parameter is true (for hash return) or not. In the case
439 # of a hash return, each key is a code point, and its corresponding value
440 # is what the table gives as the code point's corresponding value. In the
441 # case of an array return, each outer array denotes a range with [0] the
442 # start point of that range; [1] the end point; and [2] the value that
443 # every code point in the range has. The hash return is useful for fast
444 # lookup when the table contains only single code point ranges. The array
445 # return takes much less memory when there are large ranges.
3a12600d 446 #
cb366075 447 # This function has the side effect of setting
3a12600d
KW
448 # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
449 # table; and
450 # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
451 # not listed in the table.
452 # where $property is the Unicode property name, preceded by 'To' for map
453 # properties., e.g., 'ToSc'.
454 #
455 # Table entries look like one of:
456 # 0000 0040 Common # [65]
457 # 00AA Latin
458
459 my $table = shift;
cb366075
KW
460 my $return_hash = shift;
461 $return_hash = 0 unless defined $return_hash;
3a12600d 462 my @return;
cb366075 463 my %return;
3a12600d
KW
464 local $_;
465
466 for (split /^/m, do $table) {
467 my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
468 \s* ( \# .* )? # Optional comment
469 $ /x;
83fd1222
KW
470 my $decimal_start = hex $start;
471 my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
cb366075 472 if ($return_hash) {
83fd1222 473 foreach my $i ($decimal_start .. $decimal_end) {
cb366075
KW
474 $return{$i} = $value;
475 }
476 }
9a96c106
KW
477 elsif (@return &&
478 $return[-1][1] == $decimal_start - 1
479 && $return[-1][2] eq $value)
480 {
481 # If this is merely extending the previous range, do just that.
482 $return[-1]->[1] = $decimal_end;
483 }
cb366075 484 else {
83fd1222 485 push @return, [ $decimal_start, $decimal_end, $value ];
cb366075 486 }
3a12600d 487 }
cb366075 488 return ($return_hash) ? %return : @return;
3a12600d
KW
489}
490
10a6ecd2
JH
491sub charinrange {
492 my ($range, $arg) = @_;
493 my $code = _getcode($arg);
494 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
495 unless defined $code;
496 _search($range, 0, $#$range, $code);
497}
498
a452d459 499=head2 B<charblock()>
561c79ed 500
55d7b906 501 use Unicode::UCD 'charblock';
561c79ed
JH
502
503 my $charblock = charblock(0x41);
10a6ecd2 504 my $charblock = charblock(1234);
a452d459 505 my $charblock = charblock(0x263a);
10a6ecd2
JH
506 my $charblock = charblock("U+263a");
507
78bf21c2 508 my $range = charblock('Armenian');
10a6ecd2 509
a452d459
KW
510With a L</code point argument> charblock() returns the I<block> the code point
511belongs to, e.g. C<Basic Latin>.
512If the code point is unassigned, this returns the block it would belong to if
a18e976f 513it were assigned.
10a6ecd2 514
78bf21c2
JH
515See also L</Blocks versus Scripts>.
516
18972f4b 517If supplied with an argument that can't be a code point, charblock() tries to
a18e976f
KW
518do the opposite and interpret the argument as a block name. The return value
519is a I<range set> with one range: an anonymous list with a single element that
520consists of another anonymous list whose first element is the first code point
521in the block, and whose second (and final) element is the final code point in
522the block. (The extra list consisting of just one element is so that the same
523program logic can be used to handle both this return, and the return from
524L</charscript()> which can have multiple ranges.) You can test whether a code
525point is in a range using the L</charinrange()> function. If the argument is
526not a known block, C<undef> is returned.
561c79ed 527
561c79ed
JH
528=cut
529
530my @BLOCKS;
10a6ecd2 531my %BLOCKS;
561c79ed 532
10a6ecd2 533sub _charblocks {
06bba7d5
KW
534
535 # Can't read from the mktables table because it loses the hyphens in the
536 # original.
561c79ed 537 unless (@BLOCKS) {
10a6ecd2 538 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 539 local $_;
10a6ecd2 540 while (<$BLOCKSFH>) {
2796c109 541 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
542 my ($lo, $hi) = (hex($1), hex($2));
543 my $subrange = [ $lo, $hi, $3 ];
544 push @BLOCKS, $subrange;
545 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
546 }
547 }
10a6ecd2 548 close($BLOCKSFH);
561c79ed
JH
549 }
550 }
10a6ecd2
JH
551}
552
553sub charblock {
554 my $arg = shift;
555
556 _charblocks() unless @BLOCKS;
557
558 my $code = _getcode($arg);
561c79ed 559
10a6ecd2 560 if (defined $code) {
c707cf8e
KW
561 my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
562 return $result if defined $result;
563 return 'No_Block';
564 }
565 elsif (exists $BLOCKS{$arg}) {
566 return dclone $BLOCKS{$arg};
10a6ecd2 567 }
e882dd67
JH
568}
569
a452d459 570=head2 B<charscript()>
e882dd67 571
55d7b906 572 use Unicode::UCD 'charscript';
e882dd67
JH
573
574 my $charscript = charscript(0x41);
10a6ecd2
JH
575 my $charscript = charscript(1234);
576 my $charscript = charscript("U+263a");
e882dd67 577
78bf21c2 578 my $range = charscript('Thai');
10a6ecd2 579
a452d459
KW
580With a L</code point argument> charscript() returns the I<script> the
581code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
bb2d29dc 582If the code point is unassigned, it returns C<"Unknown">.
78bf21c2 583
eb0cc9e3 584If supplied with an argument that can't be a code point, charscript() tries
a18e976f
KW
585to do the opposite and interpret the argument as a script name. The
586return value is a I<range set>: an anonymous list of lists that contain
eb0cc9e3 587I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a18e976f
KW
588code point is in a range set using the L</charinrange()> function. If the
589argument is not a known script, C<undef> is returned.
a452d459
KW
590
591See also L</Blocks versus Scripts>.
e882dd67 592
e882dd67
JH
593=cut
594
595my @SCRIPTS;
10a6ecd2 596my %SCRIPTS;
e882dd67 597
10a6ecd2 598sub _charscripts {
7bccef0b
KW
599 @SCRIPTS =_read_table("unicore/To/Sc.pl") unless @SCRIPTS;
600 foreach my $entry (@SCRIPTS) {
f3d50ac9 601 $entry->[2] =~ s/(_\w)/\L$1/g; # Preserve old-style casing
7bccef0b 602 push @{$SCRIPTS{$entry->[2]}}, $entry;
e882dd67 603 }
10a6ecd2
JH
604}
605
606sub charscript {
607 my $arg = shift;
608
609 _charscripts() unless @SCRIPTS;
e882dd67 610
10a6ecd2
JH
611 my $code = _getcode($arg);
612
613 if (defined $code) {
7bccef0b
KW
614 my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
615 return $result if defined $result;
8079ad82 616 return $utf8::SwashInfo{'ToSc'}{'missing'};
7bccef0b
KW
617 } elsif (exists $SCRIPTS{$arg}) {
618 return dclone $SCRIPTS{$arg};
10a6ecd2 619 }
7bccef0b
KW
620
621 return;
10a6ecd2
JH
622}
623
a452d459 624=head2 B<charblocks()>
10a6ecd2 625
55d7b906 626 use Unicode::UCD 'charblocks';
10a6ecd2 627
b08cd201 628 my $charblocks = charblocks();
10a6ecd2 629
b08cd201 630charblocks() returns a reference to a hash with the known block names
a452d459 631as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 632
78bf21c2
JH
633See also L</Blocks versus Scripts>.
634
10a6ecd2
JH
635=cut
636
637sub charblocks {
b08cd201 638 _charblocks() unless %BLOCKS;
741297c1 639 return dclone \%BLOCKS;
10a6ecd2
JH
640}
641
a452d459 642=head2 B<charscripts()>
10a6ecd2 643
55d7b906 644 use Unicode::UCD 'charscripts';
10a6ecd2 645
ea508aee 646 my $charscripts = charscripts();
10a6ecd2 647
ea508aee 648charscripts() returns a reference to a hash with the known script
a452d459 649names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 650the values.
10a6ecd2 651
78bf21c2
JH
652See also L</Blocks versus Scripts>.
653
10a6ecd2
JH
654=cut
655
656sub charscripts {
b08cd201 657 _charscripts() unless %SCRIPTS;
741297c1 658 return dclone \%SCRIPTS;
561c79ed
JH
659}
660
a452d459 661=head2 B<charinrange()>
10a6ecd2 662
f200dd12 663In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
10a6ecd2 664can also test whether a code point is in the I<range> as returned by
a452d459
KW
665L</charblock()> and L</charscript()> or as the values of the hash returned
666by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 667
55d7b906 668 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
669
670 $range = charscript('Hiragana');
e145285f 671 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
672
673=cut
674
ea508aee
JH
675my %GENERAL_CATEGORIES =
676 (
677 'L' => 'Letter',
678 'LC' => 'CasedLetter',
679 'Lu' => 'UppercaseLetter',
680 'Ll' => 'LowercaseLetter',
681 'Lt' => 'TitlecaseLetter',
682 'Lm' => 'ModifierLetter',
683 'Lo' => 'OtherLetter',
684 'M' => 'Mark',
685 'Mn' => 'NonspacingMark',
686 'Mc' => 'SpacingMark',
687 'Me' => 'EnclosingMark',
688 'N' => 'Number',
689 'Nd' => 'DecimalNumber',
690 'Nl' => 'LetterNumber',
691 'No' => 'OtherNumber',
692 'P' => 'Punctuation',
693 'Pc' => 'ConnectorPunctuation',
694 'Pd' => 'DashPunctuation',
695 'Ps' => 'OpenPunctuation',
696 'Pe' => 'ClosePunctuation',
697 'Pi' => 'InitialPunctuation',
698 'Pf' => 'FinalPunctuation',
699 'Po' => 'OtherPunctuation',
700 'S' => 'Symbol',
701 'Sm' => 'MathSymbol',
702 'Sc' => 'CurrencySymbol',
703 'Sk' => 'ModifierSymbol',
704 'So' => 'OtherSymbol',
705 'Z' => 'Separator',
706 'Zs' => 'SpaceSeparator',
707 'Zl' => 'LineSeparator',
708 'Zp' => 'ParagraphSeparator',
709 'C' => 'Other',
710 'Cc' => 'Control',
711 'Cf' => 'Format',
712 'Cs' => 'Surrogate',
713 'Co' => 'PrivateUse',
714 'Cn' => 'Unassigned',
715 );
716
717sub general_categories {
718 return dclone \%GENERAL_CATEGORIES;
719}
720
a452d459 721=head2 B<general_categories()>
ea508aee
JH
722
723 use Unicode::UCD 'general_categories';
724
725 my $categories = general_categories();
726
a452d459 727This returns a reference to a hash which has short
ea508aee
JH
728general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
729names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
730C<Symbol>) as values. The hash is reversible in case you need to go
731from the long names to the short names. The general category is the
a452d459
KW
732one returned from
733L</charinfo()> under the C<category> key.
ea508aee
JH
734
735=cut
736
737my %BIDI_TYPES =
738 (
739 'L' => 'Left-to-Right',
740 'LRE' => 'Left-to-Right Embedding',
741 'LRO' => 'Left-to-Right Override',
742 'R' => 'Right-to-Left',
743 'AL' => 'Right-to-Left Arabic',
744 'RLE' => 'Right-to-Left Embedding',
745 'RLO' => 'Right-to-Left Override',
746 'PDF' => 'Pop Directional Format',
747 'EN' => 'European Number',
748 'ES' => 'European Number Separator',
749 'ET' => 'European Number Terminator',
750 'AN' => 'Arabic Number',
751 'CS' => 'Common Number Separator',
752 'NSM' => 'Non-Spacing Mark',
753 'BN' => 'Boundary Neutral',
754 'B' => 'Paragraph Separator',
755 'S' => 'Segment Separator',
756 'WS' => 'Whitespace',
757 'ON' => 'Other Neutrals',
758 );
759
a452d459 760=head2 B<bidi_types()>
ea508aee
JH
761
762 use Unicode::UCD 'bidi_types';
763
764 my $categories = bidi_types();
765
a452d459 766This returns a reference to a hash which has the short
ea508aee
JH
767bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
768names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
769hash is reversible in case you need to go from the long names to the
a452d459
KW
770short names. The bidi type is the one returned from
771L</charinfo()>
ea508aee
JH
772under the C<bidi> key. For the exact meaning of the various bidi classes
773the Unicode TR9 is recommended reading:
a452d459 774L<http://www.unicode.org/reports/tr9/>
ea508aee
JH
775(as of Unicode 5.0.0)
776
777=cut
778
a452d459
KW
779sub bidi_types {
780 return dclone \%BIDI_TYPES;
781}
782
783=head2 B<compexcl()>
b08cd201 784
55d7b906 785 use Unicode::UCD 'compexcl';
b08cd201 786
a452d459 787 my $compexcl = compexcl(0x09dc);
b08cd201 788
71a442a8
KW
789This routine is included for backwards compatibility, but as of Perl 5.12, for
790most purposes it is probably more convenient to use one of the following
791instead:
792
793 my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
794 my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
795
796or even
797
798 my $compexcl = chr(0x09dc) =~ /\p{CE};
799 my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
800
801The first two forms return B<true> if the L</code point argument> should not
76b05678
KW
802be produced by composition normalization. For the final two forms to return
803B<true>, it is additionally required that this fact not otherwise be
804determinable from the Unicode data base.
71a442a8
KW
805
806This routine behaves identically to the final two forms. That is,
807it does not return B<true> if the code point has a decomposition
a452d459
KW
808consisting of another single code point, nor if its decomposition starts
809with a code point whose combining class is non-zero. Code points that meet
810either of these conditions should also not be produced by composition
71a442a8
KW
811normalization, which is probably why you should use the
812C<Full_Composition_Exclusion> property instead, as shown above.
b08cd201 813
71a442a8 814The routine returns B<false> otherwise.
b08cd201
JH
815
816=cut
817
b08cd201
JH
818sub compexcl {
819 my $arg = shift;
820 my $code = _getcode($arg);
74f8133e
JH
821 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
822 unless defined $code;
b08cd201 823
36c2430c 824 no warnings "non_unicode"; # So works on non-Unicode code points
71a442a8 825 return chr($code) =~ /\p{Composition_Exclusion}/;
b08cd201
JH
826}
827
a452d459 828=head2 B<casefold()>
b08cd201 829
55d7b906 830 use Unicode::UCD 'casefold';
b08cd201 831
a452d459
KW
832 my $casefold = casefold(0xDF);
833 if (defined $casefold) {
834 my @full_fold_hex = split / /, $casefold->{'full'};
835 my $full_fold_string =
836 join "", map {chr(hex($_))} @full_fold_hex;
837 my @turkic_fold_hex =
838 split / /, ($casefold->{'turkic'} ne "")
839 ? $casefold->{'turkic'}
840 : $casefold->{'full'};
841 my $turkic_fold_string =
842 join "", map {chr(hex($_))} @turkic_fold_hex;
843 }
844 if (defined $casefold && $casefold->{'simple'} ne "") {
845 my $simple_fold_hex = $casefold->{'simple'};
846 my $simple_fold_string = chr(hex($simple_fold_hex));
847 }
b08cd201 848
a452d459
KW
849This returns the (almost) locale-independent case folding of the
850character specified by the L</code point argument>.
b08cd201 851
a18e976f 852If there is no case folding for that code point, C<undef> is returned.
a452d459
KW
853
854If there is a case folding for that code point, a reference to a hash
b08cd201
JH
855with the following fields is returned:
856
a452d459
KW
857=over
858
859=item B<code>
860
861the input L</code point argument> expressed in hexadecimal, with leading zeros
862added if necessary to make it contain at least four hexdigits
863
864=item B<full>
865
a18e976f 866one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
867code points for the case folding for I<code>.
868Each has at least four hexdigits.
869
870=item B<simple>
871
872is empty, or is exactly one code with at least four hexdigits which can be used
873as an alternative case folding when the calling program cannot cope with the
874fold being a sequence of multiple code points. If I<full> is just one code
875point, then I<simple> equals I<full>. If there is no single code point folding
876defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
877inferior, but still better-than-nothing alternative folding to I<full>.
878
879=item B<mapping>
880
881is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
882otherwise. It can be considered to be the simplest possible folding for
883I<code>. It is defined primarily for backwards compatibility.
884
885=item B<status>
b08cd201 886
a452d459
KW
887is C<C> (for C<common>) if the best possible fold is a single code point
888(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
889folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
a18e976f
KW
890there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
891Note that this
a452d459
KW
892describes the contents of I<mapping>. It is defined primarily for backwards
893compatibility.
b08cd201 894
a452d459
KW
895On versions 3.1 and earlier of Unicode, I<status> can also be
896C<I> which is the same as C<C> but is a special case for dotted uppercase I and
897dotless lowercase i:
b08cd201 898
a452d459 899=over
b08cd201 900
a18e976f 901=item B<*> If you use this C<I> mapping
a452d459 902
a18e976f 903the result is case-insensitive,
a452d459
KW
904but dotless and dotted I's are not distinguished
905
a18e976f 906=item B<*> If you exclude this C<I> mapping
a452d459 907
a18e976f 908the result is not fully case-insensitive, but
a452d459
KW
909dotless and dotted I's are distinguished
910
911=back
912
913=item B<turkic>
914
915contains any special folding for Turkic languages. For versions of Unicode
916starting with 3.2, this field is empty unless I<code> has a different folding
917in Turkic languages, in which case it is one or more codes (separated by
a18e976f 918spaces) that, taken in order, give the code points for the case folding for
a452d459
KW
919I<code> in those languages.
920Each code has at least four hexdigits.
921Note that this folding does not maintain canonical equivalence without
922additional processing.
923
924For versions of Unicode 3.1 and earlier, this field is empty unless there is a
925special folding for Turkic languages, in which case I<status> is C<I>, and
926I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
927
928=back
929
930Programs that want complete generality and the best folding results should use
931the folding contained in the I<full> field. But note that the fold for some
932code points will be a sequence of multiple code points.
933
934Programs that can't cope with the fold mapping being multiple code points can
935use the folding contained in the I<simple> field, with the loss of some
936generality. In Unicode 5.1, about 7% of the defined foldings have no single
937code point folding.
938
939The I<mapping> and I<status> fields are provided for backwards compatibility for
940existing programs. They contain the same values as in previous versions of
941this function.
942
943Locale is not completely independent. The I<turkic> field contains results to
944use when the locale is a Turkic language.
b08cd201
JH
945
946For more information about case mappings see
a452d459 947L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
948
949=cut
950
951my %CASEFOLD;
952
953sub _casefold {
954 unless (%CASEFOLD) {
551b6b6f 955 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 956 local $_;
b08cd201 957 while (<$CASEFOLDFH>) {
a452d459 958 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
b08cd201 959 my $code = hex($1);
a452d459
KW
960 $CASEFOLD{$code}{'code'} = $1;
961 $CASEFOLD{$code}{'turkic'} = "" unless
962 defined $CASEFOLD{$code}{'turkic'};
963 if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and
964 # earlier Unicodes
965 # Both entries there (I
966 # only checked 3.1) are
967 # the same as C, and
968 # there are no other
969 # entries for those
970 # codepoints, so treat
971 # as if C, but override
972 # the turkic one for
973 # 'I'.
974 $CASEFOLD{$code}{'status'} = $2;
975 $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
976 $CASEFOLD{$code}{'mapping'} = $3;
977 $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
978 } elsif ($2 eq 'F') {
979 $CASEFOLD{$code}{'full'} = $3;
980 unless (defined $CASEFOLD{$code}{'simple'}) {
981 $CASEFOLD{$code}{'simple'} = "";
982 $CASEFOLD{$code}{'mapping'} = $3;
983 $CASEFOLD{$code}{'status'} = $2;
984 }
985 } elsif ($2 eq 'S') {
986
987
988 # There can't be a simple without a full, and simple
989 # overrides all but full
990
991 $CASEFOLD{$code}{'simple'} = $3;
992 $CASEFOLD{$code}{'mapping'} = $3;
993 $CASEFOLD{$code}{'status'} = $2;
994 } elsif ($2 eq 'T') {
995 $CASEFOLD{$code}{'turkic'} = $3;
996 } # else can't happen because only [CIFST] are possible
b08cd201
JH
997 }
998 }
999 close($CASEFOLDFH);
1000 }
1001 }
1002}
1003
1004sub casefold {
1005 my $arg = shift;
1006 my $code = _getcode($arg);
74f8133e
JH
1007 croak __PACKAGE__, "::casefold: unknown code '$arg'"
1008 unless defined $code;
b08cd201
JH
1009
1010 _casefold() unless %CASEFOLD;
1011
1012 return $CASEFOLD{$code};
1013}
1014
a452d459 1015=head2 B<casespec()>
b08cd201 1016
55d7b906 1017 use Unicode::UCD 'casespec';
b08cd201 1018
a452d459 1019 my $casespec = casespec(0xFB00);
b08cd201 1020
a452d459
KW
1021This returns the potentially locale-dependent case mappings of the L</code point
1022argument>. The mappings may be longer than a single code point (which the basic
1023Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 1024
a452d459
KW
1025If there are no case mappings for the L</code point argument>, or if all three
1026possible mappings (I<lower>, I<title> and I<upper>) result in single code
a18e976f 1027points and are locale independent and unconditional, C<undef> is returned
5d8e6e41
KW
1028(which means that the case mappings, if any, for the code point are those
1029returned by L</charinfo()>).
a452d459
KW
1030
1031Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41
KW
1032of such hashes, explained below) is returned with the following keys and their
1033meanings:
a452d459
KW
1034
1035The keys in the bottom layer hash with the meanings of their values are:
1036
1037=over
1038
1039=item B<code>
1040
1041the input L</code point argument> expressed in hexadecimal, with leading zeros
1042added if necessary to make it contain at least four hexdigits
1043
1044=item B<lower>
1045
a18e976f 1046one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1047code points for the lower case of I<code>.
1048Each has at least four hexdigits.
1049
1050=item B<title>
b08cd201 1051
a18e976f 1052one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1053code points for the title case of I<code>.
1054Each has at least four hexdigits.
b08cd201 1055
d2da20e3 1056=item B<upper>
b08cd201 1057
a18e976f 1058one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1059code points for the upper case of I<code>.
1060Each has at least four hexdigits.
1061
1062=item B<condition>
1063
1064the conditions for the mappings to be valid.
a18e976f 1065If C<undef>, the mappings are always valid.
a452d459
KW
1066When defined, this field is a list of conditions,
1067all of which must be true for the mappings to be valid.
1068The list consists of one or more
1069I<locales> (see below)
1070and/or I<contexts> (explained in the next paragraph),
1071separated by spaces.
1072(Other than as used to separate elements, spaces are to be ignored.)
1073Case distinctions in the condition list are not significant.
82c0b05b 1074Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 1075
a452d459
KW
1076A I<context> is one of those defined in the Unicode standard.
1077For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1078available at
5d8e6e41
KW
1079L<http://www.unicode.org/versions/Unicode5.1.0/>.
1080These are for context-sensitive casing.
f499c386 1081
a452d459
KW
1082=back
1083
5d8e6e41 1084The hash described above is returned for locale-independent casing, where
a18e976f 1085at least one of the mappings has length longer than one. If C<undef> is
5d8e6e41
KW
1086returned, the code point may have mappings, but if so, all are length one,
1087and are returned by L</charinfo()>.
1088Note that when this function does return a value, it will be for the complete
1089set of mappings for a code point, even those whose length is one.
1090
1091If there are additional casing rules that apply only in certain locales,
1092an additional key for each will be defined in the returned hash. Each such key
1093will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1094followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1095and a variant code). You can find the lists of all possible locales, see
1096L<Locale::Country> and L<Locale::Language>.
89e4a205 1097(In Unicode 6.0, the only locales returned by this function
a452d459 1098are C<lt>, C<tr>, and C<az>.)
b08cd201 1099
5d8e6e41
KW
1100Each locale key is a reference to a hash that has the form above, and gives
1101the casing rules for that particular locale, which take precedence over the
1102locale-independent ones when in that locale.
1103
1104If the only casing for a code point is locale-dependent, then the returned
1105hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1106will contain only locale keys.
1107
b08cd201 1108For more information about case mappings see
a452d459 1109L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1110
1111=cut
1112
1113my %CASESPEC;
1114
1115sub _casespec {
1116 unless (%CASESPEC) {
551b6b6f 1117 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 1118 local $_;
b08cd201
JH
1119 while (<$CASESPECFH>) {
1120 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
1121 my ($hexcode, $lower, $title, $upper, $condition) =
1122 ($1, $2, $3, $4, $5);
1123 my $code = hex($hexcode);
1124 if (exists $CASESPEC{$code}) {
1125 if (exists $CASESPEC{$code}->{code}) {
1126 my ($oldlower,
1127 $oldtitle,
1128 $oldupper,
1129 $oldcondition) =
1130 @{$CASESPEC{$code}}{qw(lower
1131 title
1132 upper
1133 condition)};
822ebcc8
JH
1134 if (defined $oldcondition) {
1135 my ($oldlocale) =
f499c386 1136 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
1137 delete $CASESPEC{$code};
1138 $CASESPEC{$code}->{$oldlocale} =
1139 { code => $hexcode,
1140 lower => $oldlower,
1141 title => $oldtitle,
1142 upper => $oldupper,
1143 condition => $oldcondition };
f499c386
JH
1144 }
1145 }
1146 my ($locale) =
1147 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1148 $CASESPEC{$code}->{$locale} =
1149 { code => $hexcode,
1150 lower => $lower,
1151 title => $title,
1152 upper => $upper,
1153 condition => $condition };
1154 } else {
1155 $CASESPEC{$code} =
1156 { code => $hexcode,
1157 lower => $lower,
1158 title => $title,
1159 upper => $upper,
1160 condition => $condition };
1161 }
b08cd201
JH
1162 }
1163 }
1164 close($CASESPECFH);
1165 }
1166 }
1167}
1168
1169sub casespec {
1170 my $arg = shift;
1171 my $code = _getcode($arg);
74f8133e
JH
1172 croak __PACKAGE__, "::casespec: unknown code '$arg'"
1173 unless defined $code;
b08cd201
JH
1174
1175 _casespec() unless %CASESPEC;
1176
741297c1 1177 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
1178}
1179
a452d459 1180=head2 B<namedseq()>
a2bd7410
JH
1181
1182 use Unicode::UCD 'namedseq';
1183
1184 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1185 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1186 my %namedseq = namedseq();
1187
1188If used with a single argument in a scalar context, returns the string
a18e976f 1189consisting of the code points of the named sequence, or C<undef> if no
a2bd7410 1190named sequence by that name exists. If used with a single argument in
956cae9a
KW
1191a list context, it returns the list of the ordinals of the code points. If used
1192with no
a2bd7410
JH
1193arguments in a list context, returns a hash with the names of the
1194named sequences as the keys and the named sequences as strings as
a18e976f 1195the values. Otherwise, it returns C<undef> or an empty list depending
a2bd7410
JH
1196on the context.
1197
a452d459
KW
1198This function only operates on officially approved (not provisional) named
1199sequences.
a2bd7410 1200
27f853a0
KW
1201Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1202sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1203LETTER AINU P")> will return the same string this function does, but will also
1204operate on character names that aren't named sequences, without you having to
1205know which are which. See L<charnames>.
1206
a2bd7410
JH
1207=cut
1208
1209my %NAMEDSEQ;
1210
1211sub _namedseq {
1212 unless (%NAMEDSEQ) {
98ef7649 1213 if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
a2bd7410
JH
1214 local $_;
1215 while (<$NAMEDSEQFH>) {
98ef7649
KW
1216 if (/^ [0-9A-F]+ \ /x) {
1217 chomp;
1218 my ($sequence, $name) = split /\t/;
1219 my @s = map { chr(hex($_)) } split(' ', $sequence);
1220 $NAMEDSEQ{$name} = join("", @s);
a2bd7410
JH
1221 }
1222 }
1223 close($NAMEDSEQFH);
1224 }
1225 }
1226}
1227
1228sub namedseq {
98ef7649
KW
1229
1230 # Use charnames::string_vianame() which now returns this information,
1231 # unless the caller wants the hash returned, in which case we read it in,
1232 # and thereafter use it instead of calling charnames, as it is faster.
1233
a2bd7410
JH
1234 my $wantarray = wantarray();
1235 if (defined $wantarray) {
1236 if ($wantarray) {
1237 if (@_ == 0) {
98ef7649 1238 _namedseq() unless %NAMEDSEQ;
a2bd7410
JH
1239 return %NAMEDSEQ;
1240 } elsif (@_ == 1) {
98ef7649
KW
1241 my $s;
1242 if (%NAMEDSEQ) {
1243 $s = $NAMEDSEQ{ $_[0] };
1244 }
1245 else {
1246 $s = charnames::string_vianame($_[0]);
1247 }
a2bd7410
JH
1248 return defined $s ? map { ord($_) } split('', $s) : ();
1249 }
1250 } elsif (@_ == 1) {
98ef7649
KW
1251 return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
1252 return charnames::string_vianame($_[0]);
a2bd7410
JH
1253 }
1254 }
1255 return;
1256}
1257
7319f91d
KW
1258my %NUMERIC;
1259
1260sub _numeric {
1261
1262 # Unicode 6.0 instituted the rule that only digits in a consecutive
1263 # block of 10 would be considered decimal digits. Before that, the only
1264 # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
1265 # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
1266 # ONE. The code could be modified to handle that, but not bothering, as
1267 # in TUS 6.0, U+19DA was changed to Nt=Di.
1268 if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
1269 croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
1270 }
98025745
KW
1271 my @numbers = _read_table("unicore/To/Nv.pl");
1272 foreach my $entry (@numbers) {
1273 my ($start, $end, $value) = @$entry;
1274
05dbc6f8
KW
1275 # If value contains a slash, convert to decimal, add a reverse hash
1276 # used by charinfo.
98025745
KW
1277 if ((my @rational = split /\//, $value) == 2) {
1278 my $real = $rational[0] / $rational[1];
05dbc6f8 1279 $real_to_rational{$real} = $value;
98025745
KW
1280 $value = $real;
1281 }
1282
1283 for my $i ($start .. $end) {
1284 $NUMERIC{$i} = $value;
7319f91d 1285 }
7319f91d 1286 }
2dc5eb26
KW
1287
1288 # Decided unsafe to use these that aren't officially part of the Unicode
1289 # standard.
1290 #use Math::Trig;
1291 #my $pi = acos(-1.0);
98025745 1292 #$NUMERIC{0x03C0} = $pi;
7319f91d
KW
1293
1294 # Euler's constant, not to be confused with Euler's number
98025745 1295 #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
7319f91d
KW
1296
1297 # Euler's number
98025745 1298 #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
2dc5eb26 1299
7319f91d
KW
1300 return;
1301}
1302
1303=pod
1304
67592e11 1305=head2 B<num()>
7319f91d 1306
eefd7bc2
KW
1307 use Unicode::UCD 'num';
1308
1309 my $val = num("123");
1310 my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
1311
7319f91d
KW
1312C<num> returns the numeric value of the input Unicode string; or C<undef> if it
1313doesn't think the entire string has a completely valid, safe numeric value.
1314
1315If the string is just one character in length, the Unicode numeric value
1316is returned if it has one, or C<undef> otherwise. Note that this need
1317not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
2dc5eb26
KW
1318example returns -0.5.
1319
1320=cut
7319f91d 1321
2dc5eb26
KW
1322#A few characters to which Unicode doesn't officially
1323#assign a numeric value are considered numeric by C<num>.
1324#These are:
1325
1326# EULER CONSTANT 0.5772... (this is NOT Euler's number)
1327# SCRIPT SMALL E 2.71828... (this IS Euler's number)
1328# GREEK SMALL LETTER PI 3.14159...
1329
1330=pod
7319f91d
KW
1331
1332If the string is more than one character, C<undef> is returned unless
8bb4c8e2 1333all its characters are decimal digits (that is, they would match C<\d+>),
7319f91d
KW
1334from the same script. For example if you have an ASCII '0' and a Bengali
1335'3', mixed together, they aren't considered a valid number, and C<undef>
1336is returned. A further restriction is that the digits all have to be of
1337the same form. A half-width digit mixed with a full-width one will
1338return C<undef>. The Arabic script has two sets of digits; C<num> will
1339return C<undef> unless all the digits in the string come from the same
1340set.
1341
1342C<num> errs on the side of safety, and there may be valid strings of
1343decimal digits that it doesn't recognize. Note that Unicode defines
1344a number of "digit" characters that aren't "decimal digit" characters.
a278d14b 1345"Decimal digits" have the property that they have a positional value, i.e.,
7319f91d
KW
1346there is a units position, a 10's position, a 100's, etc, AND they are
1347arranged in Unicode in blocks of 10 contiguous code points. The Chinese
1348digits, for example, are not in such a contiguous block, and so Unicode
1349doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1350match them. A single-character string containing one of these digits will
1351have its decimal value returned by C<num>, but any longer string containing
1352only these digits will return C<undef>.
1353
a278d14b
KW
1354Strings of multiple sub- and superscripts are not recognized as numbers. You
1355can use either of the compatibility decompositions in Unicode::Normalize to
7319f91d
KW
1356change these into digits, and then call C<num> on the result.
1357
1358=cut
1359
1360# To handle sub, superscripts, this could if called in list context,
1361# consider those, and return the <decomposition> type in the second
1362# array element.
1363
1364sub num {
1365 my $string = $_[0];
1366
1367 _numeric unless %NUMERIC;
1368
1369 my $length = length($string);
98025745 1370 return $NUMERIC{ord($string)} if $length == 1;
7319f91d
KW
1371 return if $string =~ /\D/;
1372 my $first_ord = ord(substr($string, 0, 1));
98025745 1373 my $value = $NUMERIC{$first_ord};
7319f91d
KW
1374 my $zero_ord = $first_ord - $value;
1375
1376 for my $i (1 .. $length -1) {
1377 my $ord = ord(substr($string, $i, 1));
1378 my $digit = $ord - $zero_ord;
1379 return unless $digit >= 0 && $digit <= 9;
1380 $value = $value * 10 + $digit;
1381 }
1382 return $value;
1383}
1384
1385
1386
55d7b906 1387=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 1388
a452d459
KW
1389This returns the version of the Unicode Character Database, in other words, the
1390version of the Unicode standard the database implements. The version is a
1391string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
1392
1393=cut
1394
1395my $UNICODEVERSION;
1396
1397sub UnicodeVersion {
1398 unless (defined $UNICODEVERSION) {
1399 openunicode(\$VERSIONFH, "version");
1400 chomp($UNICODEVERSION = <$VERSIONFH>);
1401 close($VERSIONFH);
1402 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
1403 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
1404 }
1405 return $UNICODEVERSION;
1406}
3aa957f9 1407
a452d459
KW
1408=head2 B<Blocks versus Scripts>
1409
1410The difference between a block and a script is that scripts are closer
1411to the linguistic notion of a set of code points required to present
1412languages, while block is more of an artifact of the Unicode code point
1413numbering and separation into blocks of (mostly) 256 code points.
1414
1415For example the Latin B<script> is spread over several B<blocks>, such
1416as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1417C<Latin Extended-B>. On the other hand, the Latin script does not
1418contain all the characters of the C<Basic Latin> block (also known as
1419ASCII): it includes only the letters, and not, for example, the digits
1420or the punctuation.
1421
1422For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1423
1424For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1425
1426=head2 B<Matching Scripts and Blocks>
1427
1428Scripts are matched with the regular-expression construct
1429C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
f200dd12 1430while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
a452d459
KW
1431any of the 256 code points in the Tibetan block).
1432
8b731da2
JH
1433=head1 BUGS
1434
1435Does not yet support EBCDIC platforms.
1436
561c79ed
JH
1437=head1 AUTHOR
1438
a18e976f 1439Jarkko Hietaniemi. Now maintained by perl5 porters.
561c79ed
JH
1440
1441=cut
1442
14431;