This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Generate file for NameAlias property
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
36c2430c 5no warnings 'surrogate'; # surrogates can be inputs to this
98ef7649 6use charnames ();
94c91ffc 7use Unicode::Normalize qw(getCombinClass NFD);
561c79ed 8
7ef25837 9our $VERSION = '0.37';
561c79ed 10
741297c1
JH
11use Storable qw(dclone);
12
561c79ed
JH
13require Exporter;
14
15our @ISA = qw(Exporter);
74f8133e 16
10a6ecd2
JH
17our @EXPORT_OK = qw(charinfo
18 charblock charscript
19 charblocks charscripts
b08cd201 20 charinrange
ea508aee 21 general_categories bidi_types
b08cd201 22 compexcl
a2bd7410 23 casefold casespec
7319f91d
KW
24 namedseq
25 num
7ef25837
KW
26 prop_aliases
27 prop_value_aliases
681d705c
KW
28 prop_invlist
29 MAX_CP
7319f91d 30 );
561c79ed
JH
31
32use Carp;
33
34=head1 NAME
35
55d7b906 36Unicode::UCD - Unicode character database
561c79ed
JH
37
38=head1 SYNOPSIS
39
55d7b906 40 use Unicode::UCD 'charinfo';
b08cd201 41 my $charinfo = charinfo($codepoint);
561c79ed 42
956cae9a
KW
43 use Unicode::UCD 'casefold';
44 my $casefold = casefold(0xFB00);
45
5d8e6e41
KW
46 use Unicode::UCD 'casespec';
47 my $casespec = casespec(0xFB00);
48
55d7b906 49 use Unicode::UCD 'charblock';
e882dd67
JH
50 my $charblock = charblock($codepoint);
51
55d7b906 52 use Unicode::UCD 'charscript';
65044554 53 my $charscript = charscript($codepoint);
561c79ed 54
55d7b906 55 use Unicode::UCD 'charblocks';
e145285f
JH
56 my $charblocks = charblocks();
57
55d7b906 58 use Unicode::UCD 'charscripts';
ea508aee 59 my $charscripts = charscripts();
e145285f 60
55d7b906 61 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
62 my $range = charscript($script);
63 print "looks like $script\n" if charinrange($range, $codepoint);
64
ea508aee
JH
65 use Unicode::UCD qw(general_categories bidi_types);
66 my $categories = general_categories();
67 my $types = bidi_types();
68
7ef25837
KW
69 use Unicode::UCD 'prop_aliases';
70 my @space_names = prop_aliases("space");
71
72 use Unicode::UCD 'prop_value_aliases';
73 my @gc_punct_names = prop_value_aliases("Gc", "Punct");
74
681d705c
KW
75 use Unicode::UCD 'prop_invlist';
76 my @puncts = prop_invlist("gc=punctuation");
77
55d7b906 78 use Unicode::UCD 'compexcl';
e145285f
JH
79 my $compexcl = compexcl($codepoint);
80
a2bd7410
JH
81 use Unicode::UCD 'namedseq';
82 my $namedseq = namedseq($named_sequence_name);
83
55d7b906 84 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 85
7319f91d 86 my $convert_to_numeric =
62a8c8c2 87 Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
7319f91d 88
561c79ed
JH
89=head1 DESCRIPTION
90
a452d459
KW
91The Unicode::UCD module offers a series of functions that
92provide a simple interface to the Unicode
8b731da2 93Character Database.
561c79ed 94
a452d459
KW
95=head2 code point argument
96
97Some of the functions are called with a I<code point argument>, which is either
98a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
99followed by hexadecimals designating a Unicode code point. In other words, if
100you want a code point to be interpreted as a hexadecimal number, you must
101prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
f200dd12
KW
102interpreted as a decimal code point. Note that the largest code point in
103Unicode is U+10FFFF.
561c79ed
JH
104=cut
105
10a6ecd2 106my $BLOCKSFH;
10a6ecd2 107my $VERSIONFH;
b08cd201
JH
108my $CASEFOLDFH;
109my $CASESPECFH;
a2bd7410 110my $NAMEDSEQFH;
561c79ed
JH
111
112sub openunicode {
113 my ($rfh, @path) = @_;
114 my $f;
115 unless (defined $$rfh) {
116 for my $d (@INC) {
117 use File::Spec;
55d7b906 118 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 119 last if open($$rfh, $f);
e882dd67 120 undef $f;
561c79ed 121 }
e882dd67
JH
122 croak __PACKAGE__, ": failed to find ",
123 File::Spec->catfile(@path), " in @INC"
124 unless defined $f;
561c79ed
JH
125 }
126 return $f;
127}
128
a452d459 129=head2 B<charinfo()>
561c79ed 130
55d7b906 131 use Unicode::UCD 'charinfo';
561c79ed 132
b08cd201 133 my $charinfo = charinfo(0x41);
561c79ed 134
a452d459
KW
135This returns information about the input L</code point argument>
136as a reference to a hash of fields as defined by the Unicode
137standard. If the L</code point argument> is not assigned in the standard
138(i.e., has the general category C<Cn> meaning C<Unassigned>)
139or is a non-character (meaning it is guaranteed to never be assigned in
140the standard),
a18e976f 141C<undef> is returned.
a452d459
KW
142
143Fields that aren't applicable to the particular code point argument exist in the
144returned hash, and are empty.
145
146The keys in the hash with the meanings of their values are:
147
148=over
149
150=item B<code>
151
152the input L</code point argument> expressed in hexadecimal, with leading zeros
153added if necessary to make it contain at least four hexdigits
154
155=item B<name>
156
157name of I<code>, all IN UPPER CASE.
158Some control-type code points do not have names.
159This field will be empty for C<Surrogate> and C<Private Use> code points,
160and for the others without a name,
161it will contain a description enclosed in angle brackets, like
162C<E<lt>controlE<gt>>.
163
164
165=item B<category>
166
167The short name of the general category of I<code>.
168This will match one of the keys in the hash returned by L</general_categories()>.
169
7ef25837
KW
170The L</prop_value_aliases()> function can be used to get all the synonyms
171of the category name.
172
a452d459
KW
173=item B<combining>
174
175the combining class number for I<code> used in the Canonical Ordering Algorithm.
176For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
177available at
178L<http://www.unicode.org/versions/Unicode5.1.0/>
179
7ef25837
KW
180The L</prop_value_aliases()> function can be used to get all the synonyms
181of the combining class number.
182
a452d459
KW
183=item B<bidi>
184
185bidirectional type of I<code>.
186This will match one of the keys in the hash returned by L</bidi_types()>.
187
7ef25837
KW
188The L</prop_value_aliases()> function can be used to get all the synonyms
189of the bidi type name.
190
a452d459
KW
191=item B<decomposition>
192
193is empty if I<code> has no decomposition; or is one or more codes
a18e976f 194(separated by spaces) that, taken in order, represent a decomposition for
a452d459
KW
195I<code>. Each has at least four hexdigits.
196The codes may be preceded by a word enclosed in angle brackets then a space,
197like C<E<lt>compatE<gt> >, giving the type of decomposition
198
06bba7d5
KW
199This decomposition may be an intermediate one whose components are also
200decomposable. Use L<Unicode::Normalize> to get the final decomposition.
201
a452d459
KW
202=item B<decimal>
203
204if I<code> is a decimal digit this is its integer numeric value
205
206=item B<digit>
207
89e4a205
KW
208if I<code> represents some other digit-like number, this is its integer
209numeric value
a452d459
KW
210
211=item B<numeric>
212
213if I<code> represents a whole or rational number, this is its numeric value.
214Rational values are expressed as a string like C<1/4>.
215
216=item B<mirrored>
217
218C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
219
220=item B<unicode10>
221
222name of I<code> in the Unicode 1.0 standard if one
223existed for this code point and is different from the current name
224
225=item B<comment>
226
89e4a205 227As of Unicode 6.0, this is always empty.
a452d459
KW
228
229=item B<upper>
230
06bba7d5 231is empty if there is no single code point uppercase mapping for I<code>
4f66642e 232(its uppercase mapping is itself);
a452d459
KW
233otherwise it is that mapping expressed as at least four hexdigits.
234(L</casespec()> should be used in addition to B<charinfo()>
235for case mappings when the calling program can cope with multiple code point
236mappings.)
237
238=item B<lower>
239
06bba7d5 240is empty if there is no single code point lowercase mapping for I<code>
4f66642e 241(its lowercase mapping is itself);
a452d459
KW
242otherwise it is that mapping expressed as at least four hexdigits.
243(L</casespec()> should be used in addition to B<charinfo()>
244for case mappings when the calling program can cope with multiple code point
245mappings.)
246
247=item B<title>
248
06bba7d5 249is empty if there is no single code point titlecase mapping for I<code>
4f66642e 250(its titlecase mapping is itself);
a452d459
KW
251otherwise it is that mapping expressed as at least four hexdigits.
252(L</casespec()> should be used in addition to B<charinfo()>
253for case mappings when the calling program can cope with multiple code point
254mappings.)
255
256=item B<block>
257
a18e976f 258the block I<code> belongs to (used in C<\p{Blk=...}>).
a452d459
KW
259See L</Blocks versus Scripts>.
260
261
262=item B<script>
263
a18e976f 264the script I<code> belongs to.
a452d459
KW
265See L</Blocks versus Scripts>.
266
267=back
32c16050
JH
268
269Note that you cannot do (de)composition and casing based solely on the
a452d459
KW
270I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
271you will need also the L</compexcl()>, and L</casespec()> functions.
561c79ed
JH
272
273=cut
274
e10d7780 275# NB: This function is nearly duplicated in charnames.pm
10a6ecd2
JH
276sub _getcode {
277 my $arg = shift;
278
dc0a4417 279 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 280 return $arg;
dc0a4417 281 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
282 return hex($1);
283 }
284
285 return;
286}
287
05dbc6f8
KW
288# Populated by _num. Converts real number back to input rational
289my %real_to_rational;
290
291# To store the contents of files found on disk.
292my @BIDIS;
293my @CATEGORIES;
294my @DECOMPOSITIONS;
295my @NUMERIC_TYPES;
5c3b35c9
KW
296my %SIMPLE_LOWER;
297my %SIMPLE_TITLE;
298my %SIMPLE_UPPER;
299my %UNICODE_1_NAMES;
05dbc6f8
KW
300
301sub _charinfo_case {
302
303 # Returns the value to set into one of the case fields in the charinfo
304 # structure.
305 # $char is the character,
306 # $cased is the case-changed character
307 # $file is the file in lib/unicore/To/$file that contains the data
308 # needed for this, in the form that _search() understands.
5c3b35c9 309 # $hash_ref points to the hash holding the contents of $file. It will
05dbc6f8
KW
310 # be populated if empty.
311 # By using the 'uc', etc. functions, we avoid loading more files into
312 # memory except for those rare cases where the simple casing (which has
313 # been what charinfo() has always returned, is different than the full
314 # casing.
5c3b35c9 315 my ($char, $cased, $file, $hash_ref) = @_;
05dbc6f8
KW
316
317 return "" if $cased eq $char;
318
319 return sprintf("%04X", ord $cased) if length($cased) == 1;
320
5c3b35c9
KW
321 %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
322 return $hash_ref->{ord $char} // "";
a6fa416b
ST
323}
324
05dbc6f8 325sub charinfo {
a6fa416b 326
05dbc6f8
KW
327 # This function has traditionally mimicked what is in UnicodeData.txt,
328 # warts and all. This is a re-write that avoids UnicodeData.txt so that
329 # it can be removed to save disk space. Instead, this assembles
330 # information gotten by other methods that get data from various other
331 # files. It uses charnames to get the character name; and various
332 # mktables tables.
324f9e44 333
05dbc6f8 334 use feature 'unicode_strings';
a6fa416b 335
10a6ecd2
JH
336 my $arg = shift;
337 my $code = _getcode($arg);
05dbc6f8
KW
338 croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
339
340 # Non-unicode implies undef.
341 return if $code > 0x10FFFF;
342
343 my %prop;
344 my $char = chr($code);
345
346 @CATEGORIES =_read_table("unicore/To/Gc.pl") unless @CATEGORIES;
347 $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
348 // $utf8::SwashInfo{'ToGc'}{'missing'};
349
350 return if $prop{'category'} eq 'Cn'; # Unassigned code points are undef
351
352 $prop{'code'} = sprintf "%04X", $code;
353 $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
354 : (charnames::viacode($code) // "");
355
356 $prop{'combining'} = getCombinClass($code);
357
358 @BIDIS =_read_table("unicore/To/Bc.pl") unless @BIDIS;
359 $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
360 // $utf8::SwashInfo{'ToBc'}{'missing'};
361
362 # For most code points, we can just read in "unicore/Decomposition.pl", as
363 # its contents are exactly what should be output. But that file doesn't
364 # contain the data for the Hangul syllable decompositions, which can be
94c91ffc
KW
365 # algorithmically computed, and NFD() does that, so we call NFD() for
366 # those. We can't use NFD() for everything, as it does a complete
05dbc6f8 367 # recursive decomposition, and what this function has always done is to
94c91ffc
KW
368 # return what's in UnicodeData.txt which doesn't show that recursiveness.
369 # Fortunately, the NFD() of the Hanguls doesn't have any recursion
370 # issues.
371 # Having no decomposition implies an empty field; otherwise, all but
372 # "Canonical" imply a compatible decomposition, and the type is prefixed
373 # to that, as it is in UnicodeData.txt
05dbc6f8
KW
374 if ($char =~ /\p{Block=Hangul_Syllables}/) {
375 # The code points of the decomposition are output in standard Unicode
376 # hex format, separated by blanks.
377 $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
94c91ffc 378 unpack "U*", NFD($char);
a6fa416b 379 }
05dbc6f8
KW
380 else {
381 @DECOMPOSITIONS = _read_table("unicore/Decomposition.pl")
382 unless @DECOMPOSITIONS;
383 $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
384 $code) // "";
561c79ed 385 }
05dbc6f8
KW
386
387 # Can use num() to get the numeric values, if any.
388 if (! defined (my $value = num($char))) {
389 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
390 }
391 else {
392 if ($char =~ /\d/) {
393 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
394 }
395 else {
396
397 # For non-decimal-digits, we have to read in the Numeric type
398 # to distinguish them. It is not just a matter of integer vs.
399 # rational, as some whole number values are not considered digits,
400 # e.g., TAMIL NUMBER TEN.
401 $prop{'decimal'} = "";
402
403 @NUMERIC_TYPES =_read_table("unicore/To/Nt.pl")
404 unless @NUMERIC_TYPES;
405 if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
406 eq 'Digit')
407 {
408 $prop{'digit'} = $prop{'numeric'} = $value;
409 }
410 else {
411 $prop{'digit'} = "";
412 $prop{'numeric'} = $real_to_rational{$value} // $value;
413 }
414 }
415 }
416
417 $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
418
5c3b35c9
KW
419 %UNICODE_1_NAMES =_read_table("unicore/To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
420 $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
05dbc6f8
KW
421
422 # This is true starting in 6.0, but, num() also requires 6.0, so
423 # don't need to test for version again here.
424 $prop{'comment'} = "";
425
5c3b35c9
KW
426 $prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \%SIMPLE_UPPER);
427 $prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \%SIMPLE_LOWER);
05dbc6f8 428 $prop{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
5c3b35c9 429 \%SIMPLE_TITLE);
05dbc6f8
KW
430
431 $prop{block} = charblock($code);
432 $prop{script} = charscript($code);
433 return \%prop;
561c79ed
JH
434}
435
e882dd67
JH
436sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
437 my ($table, $lo, $hi, $code) = @_;
438
439 return if $lo > $hi;
440
441 my $mid = int(($lo+$hi) / 2);
442
443 if ($table->[$mid]->[0] < $code) {
10a6ecd2 444 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
445 return $table->[$mid]->[2];
446 } else {
447 _search($table, $mid + 1, $hi, $code);
448 }
449 } elsif ($table->[$mid]->[0] > $code) {
450 _search($table, $lo, $mid - 1, $code);
451 } else {
452 return $table->[$mid]->[2];
453 }
454}
455
cb366075 456sub _read_table ($;$) {
3a12600d
KW
457
458 # Returns the contents of the mktables generated table file located at $1
cb366075
KW
459 # in the form of either an array of arrays or a hash, depending on if the
460 # optional second parameter is true (for hash return) or not. In the case
461 # of a hash return, each key is a code point, and its corresponding value
462 # is what the table gives as the code point's corresponding value. In the
463 # case of an array return, each outer array denotes a range with [0] the
464 # start point of that range; [1] the end point; and [2] the value that
465 # every code point in the range has. The hash return is useful for fast
466 # lookup when the table contains only single code point ranges. The array
467 # return takes much less memory when there are large ranges.
3a12600d 468 #
cb366075 469 # This function has the side effect of setting
3a12600d
KW
470 # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
471 # table; and
472 # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
473 # not listed in the table.
474 # where $property is the Unicode property name, preceded by 'To' for map
475 # properties., e.g., 'ToSc'.
476 #
477 # Table entries look like one of:
478 # 0000 0040 Common # [65]
479 # 00AA Latin
480
481 my $table = shift;
cb366075
KW
482 my $return_hash = shift;
483 $return_hash = 0 unless defined $return_hash;
3a12600d 484 my @return;
cb366075 485 my %return;
3a12600d
KW
486 local $_;
487
488 for (split /^/m, do $table) {
489 my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
490 \s* ( \# .* )? # Optional comment
491 $ /x;
83fd1222
KW
492 my $decimal_start = hex $start;
493 my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
cb366075 494 if ($return_hash) {
83fd1222 495 foreach my $i ($decimal_start .. $decimal_end) {
cb366075
KW
496 $return{$i} = $value;
497 }
498 }
9a96c106
KW
499 elsif (@return &&
500 $return[-1][1] == $decimal_start - 1
501 && $return[-1][2] eq $value)
502 {
503 # If this is merely extending the previous range, do just that.
504 $return[-1]->[1] = $decimal_end;
505 }
cb366075 506 else {
83fd1222 507 push @return, [ $decimal_start, $decimal_end, $value ];
cb366075 508 }
3a12600d 509 }
cb366075 510 return ($return_hash) ? %return : @return;
3a12600d
KW
511}
512
10a6ecd2
JH
513sub charinrange {
514 my ($range, $arg) = @_;
515 my $code = _getcode($arg);
516 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
517 unless defined $code;
518 _search($range, 0, $#$range, $code);
519}
520
a452d459 521=head2 B<charblock()>
561c79ed 522
55d7b906 523 use Unicode::UCD 'charblock';
561c79ed
JH
524
525 my $charblock = charblock(0x41);
10a6ecd2 526 my $charblock = charblock(1234);
a452d459 527 my $charblock = charblock(0x263a);
10a6ecd2
JH
528 my $charblock = charblock("U+263a");
529
78bf21c2 530 my $range = charblock('Armenian');
10a6ecd2 531
a452d459 532With a L</code point argument> charblock() returns the I<block> the code point
430fe03d
KW
533belongs to, e.g. C<Basic Latin>. The old-style block name is returned (see
534L</Old-style versus new-style block names>).
a452d459 535If the code point is unassigned, this returns the block it would belong to if
a18e976f 536it were assigned.
10a6ecd2 537
78bf21c2
JH
538See also L</Blocks versus Scripts>.
539
18972f4b 540If supplied with an argument that can't be a code point, charblock() tries to
430fe03d
KW
541do the opposite and interpret the argument as an old-style block name. The
542return value
a18e976f
KW
543is a I<range set> with one range: an anonymous list with a single element that
544consists of another anonymous list whose first element is the first code point
545in the block, and whose second (and final) element is the final code point in
546the block. (The extra list consisting of just one element is so that the same
547program logic can be used to handle both this return, and the return from
548L</charscript()> which can have multiple ranges.) You can test whether a code
549point is in a range using the L</charinrange()> function. If the argument is
550not a known block, C<undef> is returned.
561c79ed 551
561c79ed
JH
552=cut
553
554my @BLOCKS;
10a6ecd2 555my %BLOCKS;
561c79ed 556
10a6ecd2 557sub _charblocks {
06bba7d5
KW
558
559 # Can't read from the mktables table because it loses the hyphens in the
560 # original.
561c79ed 561 unless (@BLOCKS) {
10a6ecd2 562 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 563 local $_;
10a6ecd2 564 while (<$BLOCKSFH>) {
2796c109 565 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
566 my ($lo, $hi) = (hex($1), hex($2));
567 my $subrange = [ $lo, $hi, $3 ];
568 push @BLOCKS, $subrange;
569 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
570 }
571 }
10a6ecd2 572 close($BLOCKSFH);
561c79ed
JH
573 }
574 }
10a6ecd2
JH
575}
576
577sub charblock {
578 my $arg = shift;
579
580 _charblocks() unless @BLOCKS;
581
582 my $code = _getcode($arg);
561c79ed 583
10a6ecd2 584 if (defined $code) {
c707cf8e
KW
585 my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
586 return $result if defined $result;
587 return 'No_Block';
588 }
589 elsif (exists $BLOCKS{$arg}) {
590 return dclone $BLOCKS{$arg};
10a6ecd2 591 }
e882dd67
JH
592}
593
a452d459 594=head2 B<charscript()>
e882dd67 595
55d7b906 596 use Unicode::UCD 'charscript';
e882dd67
JH
597
598 my $charscript = charscript(0x41);
10a6ecd2
JH
599 my $charscript = charscript(1234);
600 my $charscript = charscript("U+263a");
e882dd67 601
78bf21c2 602 my $range = charscript('Thai');
10a6ecd2 603
a452d459
KW
604With a L</code point argument> charscript() returns the I<script> the
605code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
bb2d29dc 606If the code point is unassigned, it returns C<"Unknown">.
78bf21c2 607
eb0cc9e3 608If supplied with an argument that can't be a code point, charscript() tries
a18e976f
KW
609to do the opposite and interpret the argument as a script name. The
610return value is a I<range set>: an anonymous list of lists that contain
eb0cc9e3 611I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a18e976f
KW
612code point is in a range set using the L</charinrange()> function. If the
613argument is not a known script, C<undef> is returned.
a452d459
KW
614
615See also L</Blocks versus Scripts>.
e882dd67 616
e882dd67
JH
617=cut
618
619my @SCRIPTS;
10a6ecd2 620my %SCRIPTS;
e882dd67 621
10a6ecd2 622sub _charscripts {
7bccef0b
KW
623 @SCRIPTS =_read_table("unicore/To/Sc.pl") unless @SCRIPTS;
624 foreach my $entry (@SCRIPTS) {
f3d50ac9 625 $entry->[2] =~ s/(_\w)/\L$1/g; # Preserve old-style casing
7bccef0b 626 push @{$SCRIPTS{$entry->[2]}}, $entry;
e882dd67 627 }
10a6ecd2
JH
628}
629
630sub charscript {
631 my $arg = shift;
632
633 _charscripts() unless @SCRIPTS;
e882dd67 634
10a6ecd2
JH
635 my $code = _getcode($arg);
636
637 if (defined $code) {
7bccef0b
KW
638 my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
639 return $result if defined $result;
8079ad82 640 return $utf8::SwashInfo{'ToSc'}{'missing'};
7bccef0b
KW
641 } elsif (exists $SCRIPTS{$arg}) {
642 return dclone $SCRIPTS{$arg};
10a6ecd2 643 }
7bccef0b
KW
644
645 return;
10a6ecd2
JH
646}
647
a452d459 648=head2 B<charblocks()>
10a6ecd2 649
55d7b906 650 use Unicode::UCD 'charblocks';
10a6ecd2 651
b08cd201 652 my $charblocks = charblocks();
10a6ecd2 653
b08cd201 654charblocks() returns a reference to a hash with the known block names
a452d459 655as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 656
430fe03d
KW
657The names are in the old-style (see L</Old-style versus new-style block
658names>).
659
78bf21c2
JH
660See also L</Blocks versus Scripts>.
661
10a6ecd2
JH
662=cut
663
664sub charblocks {
b08cd201 665 _charblocks() unless %BLOCKS;
741297c1 666 return dclone \%BLOCKS;
10a6ecd2
JH
667}
668
a452d459 669=head2 B<charscripts()>
10a6ecd2 670
55d7b906 671 use Unicode::UCD 'charscripts';
10a6ecd2 672
ea508aee 673 my $charscripts = charscripts();
10a6ecd2 674
ea508aee 675charscripts() returns a reference to a hash with the known script
a452d459 676names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 677the values.
10a6ecd2 678
78bf21c2
JH
679See also L</Blocks versus Scripts>.
680
10a6ecd2
JH
681=cut
682
683sub charscripts {
b08cd201 684 _charscripts() unless %SCRIPTS;
741297c1 685 return dclone \%SCRIPTS;
561c79ed
JH
686}
687
a452d459 688=head2 B<charinrange()>
10a6ecd2 689
f200dd12 690In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
10a6ecd2 691can also test whether a code point is in the I<range> as returned by
a452d459
KW
692L</charblock()> and L</charscript()> or as the values of the hash returned
693by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 694
55d7b906 695 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
696
697 $range = charscript('Hiragana');
e145285f 698 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
699
700=cut
701
ea508aee
JH
702my %GENERAL_CATEGORIES =
703 (
704 'L' => 'Letter',
705 'LC' => 'CasedLetter',
706 'Lu' => 'UppercaseLetter',
707 'Ll' => 'LowercaseLetter',
708 'Lt' => 'TitlecaseLetter',
709 'Lm' => 'ModifierLetter',
710 'Lo' => 'OtherLetter',
711 'M' => 'Mark',
712 'Mn' => 'NonspacingMark',
713 'Mc' => 'SpacingMark',
714 'Me' => 'EnclosingMark',
715 'N' => 'Number',
716 'Nd' => 'DecimalNumber',
717 'Nl' => 'LetterNumber',
718 'No' => 'OtherNumber',
719 'P' => 'Punctuation',
720 'Pc' => 'ConnectorPunctuation',
721 'Pd' => 'DashPunctuation',
722 'Ps' => 'OpenPunctuation',
723 'Pe' => 'ClosePunctuation',
724 'Pi' => 'InitialPunctuation',
725 'Pf' => 'FinalPunctuation',
726 'Po' => 'OtherPunctuation',
727 'S' => 'Symbol',
728 'Sm' => 'MathSymbol',
729 'Sc' => 'CurrencySymbol',
730 'Sk' => 'ModifierSymbol',
731 'So' => 'OtherSymbol',
732 'Z' => 'Separator',
733 'Zs' => 'SpaceSeparator',
734 'Zl' => 'LineSeparator',
735 'Zp' => 'ParagraphSeparator',
736 'C' => 'Other',
737 'Cc' => 'Control',
738 'Cf' => 'Format',
739 'Cs' => 'Surrogate',
740 'Co' => 'PrivateUse',
741 'Cn' => 'Unassigned',
742 );
743
744sub general_categories {
745 return dclone \%GENERAL_CATEGORIES;
746}
747
a452d459 748=head2 B<general_categories()>
ea508aee
JH
749
750 use Unicode::UCD 'general_categories';
751
752 my $categories = general_categories();
753
a452d459 754This returns a reference to a hash which has short
ea508aee
JH
755general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
756names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
757C<Symbol>) as values. The hash is reversible in case you need to go
758from the long names to the short names. The general category is the
a452d459
KW
759one returned from
760L</charinfo()> under the C<category> key.
ea508aee 761
7ef25837
KW
762The L</prop_value_aliases()> function can be used to get all the synonyms of
763the category name.
764
ea508aee
JH
765=cut
766
767my %BIDI_TYPES =
768 (
769 'L' => 'Left-to-Right',
770 'LRE' => 'Left-to-Right Embedding',
771 'LRO' => 'Left-to-Right Override',
772 'R' => 'Right-to-Left',
773 'AL' => 'Right-to-Left Arabic',
774 'RLE' => 'Right-to-Left Embedding',
775 'RLO' => 'Right-to-Left Override',
776 'PDF' => 'Pop Directional Format',
777 'EN' => 'European Number',
778 'ES' => 'European Number Separator',
779 'ET' => 'European Number Terminator',
780 'AN' => 'Arabic Number',
781 'CS' => 'Common Number Separator',
782 'NSM' => 'Non-Spacing Mark',
783 'BN' => 'Boundary Neutral',
784 'B' => 'Paragraph Separator',
785 'S' => 'Segment Separator',
786 'WS' => 'Whitespace',
787 'ON' => 'Other Neutrals',
788 );
789
a452d459 790=head2 B<bidi_types()>
ea508aee
JH
791
792 use Unicode::UCD 'bidi_types';
793
794 my $categories = bidi_types();
795
a452d459 796This returns a reference to a hash which has the short
ea508aee
JH
797bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
798names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
799hash is reversible in case you need to go from the long names to the
a452d459
KW
800short names. The bidi type is the one returned from
801L</charinfo()>
ea508aee
JH
802under the C<bidi> key. For the exact meaning of the various bidi classes
803the Unicode TR9 is recommended reading:
a452d459 804L<http://www.unicode.org/reports/tr9/>
ea508aee
JH
805(as of Unicode 5.0.0)
806
7ef25837
KW
807The L</prop_value_aliases()> function can be used to get all the synonyms of
808the bidi type name.
809
ea508aee
JH
810=cut
811
a452d459
KW
812sub bidi_types {
813 return dclone \%BIDI_TYPES;
814}
815
816=head2 B<compexcl()>
b08cd201 817
55d7b906 818 use Unicode::UCD 'compexcl';
b08cd201 819
a452d459 820 my $compexcl = compexcl(0x09dc);
b08cd201 821
71a442a8
KW
822This routine is included for backwards compatibility, but as of Perl 5.12, for
823most purposes it is probably more convenient to use one of the following
824instead:
825
826 my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
827 my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
828
829or even
830
831 my $compexcl = chr(0x09dc) =~ /\p{CE};
832 my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
833
834The first two forms return B<true> if the L</code point argument> should not
76b05678
KW
835be produced by composition normalization. For the final two forms to return
836B<true>, it is additionally required that this fact not otherwise be
837determinable from the Unicode data base.
71a442a8
KW
838
839This routine behaves identically to the final two forms. That is,
840it does not return B<true> if the code point has a decomposition
a452d459
KW
841consisting of another single code point, nor if its decomposition starts
842with a code point whose combining class is non-zero. Code points that meet
843either of these conditions should also not be produced by composition
71a442a8
KW
844normalization, which is probably why you should use the
845C<Full_Composition_Exclusion> property instead, as shown above.
b08cd201 846
71a442a8 847The routine returns B<false> otherwise.
b08cd201
JH
848
849=cut
850
b08cd201
JH
851sub compexcl {
852 my $arg = shift;
853 my $code = _getcode($arg);
74f8133e
JH
854 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
855 unless defined $code;
b08cd201 856
36c2430c 857 no warnings "non_unicode"; # So works on non-Unicode code points
71a442a8 858 return chr($code) =~ /\p{Composition_Exclusion}/;
b08cd201
JH
859}
860
a452d459 861=head2 B<casefold()>
b08cd201 862
55d7b906 863 use Unicode::UCD 'casefold';
b08cd201 864
a452d459
KW
865 my $casefold = casefold(0xDF);
866 if (defined $casefold) {
867 my @full_fold_hex = split / /, $casefold->{'full'};
868 my $full_fold_string =
869 join "", map {chr(hex($_))} @full_fold_hex;
870 my @turkic_fold_hex =
871 split / /, ($casefold->{'turkic'} ne "")
872 ? $casefold->{'turkic'}
873 : $casefold->{'full'};
874 my $turkic_fold_string =
875 join "", map {chr(hex($_))} @turkic_fold_hex;
876 }
877 if (defined $casefold && $casefold->{'simple'} ne "") {
878 my $simple_fold_hex = $casefold->{'simple'};
879 my $simple_fold_string = chr(hex($simple_fold_hex));
880 }
b08cd201 881
a452d459
KW
882This returns the (almost) locale-independent case folding of the
883character specified by the L</code point argument>.
b08cd201 884
a18e976f 885If there is no case folding for that code point, C<undef> is returned.
a452d459
KW
886
887If there is a case folding for that code point, a reference to a hash
b08cd201
JH
888with the following fields is returned:
889
a452d459
KW
890=over
891
892=item B<code>
893
894the input L</code point argument> expressed in hexadecimal, with leading zeros
895added if necessary to make it contain at least four hexdigits
896
897=item B<full>
898
a18e976f 899one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
900code points for the case folding for I<code>.
901Each has at least four hexdigits.
902
903=item B<simple>
904
905is empty, or is exactly one code with at least four hexdigits which can be used
906as an alternative case folding when the calling program cannot cope with the
907fold being a sequence of multiple code points. If I<full> is just one code
908point, then I<simple> equals I<full>. If there is no single code point folding
909defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
910inferior, but still better-than-nothing alternative folding to I<full>.
911
912=item B<mapping>
913
914is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
915otherwise. It can be considered to be the simplest possible folding for
916I<code>. It is defined primarily for backwards compatibility.
917
918=item B<status>
b08cd201 919
a452d459
KW
920is C<C> (for C<common>) if the best possible fold is a single code point
921(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
922folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
a18e976f
KW
923there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
924Note that this
a452d459
KW
925describes the contents of I<mapping>. It is defined primarily for backwards
926compatibility.
b08cd201 927
a452d459
KW
928On versions 3.1 and earlier of Unicode, I<status> can also be
929C<I> which is the same as C<C> but is a special case for dotted uppercase I and
930dotless lowercase i:
b08cd201 931
a452d459 932=over
b08cd201 933
a18e976f 934=item B<*> If you use this C<I> mapping
a452d459 935
a18e976f 936the result is case-insensitive,
a452d459
KW
937but dotless and dotted I's are not distinguished
938
a18e976f 939=item B<*> If you exclude this C<I> mapping
a452d459 940
a18e976f 941the result is not fully case-insensitive, but
a452d459
KW
942dotless and dotted I's are distinguished
943
944=back
945
946=item B<turkic>
947
948contains any special folding for Turkic languages. For versions of Unicode
949starting with 3.2, this field is empty unless I<code> has a different folding
950in Turkic languages, in which case it is one or more codes (separated by
a18e976f 951spaces) that, taken in order, give the code points for the case folding for
a452d459
KW
952I<code> in those languages.
953Each code has at least four hexdigits.
954Note that this folding does not maintain canonical equivalence without
955additional processing.
956
957For versions of Unicode 3.1 and earlier, this field is empty unless there is a
958special folding for Turkic languages, in which case I<status> is C<I>, and
959I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
960
961=back
962
963Programs that want complete generality and the best folding results should use
964the folding contained in the I<full> field. But note that the fold for some
965code points will be a sequence of multiple code points.
966
967Programs that can't cope with the fold mapping being multiple code points can
968use the folding contained in the I<simple> field, with the loss of some
969generality. In Unicode 5.1, about 7% of the defined foldings have no single
970code point folding.
971
972The I<mapping> and I<status> fields are provided for backwards compatibility for
973existing programs. They contain the same values as in previous versions of
974this function.
975
976Locale is not completely independent. The I<turkic> field contains results to
977use when the locale is a Turkic language.
b08cd201
JH
978
979For more information about case mappings see
a452d459 980L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
981
982=cut
983
984my %CASEFOLD;
985
986sub _casefold {
987 unless (%CASEFOLD) {
551b6b6f 988 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 989 local $_;
b08cd201 990 while (<$CASEFOLDFH>) {
a452d459 991 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
b08cd201 992 my $code = hex($1);
a452d459
KW
993 $CASEFOLD{$code}{'code'} = $1;
994 $CASEFOLD{$code}{'turkic'} = "" unless
995 defined $CASEFOLD{$code}{'turkic'};
996 if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and
997 # earlier Unicodes
998 # Both entries there (I
999 # only checked 3.1) are
1000 # the same as C, and
1001 # there are no other
1002 # entries for those
1003 # codepoints, so treat
1004 # as if C, but override
1005 # the turkic one for
1006 # 'I'.
1007 $CASEFOLD{$code}{'status'} = $2;
1008 $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
1009 $CASEFOLD{$code}{'mapping'} = $3;
1010 $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
1011 } elsif ($2 eq 'F') {
1012 $CASEFOLD{$code}{'full'} = $3;
1013 unless (defined $CASEFOLD{$code}{'simple'}) {
1014 $CASEFOLD{$code}{'simple'} = "";
1015 $CASEFOLD{$code}{'mapping'} = $3;
1016 $CASEFOLD{$code}{'status'} = $2;
1017 }
1018 } elsif ($2 eq 'S') {
1019
1020
1021 # There can't be a simple without a full, and simple
1022 # overrides all but full
1023
1024 $CASEFOLD{$code}{'simple'} = $3;
1025 $CASEFOLD{$code}{'mapping'} = $3;
1026 $CASEFOLD{$code}{'status'} = $2;
1027 } elsif ($2 eq 'T') {
1028 $CASEFOLD{$code}{'turkic'} = $3;
1029 } # else can't happen because only [CIFST] are possible
b08cd201
JH
1030 }
1031 }
1032 close($CASEFOLDFH);
1033 }
1034 }
1035}
1036
1037sub casefold {
1038 my $arg = shift;
1039 my $code = _getcode($arg);
74f8133e
JH
1040 croak __PACKAGE__, "::casefold: unknown code '$arg'"
1041 unless defined $code;
b08cd201
JH
1042
1043 _casefold() unless %CASEFOLD;
1044
1045 return $CASEFOLD{$code};
1046}
1047
a452d459 1048=head2 B<casespec()>
b08cd201 1049
55d7b906 1050 use Unicode::UCD 'casespec';
b08cd201 1051
a452d459 1052 my $casespec = casespec(0xFB00);
b08cd201 1053
a452d459
KW
1054This returns the potentially locale-dependent case mappings of the L</code point
1055argument>. The mappings may be longer than a single code point (which the basic
1056Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 1057
a452d459
KW
1058If there are no case mappings for the L</code point argument>, or if all three
1059possible mappings (I<lower>, I<title> and I<upper>) result in single code
a18e976f 1060points and are locale independent and unconditional, C<undef> is returned
5d8e6e41
KW
1061(which means that the case mappings, if any, for the code point are those
1062returned by L</charinfo()>).
a452d459
KW
1063
1064Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41
KW
1065of such hashes, explained below) is returned with the following keys and their
1066meanings:
a452d459
KW
1067
1068The keys in the bottom layer hash with the meanings of their values are:
1069
1070=over
1071
1072=item B<code>
1073
1074the input L</code point argument> expressed in hexadecimal, with leading zeros
1075added if necessary to make it contain at least four hexdigits
1076
1077=item B<lower>
1078
a18e976f 1079one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1080code points for the lower case of I<code>.
1081Each has at least four hexdigits.
1082
1083=item B<title>
b08cd201 1084
a18e976f 1085one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1086code points for the title case of I<code>.
1087Each has at least four hexdigits.
b08cd201 1088
d2da20e3 1089=item B<upper>
b08cd201 1090
a18e976f 1091one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1092code points for the upper case of I<code>.
1093Each has at least four hexdigits.
1094
1095=item B<condition>
1096
1097the conditions for the mappings to be valid.
a18e976f 1098If C<undef>, the mappings are always valid.
a452d459
KW
1099When defined, this field is a list of conditions,
1100all of which must be true for the mappings to be valid.
1101The list consists of one or more
1102I<locales> (see below)
1103and/or I<contexts> (explained in the next paragraph),
1104separated by spaces.
1105(Other than as used to separate elements, spaces are to be ignored.)
1106Case distinctions in the condition list are not significant.
82c0b05b 1107Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 1108
a452d459
KW
1109A I<context> is one of those defined in the Unicode standard.
1110For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1111available at
5d8e6e41
KW
1112L<http://www.unicode.org/versions/Unicode5.1.0/>.
1113These are for context-sensitive casing.
f499c386 1114
a452d459
KW
1115=back
1116
5d8e6e41 1117The hash described above is returned for locale-independent casing, where
a18e976f 1118at least one of the mappings has length longer than one. If C<undef> is
5d8e6e41
KW
1119returned, the code point may have mappings, but if so, all are length one,
1120and are returned by L</charinfo()>.
1121Note that when this function does return a value, it will be for the complete
1122set of mappings for a code point, even those whose length is one.
1123
1124If there are additional casing rules that apply only in certain locales,
1125an additional key for each will be defined in the returned hash. Each such key
1126will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1127followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1128and a variant code). You can find the lists of all possible locales, see
1129L<Locale::Country> and L<Locale::Language>.
89e4a205 1130(In Unicode 6.0, the only locales returned by this function
a452d459 1131are C<lt>, C<tr>, and C<az>.)
b08cd201 1132
5d8e6e41
KW
1133Each locale key is a reference to a hash that has the form above, and gives
1134the casing rules for that particular locale, which take precedence over the
1135locale-independent ones when in that locale.
1136
1137If the only casing for a code point is locale-dependent, then the returned
1138hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1139will contain only locale keys.
1140
b08cd201 1141For more information about case mappings see
a452d459 1142L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1143
1144=cut
1145
1146my %CASESPEC;
1147
1148sub _casespec {
1149 unless (%CASESPEC) {
551b6b6f 1150 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 1151 local $_;
b08cd201
JH
1152 while (<$CASESPECFH>) {
1153 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
1154 my ($hexcode, $lower, $title, $upper, $condition) =
1155 ($1, $2, $3, $4, $5);
1156 my $code = hex($hexcode);
1157 if (exists $CASESPEC{$code}) {
1158 if (exists $CASESPEC{$code}->{code}) {
1159 my ($oldlower,
1160 $oldtitle,
1161 $oldupper,
1162 $oldcondition) =
1163 @{$CASESPEC{$code}}{qw(lower
1164 title
1165 upper
1166 condition)};
822ebcc8
JH
1167 if (defined $oldcondition) {
1168 my ($oldlocale) =
f499c386 1169 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
1170 delete $CASESPEC{$code};
1171 $CASESPEC{$code}->{$oldlocale} =
1172 { code => $hexcode,
1173 lower => $oldlower,
1174 title => $oldtitle,
1175 upper => $oldupper,
1176 condition => $oldcondition };
f499c386
JH
1177 }
1178 }
1179 my ($locale) =
1180 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1181 $CASESPEC{$code}->{$locale} =
1182 { code => $hexcode,
1183 lower => $lower,
1184 title => $title,
1185 upper => $upper,
1186 condition => $condition };
1187 } else {
1188 $CASESPEC{$code} =
1189 { code => $hexcode,
1190 lower => $lower,
1191 title => $title,
1192 upper => $upper,
1193 condition => $condition };
1194 }
b08cd201
JH
1195 }
1196 }
1197 close($CASESPECFH);
1198 }
1199 }
1200}
1201
1202sub casespec {
1203 my $arg = shift;
1204 my $code = _getcode($arg);
74f8133e
JH
1205 croak __PACKAGE__, "::casespec: unknown code '$arg'"
1206 unless defined $code;
b08cd201
JH
1207
1208 _casespec() unless %CASESPEC;
1209
741297c1 1210 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
1211}
1212
a452d459 1213=head2 B<namedseq()>
a2bd7410
JH
1214
1215 use Unicode::UCD 'namedseq';
1216
1217 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1218 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1219 my %namedseq = namedseq();
1220
1221If used with a single argument in a scalar context, returns the string
a18e976f 1222consisting of the code points of the named sequence, or C<undef> if no
a2bd7410 1223named sequence by that name exists. If used with a single argument in
956cae9a
KW
1224a list context, it returns the list of the ordinals of the code points. If used
1225with no
a2bd7410
JH
1226arguments in a list context, returns a hash with the names of the
1227named sequences as the keys and the named sequences as strings as
a18e976f 1228the values. Otherwise, it returns C<undef> or an empty list depending
a2bd7410
JH
1229on the context.
1230
a452d459
KW
1231This function only operates on officially approved (not provisional) named
1232sequences.
a2bd7410 1233
27f853a0
KW
1234Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1235sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1236LETTER AINU P")> will return the same string this function does, but will also
1237operate on character names that aren't named sequences, without you having to
1238know which are which. See L<charnames>.
1239
a2bd7410
JH
1240=cut
1241
1242my %NAMEDSEQ;
1243
1244sub _namedseq {
1245 unless (%NAMEDSEQ) {
98ef7649 1246 if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
a2bd7410
JH
1247 local $_;
1248 while (<$NAMEDSEQFH>) {
98ef7649
KW
1249 if (/^ [0-9A-F]+ \ /x) {
1250 chomp;
1251 my ($sequence, $name) = split /\t/;
1252 my @s = map { chr(hex($_)) } split(' ', $sequence);
1253 $NAMEDSEQ{$name} = join("", @s);
a2bd7410
JH
1254 }
1255 }
1256 close($NAMEDSEQFH);
1257 }
1258 }
1259}
1260
1261sub namedseq {
98ef7649
KW
1262
1263 # Use charnames::string_vianame() which now returns this information,
1264 # unless the caller wants the hash returned, in which case we read it in,
1265 # and thereafter use it instead of calling charnames, as it is faster.
1266
a2bd7410
JH
1267 my $wantarray = wantarray();
1268 if (defined $wantarray) {
1269 if ($wantarray) {
1270 if (@_ == 0) {
98ef7649 1271 _namedseq() unless %NAMEDSEQ;
a2bd7410
JH
1272 return %NAMEDSEQ;
1273 } elsif (@_ == 1) {
98ef7649
KW
1274 my $s;
1275 if (%NAMEDSEQ) {
1276 $s = $NAMEDSEQ{ $_[0] };
1277 }
1278 else {
1279 $s = charnames::string_vianame($_[0]);
1280 }
a2bd7410
JH
1281 return defined $s ? map { ord($_) } split('', $s) : ();
1282 }
1283 } elsif (@_ == 1) {
98ef7649
KW
1284 return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
1285 return charnames::string_vianame($_[0]);
a2bd7410
JH
1286 }
1287 }
1288 return;
1289}
1290
7319f91d
KW
1291my %NUMERIC;
1292
1293sub _numeric {
1294
1295 # Unicode 6.0 instituted the rule that only digits in a consecutive
1296 # block of 10 would be considered decimal digits. Before that, the only
1297 # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
1298 # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
1299 # ONE. The code could be modified to handle that, but not bothering, as
1300 # in TUS 6.0, U+19DA was changed to Nt=Di.
1301 if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
1302 croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
1303 }
98025745
KW
1304 my @numbers = _read_table("unicore/To/Nv.pl");
1305 foreach my $entry (@numbers) {
1306 my ($start, $end, $value) = @$entry;
1307
05dbc6f8
KW
1308 # If value contains a slash, convert to decimal, add a reverse hash
1309 # used by charinfo.
98025745
KW
1310 if ((my @rational = split /\//, $value) == 2) {
1311 my $real = $rational[0] / $rational[1];
05dbc6f8 1312 $real_to_rational{$real} = $value;
98025745
KW
1313 $value = $real;
1314 }
1315
1316 for my $i ($start .. $end) {
1317 $NUMERIC{$i} = $value;
7319f91d 1318 }
7319f91d 1319 }
2dc5eb26
KW
1320
1321 # Decided unsafe to use these that aren't officially part of the Unicode
1322 # standard.
1323 #use Math::Trig;
1324 #my $pi = acos(-1.0);
98025745 1325 #$NUMERIC{0x03C0} = $pi;
7319f91d
KW
1326
1327 # Euler's constant, not to be confused with Euler's number
98025745 1328 #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
7319f91d
KW
1329
1330 # Euler's number
98025745 1331 #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
2dc5eb26 1332
7319f91d
KW
1333 return;
1334}
1335
1336=pod
1337
67592e11 1338=head2 B<num()>
7319f91d 1339
eefd7bc2
KW
1340 use Unicode::UCD 'num';
1341
1342 my $val = num("123");
1343 my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
1344
7319f91d
KW
1345C<num> returns the numeric value of the input Unicode string; or C<undef> if it
1346doesn't think the entire string has a completely valid, safe numeric value.
1347
1348If the string is just one character in length, the Unicode numeric value
1349is returned if it has one, or C<undef> otherwise. Note that this need
1350not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
2dc5eb26
KW
1351example returns -0.5.
1352
1353=cut
7319f91d 1354
2dc5eb26
KW
1355#A few characters to which Unicode doesn't officially
1356#assign a numeric value are considered numeric by C<num>.
1357#These are:
1358
1359# EULER CONSTANT 0.5772... (this is NOT Euler's number)
1360# SCRIPT SMALL E 2.71828... (this IS Euler's number)
1361# GREEK SMALL LETTER PI 3.14159...
1362
1363=pod
7319f91d
KW
1364
1365If the string is more than one character, C<undef> is returned unless
8bb4c8e2 1366all its characters are decimal digits (that is, they would match C<\d+>),
7319f91d
KW
1367from the same script. For example if you have an ASCII '0' and a Bengali
1368'3', mixed together, they aren't considered a valid number, and C<undef>
1369is returned. A further restriction is that the digits all have to be of
1370the same form. A half-width digit mixed with a full-width one will
1371return C<undef>. The Arabic script has two sets of digits; C<num> will
1372return C<undef> unless all the digits in the string come from the same
1373set.
1374
1375C<num> errs on the side of safety, and there may be valid strings of
1376decimal digits that it doesn't recognize. Note that Unicode defines
1377a number of "digit" characters that aren't "decimal digit" characters.
a278d14b 1378"Decimal digits" have the property that they have a positional value, i.e.,
7319f91d
KW
1379there is a units position, a 10's position, a 100's, etc, AND they are
1380arranged in Unicode in blocks of 10 contiguous code points. The Chinese
1381digits, for example, are not in such a contiguous block, and so Unicode
1382doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1383match them. A single-character string containing one of these digits will
1384have its decimal value returned by C<num>, but any longer string containing
1385only these digits will return C<undef>.
1386
a278d14b
KW
1387Strings of multiple sub- and superscripts are not recognized as numbers. You
1388can use either of the compatibility decompositions in Unicode::Normalize to
7319f91d
KW
1389change these into digits, and then call C<num> on the result.
1390
1391=cut
1392
1393# To handle sub, superscripts, this could if called in list context,
1394# consider those, and return the <decomposition> type in the second
1395# array element.
1396
1397sub num {
1398 my $string = $_[0];
1399
1400 _numeric unless %NUMERIC;
1401
1402 my $length = length($string);
98025745 1403 return $NUMERIC{ord($string)} if $length == 1;
7319f91d
KW
1404 return if $string =~ /\D/;
1405 my $first_ord = ord(substr($string, 0, 1));
98025745 1406 my $value = $NUMERIC{$first_ord};
7319f91d
KW
1407 my $zero_ord = $first_ord - $value;
1408
1409 for my $i (1 .. $length -1) {
1410 my $ord = ord(substr($string, $i, 1));
1411 my $digit = $ord - $zero_ord;
1412 return unless $digit >= 0 && $digit <= 9;
1413 $value = $value * 10 + $digit;
1414 }
1415 return $value;
1416}
1417
7ef25837
KW
1418=pod
1419
1420=head2 B<prop_aliases()>
1421
1422 use Unicode::UCD 'prop_aliases';
1423
1424 my ($short_name, $full_name, @other_names) = prop_aliases("space");
1425 my $same_full_name = prop_aliases("Space"); # Scalar context
1426 my ($same_short_name) = prop_aliases("Space"); # gets 0th element
1427 print "The full name is $full_name\n";
1428 print "The short name is $short_name\n";
1429 print "The other aliases are: ", join(", ", @other_names), "\n";
1430
1431 prints:
1432 The full name is White_Space
1433 The short name is WSpace
1434 The other aliases are: Space
1435
1436Most Unicode properties have several synonymous names. Typically, there is at
1437least a short name, convenient to type, and a long name that more fully
1438describes the property, and hence is more easily understood.
1439
1440If you know one name for a Unicode property, you can use C<prop_aliases> to find
1441either the long name (when called in scalar context), or a list of all of the
1442names, somewhat ordered so that the short name is in the 0th element, the long
1443name in the next element, and any other synonyms are in the remaining
1444elements, in no particular order.
1445
1446The long name is returned in a form nicely capitalized, suitable for printing.
1447
1448The input parameter name is loosely matched, which means that white space,
1449hyphens, and underscores are ignored (except for the trailing underscore in
1450the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and
1451both of which mean C<General_Category=Cased Letter>).
1452
1453If the name is unknown, C<undef> is returned (or an empty list in list
1454context). Note that Perl typically recognizes property names in regular
1455expressions with an optional C<"Is_>" (with or without the underscore)
1456prefixed to them, such as C<\p{isgc=punct}>. This function does not recognize
1457those in the input, returning C<undef>. Nor are they included in the output
1458as possible synonyms.
1459
1460C<prop_aliases> does know about the Perl extensions to Unicode properties,
1461such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
1462properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>. The
1463final example demonstrates that the C<"Is_"> prefix is recognized for these
1464extensions; it is needed to resolve ambiguities. For example,
1465C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
1466C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>. This is
1467because C<islc> is a Perl extension which is short for
1468C<General_Category=Cased Letter>. The lists returned for the Perl extensions
1469will not include the C<"Is_"> prefix (whether or not the input had it) unless
1470needed to resolve ambiguities, as shown in the C<"islc"> example, where the
1471returned list had one element containing C<"Is_">, and the other without.
1472
1473It is also possible for the reverse to happen: C<prop_aliases('isc')> returns
1474the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
1475C<(C, Other)> (the latter being a Perl extension meaning
1476C<General_Category=Other>. L<perluniprops> lists the available forms,
1477including which ones are discouraged from use.
1478
1479Those discouraged forms are accepted as input to C<prop_aliases>, but are not
1480returned in the lists. C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
1481which are old synonyms for C<"Is_LC"> and should not be used in new code, are
1482examples of this. These both return C<(Is_LC, Cased_Letter)>. Thus this
1483function allows you to take a discourarged form, and find its acceptable
1484alternatives. The same goes with single-form Block property equivalences.
1485Only the forms that begin with C<"In_"> are not discouraged; if you pass
1486C<prop_aliases> a discouraged form, you will get back the equivalent ones that
1487begin with C<"In_">. It will otherwise look like a new-style block name (see.
1488L</Old-style versus new-style block names>).
1489
1490C<prop_aliases> does not know about any user-defined properties, and will
1491return C<undef> if called with one of those. Likewise for Perl internal
1492properties, with the exception of "Perl_Decimal_Digit" which it does know
1493about (and which is documented below in L</prop_invmap()>).
1494
1495=cut
1496
1497# It may be that there are use cases where the discouraged forms should be
1498# returned. If that comes up, an optional boolean second parameter to the
1499# function could be created, for example.
1500
1501# These are created by mktables for this routine and stored in unicore/UCD.pl
1502# where their structures are described.
1503our %string_property_loose_to_name;
1504our %ambiguous_names;
1505our %loose_perlprop_to_name;
1506our %prop_aliases;
1507
1508sub prop_aliases ($) {
1509 my $prop = $_[0];
1510 return unless defined $prop;
1511
1512 require "unicore/UCD.pl";
1513 require "unicore/Heavy.pl";
1514 require "utf8_heavy.pl";
1515
1516 # The property name may be loosely or strictly matched; we don't know yet.
1517 # But both types use lower-case.
1518 $prop = lc $prop;
1519
1520 # It is loosely matched if its lower case isn't known to be strict.
1521 my $list_ref;
1522 if (! exists $utf8::stricter_to_file_of{$prop}) {
1523 my $loose = utf8::_loose_name($prop);
1524
1525 # There is a hash that converts from any loose name to its standard
1526 # form, mapping all synonyms for a name to one name that can be used
1527 # as a key into another hash. The whole concept is for memory
1528 # savings, as the second hash doesn't have to have all the
1529 # combinations. Actually, there are two hashes that do the
1530 # converstion. One is used in utf8_heavy.pl (stored in Heavy.pl) for
1531 # looking up properties matchable in regexes. This function needs to
1532 # access string properties, which aren't available in regexes, so a
1533 # second conversion hash is made for them (stored in UCD.pl). Look in
1534 # the string one now, as the rest can have an optional 'is' prefix,
1535 # which these don't.
1536 if (exists $string_property_loose_to_name{$loose}) {
1537
1538 # Convert to its standard loose name.
1539 $prop = $string_property_loose_to_name{$loose};
1540 }
1541 else {
1542 my $retrying = 0; # bool. ? Has an initial 'is' been stripped
1543 RETRY:
1544 if (exists $utf8::loose_property_name_of{$loose}
1545 && (! $retrying
1546 || ! exists $ambiguous_names{$loose}))
1547 {
1548 # Found an entry giving the standard form. We don't get here
1549 # (in the test above) when we've stripped off an
1550 # 'is' and the result is an ambiguous name. That is because
1551 # these are official Unicode properties (though Perl can have
1552 # an optional 'is' prefix meaning the official property), and
1553 # all ambiguous cases involve a Perl single-form extension
1554 # for the gc, script, or block properties, and the stripped
1555 # 'is' means that they mean one of those, and not one of
1556 # these
1557 $prop = $utf8::loose_property_name_of{$loose};
1558 }
1559 elsif (exists $loose_perlprop_to_name{$loose}) {
1560
1561 # This hash is specifically for this function to list Perl
1562 # extensions that aren't in the earlier hashes. If there is
1563 # only one element, the short and long names are identical.
1564 # Otherwise the form is already in the same form as
1565 # %prop_aliases, which is handled at the end of the function.
1566 $list_ref = $loose_perlprop_to_name{$loose};
1567 if (@$list_ref == 1) {
1568 my @list = ($list_ref->[0], $list_ref->[0]);
1569 $list_ref = \@list;
1570 }
1571 }
1572 elsif (! exists $utf8::loose_to_file_of{$loose}) {
1573
1574 # loose_to_file_of is a complete list of loose names. If not
1575 # there, the input is unknown.
1576 return;
1577 }
1578 else {
1579
1580 # Here we found the name but not its aliases, so it has to
1581 # exist. This means it must be one of the Perl single-form
1582 # extensions. First see if it is for a property-value
1583 # combination in one of the following properties.
1584 my @list;
1585 foreach my $property ("gc", "script") {
1586 @list = prop_value_aliases($property, $loose);
1587 last if @list;
1588 }
1589 if (@list) {
1590
1591 # Here, it is one of those property-value combination
1592 # single-form synonyms. There are ambiguities with some
1593 # of these. Check against the list for these, and adjust
1594 # if necessary.
1595 for my $i (0 .. @list -1) {
1596 if (exists $ambiguous_names
1597 {utf8::_loose_name(lc $list[$i])})
1598 {
1599 # The ambiguity is resolved by toggling whether or
1600 # not it has an 'is' prefix
1601 $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
1602 }
1603 }
1604 return @list;
1605 }
1606
1607 # Here, it wasn't one of the gc or script single-form
1608 # extensions. It could be a block property single-form
1609 # extension. An 'in' prefix definitely means that, and should
1610 # be looked up without the prefix.
1611 my $began_with_in = $loose =~ s/^in//;
1612 @list = prop_value_aliases("block", $loose);
1613 if (@list) {
1614 map { $_ =~ s/^/In_/ } @list;
1615 return @list;
1616 }
1617
1618 # Here still haven't found it. The last opportunity for it
1619 # being valid is only if it began with 'is'. We retry without
1620 # the 'is', setting a flag to that effect so that we don't
1621 # accept things that begin with 'isis...'
1622 if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
1623 $retrying = 1;
1624 goto RETRY;
1625 }
1626
1627 # Here, didn't find it. Since it was in %loose_to_file_of, we
1628 # should have been able to find it.
1629 carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'. Send bug report to perlbug\@perl.org";
1630 return;
1631 }
1632 }
1633 }
1634
1635 if (! $list_ref) {
1636 # Here, we have set $prop to a standard form name of the input. Look
1637 # it up in the structure created by mktables for this purpose, which
1638 # contains both strict and loosely matched properties. Avoid
1639 # autovivifying.
1640 $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
1641 return unless $list_ref;
1642 }
1643
1644 # The full name is in element 1.
1645 return $list_ref->[1] unless wantarray;
1646
1647 return @{dclone $list_ref};
1648}
1649
1650=pod
1651
1652=head2 B<prop_value_aliases()>
1653
1654 use Unicode::UCD 'prop_value_aliases';
1655
1656 my ($short_name, $full_name, @other_names)
1657 = prop_value_aliases("Gc", "Punct");
1658 my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt
1659 my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
1660 # element
1661 print "The full name is $full_name\n";
1662 print "The short name is $short_name\n";
1663 print "The other aliases are: ", join(", ", @other_names), "\n";
1664
1665 prints:
1666 The full name is Punctuation
1667 The short name is P
1668 The other aliases are: Punct
1669
1670Some Unicode properties have a restricted set of legal values. For example,
1671all binary properties are restricted to just C<true> or C<false>; and there
1672are only a few dozen possible General Categories.
1673
1674For such properties, there are usually several synonyms for each possible
1675value. For example, in binary properties, I<truth> can be represented by any of
1676the strings "Y", "Yes", "T", or "True"; and the General Category
1677"Punctuation" by that string, or "Punct", or simply "P".
1678
1679Like property names, there is typically at least a short name for each such
1680property-value, and a long name. If you know any name of the property-value,
1681you can use C<prop_value_aliases>() to get the long name (when called in
1682scalar context), or a list of all the names, with the short name in the 0th
1683element, the long name in the next element, and any other synonyms in the
1684remaining elements, in no particular order, except that any all-numeric
1685synonyms will be last.
1686
1687The long name is returned in a form nicely capitalized, suitable for printing.
1688
1689Case, white space, hyphens, and underscores are ignored in the input parameters
1690(except for the trailing underscore in the old-form grandfathered-in general
1691category property value C<"L_">, which is better written as C<"LC">).
1692
1693If either name is unknown, C<undef> is returned. Note that Perl typically
1694recognizes property names in regular expressions with an optional C<"Is_>"
1695(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
1696This function does not recognize those in the property parameter, returning
1697C<undef>.
1698
1699If called with a property that doesn't have synonyms for its values, it
1700returns the input value, possibly normalized with capitalization and
1701underscores.
1702
1703For the block property, new-style block names are returned (see
1704L</Old-style versus new-style block names>).
1705
1706To find the synonyms for single-forms, such as C<\p{Any}>, use
1707L</prop_aliases()> instead.
1708
1709C<prop_value_aliases> does not know about any user-defined properties, and
1710will return C<undef> if called with one of those.
1711
1712=cut
1713
1714# These are created by mktables for this routine and stored in unicore/UCD.pl
1715# where their structures are described.
1716our %loose_to_standard_value;
1717our %prop_value_aliases;
1718
1719sub prop_value_aliases ($$) {
1720 my ($prop, $value) = @_;
1721 return unless defined $prop && defined $value;
1722
1723 require "unicore/UCD.pl";
1724 require "utf8_heavy.pl";
1725
1726 # Find the property name synonym that's used as the key in other hashes,
1727 # which is element 0 in the returned list.
1728 ($prop) = prop_aliases($prop);
1729 return if ! $prop;
1730 $prop = utf8::_loose_name(lc $prop);
1731
1732 # Here is a legal property, but the hash below (created by mktables for
1733 # this purpose) only knows about the properties that have a very finite
1734 # number of potential values, that is not ones whose value could be
1735 # anything, like most (if not all) string properties. These don't have
1736 # synonyms anyway. Simply return the input. For example, there is no
1737 # synonym for ('Uppercase_Mapping', A').
1738 return $value if ! exists $prop_value_aliases{$prop};
1739
1740 # The value name may be loosely or strictly matched; we don't know yet.
1741 # But both types use lower-case.
1742 $value = lc $value;
1743
1744 # If the name isn't found under loose matching, it certainly won't be
1745 # found under strict
1746 my $loose_value = utf8::_loose_name($value);
1747 return unless exists $loose_to_standard_value{"$prop=$loose_value"};
1748
1749 # Similarly if the combination under loose matching doesn't exist, it
1750 # won't exist under strict.
1751 my $standard_value = $loose_to_standard_value{"$prop=$loose_value"};
1752 return unless exists $prop_value_aliases{$prop}{$standard_value};
1753
1754 # Here we did find a combination under loose matching rules. But it could
1755 # be that is a strict property match that shouldn't have matched.
1756 # %prop_value_aliases is set up so that the strict matches will appear as
1757 # if they were in loose form. Thus, if the non-loose version is legal,
1758 # we're ok, can skip the further check.
1759 if (! exists $utf8::stricter_to_file_of{"$prop=$value"}
1760
1761 # We're also ok and skip the further check if value loosely matches.
1762 # mktables has verified that no strict name under loose rules maps to
1763 # an existing loose name. This code relies on the very limited
1764 # circumstances that strict names can be here. Strict name matching
1765 # happens under two conditions:
1766 # 1) when the name begins with an underscore. But this function
1767 # doesn't accept those, and %prop_value_aliases doesn't have
1768 # them.
1769 # 2) When the values are numeric, in which case we need to look
1770 # further, but their squeezed-out loose values will be in
1771 # %stricter_to_file_of
1772 && exists $utf8::stricter_to_file_of{"$prop=$loose_value"})
1773 {
1774 # The only thing that's legal loosely under strict is that can have an
1775 # underscore between digit pairs XXX
1776 while ($value =~ s/(\d)_(\d)/$1$2/g) {}
1777 return unless exists $utf8::stricter_to_file_of{"$prop=$value"};
1778 }
1779
1780 # Here, we know that the combination exists. Return it.
1781 my $list_ref = $prop_value_aliases{$prop}{$standard_value};
1782 if (@$list_ref > 1) {
1783 # The full name is in element 1.
1784 return $list_ref->[1] unless wantarray;
1785
1786 return @{dclone $list_ref};
1787 }
1788
1789 return $list_ref->[0] unless wantarray;
1790
1791 # Only 1 element means that it repeats
1792 return ( $list_ref->[0], $list_ref->[0] );
1793}
7319f91d 1794
681d705c
KW
1795# All 1 bits is the largest possible UV.
1796$Unicode::UCD::MAX_CP = ~0;
1797
1798=pod
1799
1800=head2 B<prop_invlist()>
1801
1802C<prop_invlist> returns an inversion list (described below) that defines all the
1803code points for the binary Unicode property (or "property=value" pair) given
1804by the input parameter string:
1805
1806 use feature 'say';
1807 use Unicode::UCD 'prop_invlist';
1808 say join ", ", prop_invlist("Any");
1809
1810 prints:
1811 0, 1114112
1812
1813An empty list is returned if the input is unknown; the number of elements in
1814the list is returned if called in scalar context.
1815
1816L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
1817the list of properties that this function accepts, as well as all the possible
1818forms for them (including with the optional "Is_" prefixes). (Except this
1819function doesn't accept any Perl-internal properties, some of which are listed
1820there.) This function uses the same loose or tighter matching rules for
1821resolving the input property's name as is done for regular expressions. These
1822are also specified in L<perluniprops|perluniprops/Properties accessible
1823through \p{} and \P{}>. Examples of using the "property=value" form are:
1824
1825 say join ", ", prop_invlist("Script=Shavian");
1826
1827 prints:
1828 66640, 66688
1829
1830 say join ", ", prop_invlist("ASCII_Hex_Digit=No");
1831
1832 prints:
1833 0, 48, 58, 65, 71, 97, 103
1834
1835 say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
1836
1837 prints:
1838 48, 58, 65, 71, 97, 103
1839
1840Inversion lists are a compact way of specifying Unicode property-value
1841definitions. The 0th item in the list is the lowest code point that has the
1842property-value. The next item (item [1]) is the lowest code point beyond that
1843one that does NOT have the property-value. And the next item beyond that
1844([2]) is the lowest code point beyond that one that does have the
1845property-value, and so on. Put another way, each element in the list gives
1846the beginning of a range that has the property-value (for even numbered
1847elements), or doesn't have the property-value (for odd numbered elements).
1848The name for this data structure stems from the fact that each element in the
1849list toggles (or inverts) whether the corresponding range is or isn't on the
1850list.
1851
1852In the final example above, the first ASCII Hex digit is code point 48, the
1853character "0", and all code points from it through 57 (a "9") are ASCII hex
1854digits. Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
1855are, as are 97 ("a") through 102 ("f"). 103 starts a range of code points
1856that aren't ASCII hex digits. That range extends to infinity, which on your
1857computer can be found in the variable C<$Unicode::UCD::MAX_CP>. (This
1858variable is as close to infinity as Perl can get on your platform, and may be
1859too high for some operations to work; you may wish to use a smaller number for
1860your purposes.)
1861
1862Note that the inversion lists returned by this function can possibly include
1863non-Unicode code points, that is anything above 0x10FFFF. This is in
1864contrast to Perl regular expression matches on those code points, in which a
1865non-Unicode code point always fails to match. For example, both of these have
1866the same result:
1867
1868 chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails.
1869 chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Fails!
1870
1871And both raise a warning that a Unicode property is being used on a
1872non-Unicode code point. It is arguable as to which is the correct thing to do
1873here. This function has chosen the way opposite to the Perl regular
1874expression behavior. This allows you to easily flip to to the Perl regular
1875expression way (for you to go in the other direction would be far harder).
1876Simply add 0x110000 at the end of the non-empty returned list if it isn't
1877already that value; and pop that value if it is; like:
1878
1879 my @list = prop_invlist("foo");
1880 if (@list) {
1881 if ($list[-1] == 0x110000) {
1882 pop @list; # Defeat the turning on for above Unicode
1883 }
1884 else {
1885 push @list, 0x110000; # Turn off for above Unicode
1886 }
1887 }
1888
1889It is a simple matter to expand out an inversion list to a full list of all
1890code points that have the property-value:
1891
1892 my @invlist = prop_invlist($property_name);
1893 die "empty" unless @invlist;
1894 my @full_list;
1895 for (my $i = 0; $i < @invlist; $i += 2) {
1896 my $upper = ($i + 1) < @invlist
1897 ? $invlist[$i+1] - 1 # In range
1898 : $Unicode::UCD::MAX_CP; # To infinity. You may want
1899 # to stop much much earlier;
1900 # going this high may expose
1901 # perl deficiencies with very
1902 # large numbers.
1903 for my $j ($invlist[$i] .. $upper) {
1904 push @full_list, $j;
1905 }
1906 }
1907
1908C<prop_invlist> does not know about any user-defined nor Perl internal-only
1909properties, and will return C<undef> if called with one of those.
1910
1911=cut
1912
1913# User-defined properties could be handled with some changes to utf8_heavy.pl;
1914# and implementing here of dealing with EXTRAS. If done, consideration should
1915# be given to the fact that the user subroutine could return different results
1916# with each call; security issues need to be thought about.
1917
1918# These are created by mktables for this routine and stored in unicore/UCD.pl
1919# where their structures are described.
1920our %loose_defaults;
1921our $MAX_UNICODE_CODEPOINT;
1922
1923sub prop_invlist ($) {
1924 my $prop = $_[0];
1925 return if ! defined $prop;
1926
1927 require "utf8_heavy.pl";
1928
1929 # Warnings for these are only for regexes, so not applicable to us
1930 no warnings 'deprecated';
1931
1932 # Get the swash definition of the property-value.
1933 my $swash = utf8::SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);
1934
1935 # Fail if not found, or isn't a boolean property-value, or is a
1936 # user-defined property, or is internal-only.
1937 return if ! $swash
1938 || ref $swash eq ""
1939 || $swash->{'BITS'} != 1
1940 || $swash->{'USER_DEFINED'}
1941 || $prop =~ /^\s*_/;
1942
1943 if ($swash->{'EXTRAS'}) {
1944 carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
1945 return;
1946 }
1947 if ($swash->{'SPECIALS'}) {
1948 carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
1949 return;
1950 }
1951
1952 my @invlist;
1953
1954 # The input lines look like:
1955 # 0041\t005A # [26]
1956 # 005F
1957
1958 # Split into lines, stripped of trailing comments
1959 foreach my $range (split "\n",
1960 $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
1961 {
1962 # And find the beginning and end of the range on the line
1963 my ($hex_begin, $hex_end) = split "\t", $range;
1964 my $begin = hex $hex_begin;
1965
1966 # Add the beginning of the range
1967 push @invlist, $begin;
1968
1969 if (defined $hex_end) { # The next item starts with the code point 1
1970 # beyond the end of the range.
1971 push @invlist, hex($hex_end) + 1;
1972 }
1973 else { # No end of range, is a single code point.
1974 push @invlist, $begin + 1;
1975 }
1976 }
1977
1978 require "unicore/UCD.pl";
1979 my $FIRST_NON_UNICODE = $MAX_UNICODE_CODEPOINT + 1;
1980
1981 # Could need to be inverted: add or subtract a 0 at the beginning of the
1982 # list. And to keep it from matching non-Unicode, add or subtract the
1983 # first non-unicode code point.
1984 if ($swash->{'INVERT_IT'}) {
1985 if (@invlist && $invlist[0] == 0) {
1986 shift @invlist;
1987 }
1988 else {
1989 unshift @invlist, 0;
1990 }
1991 if (@invlist && $invlist[-1] == $FIRST_NON_UNICODE) {
1992 pop @invlist;
1993 }
1994 else {
1995 push @invlist, $FIRST_NON_UNICODE;
1996 }
1997 }
1998
1999 # Here, the list is set up to include only Unicode code points. But, if
2000 # the table is the default one for the property, it should contain all
2001 # non-Unicode code points. First calculate the loose name for the
2002 # property. This is done even for strict-name properties, as the data
2003 # structure that mktables generates for us is set up so that we don't have
2004 # to worry about that. The property-value needs to be split if compound,
2005 # as the loose rules need to be independently calculated on each part. We
2006 # know that it is syntactically valid, or SWASHNEW would have failed.
2007
2008 $prop = lc $prop;
2009 my ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
2010 if ($table) {
2011
2012 # May have optional prefixed 'is'
2013 $prop = utf8::_loose_name($prop_only) =~ s/^is//r;
2014 $prop = $utf8::loose_property_name_of{$prop};
2015 $prop .= "=" . utf8::_loose_name($table);
2016 }
2017 else {
2018 $prop = utf8::_loose_name($prop);
2019 }
2020 if (exists $loose_defaults{$prop}) {
2021
2022 # Here, is the default table. If a range ended with 10ffff, instead
2023 # continue that range to infinity, by popping the 110000; otherwise,
2024 # add the range from 11000 to infinity
2025 if (! @invlist || $invlist[-1] != $FIRST_NON_UNICODE) {
2026 push @invlist, $FIRST_NON_UNICODE;
2027 }
2028 else {
2029 pop @invlist;
2030 }
2031 }
2032
2033 return @invlist;
2034}
7319f91d 2035
55d7b906 2036=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 2037
a452d459
KW
2038This returns the version of the Unicode Character Database, in other words, the
2039version of the Unicode standard the database implements. The version is a
2040string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
2041
2042=cut
2043
2044my $UNICODEVERSION;
2045
2046sub UnicodeVersion {
2047 unless (defined $UNICODEVERSION) {
2048 openunicode(\$VERSIONFH, "version");
2049 chomp($UNICODEVERSION = <$VERSIONFH>);
2050 close($VERSIONFH);
2051 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
2052 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
2053 }
2054 return $UNICODEVERSION;
2055}
3aa957f9 2056
a452d459
KW
2057=head2 B<Blocks versus Scripts>
2058
2059The difference between a block and a script is that scripts are closer
2060to the linguistic notion of a set of code points required to present
2061languages, while block is more of an artifact of the Unicode code point
2062numbering and separation into blocks of (mostly) 256 code points.
2063
2064For example the Latin B<script> is spread over several B<blocks>, such
2065as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
2066C<Latin Extended-B>. On the other hand, the Latin script does not
2067contain all the characters of the C<Basic Latin> block (also known as
2068ASCII): it includes only the letters, and not, for example, the digits
2069or the punctuation.
2070
2071For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
2072
2073For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
2074
2075=head2 B<Matching Scripts and Blocks>
2076
2077Scripts are matched with the regular-expression construct
2078C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
f200dd12 2079while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
a452d459
KW
2080any of the 256 code points in the Tibetan block).
2081
430fe03d
KW
2082=head2 Old-style versus new-style block names
2083
2084Unicode publishes the names of blocks in two different styles, though the two
2085are equivalent under Unicode's loose matching rules.
2086
2087The original style uses blanks and hyphens in the block names (except for
2088C<No_Block>), like so:
2089
2090 Miscellaneous Mathematical Symbols-B
2091
2092The newer style replaces these with underscores, like this:
2093
2094 Miscellaneous_Mathematical_Symbols_B
2095
2096This newer style is consistent with the values of other Unicode properties.
2097To preserve backward compatibility, all the functions in Unicode::UCD that
2098return block names (except one) return the old-style ones. That one function,
2099L</prop_value_aliases()> can be used to convert from old-style to new-style:
2100
2101 my $new_style = prop_values_aliases("block", $old_style);
2102
2103Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>,
2104meaning C<Block=Cyrillic>. These have always been written in the new style.
2105
2106To convert from new-style to old-style, follow this recipe:
2107
2108 $old_style = charblock((prop_invlist("block=$new_style"))[0]);
2109
2110(which finds the range of code points in the block using C<prop_invlist>,
2111gets the lower end of the range (0th element) and then looks up the old name
2112for its block using C<charblock>).
2113
8b731da2
JH
2114=head1 BUGS
2115
2116Does not yet support EBCDIC platforms.
2117
561c79ed
JH
2118=head1 AUTHOR
2119
a18e976f 2120Jarkko Hietaniemi. Now maintained by perl5 porters.
561c79ed
JH
2121
2122=cut
2123
21241;