This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Unicode::UCD: Add prop_invlist()
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
36c2430c 5no warnings 'surrogate'; # surrogates can be inputs to this
98ef7649 6use charnames ();
94c91ffc 7use Unicode::Normalize qw(getCombinClass NFD);
561c79ed 8
7ef25837 9our $VERSION = '0.37';
561c79ed 10
741297c1
JH
11use Storable qw(dclone);
12
561c79ed
JH
13require Exporter;
14
15our @ISA = qw(Exporter);
74f8133e 16
10a6ecd2
JH
17our @EXPORT_OK = qw(charinfo
18 charblock charscript
19 charblocks charscripts
b08cd201 20 charinrange
ea508aee 21 general_categories bidi_types
b08cd201 22 compexcl
a2bd7410 23 casefold casespec
7319f91d
KW
24 namedseq
25 num
7ef25837
KW
26 prop_aliases
27 prop_value_aliases
681d705c
KW
28 prop_invlist
29 MAX_CP
7319f91d 30 );
561c79ed
JH
31
32use Carp;
33
34=head1 NAME
35
55d7b906 36Unicode::UCD - Unicode character database
561c79ed
JH
37
38=head1 SYNOPSIS
39
55d7b906 40 use Unicode::UCD 'charinfo';
b08cd201 41 my $charinfo = charinfo($codepoint);
561c79ed 42
956cae9a
KW
43 use Unicode::UCD 'casefold';
44 my $casefold = casefold(0xFB00);
45
5d8e6e41
KW
46 use Unicode::UCD 'casespec';
47 my $casespec = casespec(0xFB00);
48
55d7b906 49 use Unicode::UCD 'charblock';
e882dd67
JH
50 my $charblock = charblock($codepoint);
51
55d7b906 52 use Unicode::UCD 'charscript';
65044554 53 my $charscript = charscript($codepoint);
561c79ed 54
55d7b906 55 use Unicode::UCD 'charblocks';
e145285f
JH
56 my $charblocks = charblocks();
57
55d7b906 58 use Unicode::UCD 'charscripts';
ea508aee 59 my $charscripts = charscripts();
e145285f 60
55d7b906 61 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
62 my $range = charscript($script);
63 print "looks like $script\n" if charinrange($range, $codepoint);
64
ea508aee
JH
65 use Unicode::UCD qw(general_categories bidi_types);
66 my $categories = general_categories();
67 my $types = bidi_types();
68
7ef25837
KW
69 use Unicode::UCD 'prop_aliases';
70 my @space_names = prop_aliases("space");
71
72 use Unicode::UCD 'prop_value_aliases';
73 my @gc_punct_names = prop_value_aliases("Gc", "Punct");
74
681d705c
KW
75 use Unicode::UCD 'prop_invlist';
76 my @puncts = prop_invlist("gc=punctuation");
77
55d7b906 78 use Unicode::UCD 'compexcl';
e145285f
JH
79 my $compexcl = compexcl($codepoint);
80
a2bd7410
JH
81 use Unicode::UCD 'namedseq';
82 my $namedseq = namedseq($named_sequence_name);
83
55d7b906 84 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 85
7319f91d 86 my $convert_to_numeric =
62a8c8c2 87 Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
7319f91d 88
561c79ed
JH
89=head1 DESCRIPTION
90
a452d459
KW
91The Unicode::UCD module offers a series of functions that
92provide a simple interface to the Unicode
8b731da2 93Character Database.
561c79ed 94
a452d459
KW
95=head2 code point argument
96
97Some of the functions are called with a I<code point argument>, which is either
98a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
99followed by hexadecimals designating a Unicode code point. In other words, if
100you want a code point to be interpreted as a hexadecimal number, you must
101prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
f200dd12
KW
102interpreted as a decimal code point. Note that the largest code point in
103Unicode is U+10FFFF.
561c79ed
JH
104=cut
105
10a6ecd2 106my $BLOCKSFH;
10a6ecd2 107my $VERSIONFH;
b08cd201
JH
108my $CASEFOLDFH;
109my $CASESPECFH;
a2bd7410 110my $NAMEDSEQFH;
561c79ed
JH
111
112sub openunicode {
113 my ($rfh, @path) = @_;
114 my $f;
115 unless (defined $$rfh) {
116 for my $d (@INC) {
117 use File::Spec;
55d7b906 118 $f = File::Spec->catfile($d, "unicore", @path);
32c16050 119 last if open($$rfh, $f);
e882dd67 120 undef $f;
561c79ed 121 }
e882dd67
JH
122 croak __PACKAGE__, ": failed to find ",
123 File::Spec->catfile(@path), " in @INC"
124 unless defined $f;
561c79ed
JH
125 }
126 return $f;
127}
128
a452d459 129=head2 B<charinfo()>
561c79ed 130
55d7b906 131 use Unicode::UCD 'charinfo';
561c79ed 132
b08cd201 133 my $charinfo = charinfo(0x41);
561c79ed 134
a452d459
KW
135This returns information about the input L</code point argument>
136as a reference to a hash of fields as defined by the Unicode
137standard. If the L</code point argument> is not assigned in the standard
138(i.e., has the general category C<Cn> meaning C<Unassigned>)
139or is a non-character (meaning it is guaranteed to never be assigned in
140the standard),
a18e976f 141C<undef> is returned.
a452d459
KW
142
143Fields that aren't applicable to the particular code point argument exist in the
144returned hash, and are empty.
145
146The keys in the hash with the meanings of their values are:
147
148=over
149
150=item B<code>
151
152the input L</code point argument> expressed in hexadecimal, with leading zeros
153added if necessary to make it contain at least four hexdigits
154
155=item B<name>
156
157name of I<code>, all IN UPPER CASE.
158Some control-type code points do not have names.
159This field will be empty for C<Surrogate> and C<Private Use> code points,
160and for the others without a name,
161it will contain a description enclosed in angle brackets, like
162C<E<lt>controlE<gt>>.
163
164
165=item B<category>
166
167The short name of the general category of I<code>.
168This will match one of the keys in the hash returned by L</general_categories()>.
169
7ef25837
KW
170The L</prop_value_aliases()> function can be used to get all the synonyms
171of the category name.
172
a452d459
KW
173=item B<combining>
174
175the combining class number for I<code> used in the Canonical Ordering Algorithm.
176For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
177available at
178L<http://www.unicode.org/versions/Unicode5.1.0/>
179
7ef25837
KW
180The L</prop_value_aliases()> function can be used to get all the synonyms
181of the combining class number.
182
a452d459
KW
183=item B<bidi>
184
185bidirectional type of I<code>.
186This will match one of the keys in the hash returned by L</bidi_types()>.
187
7ef25837
KW
188The L</prop_value_aliases()> function can be used to get all the synonyms
189of the bidi type name.
190
a452d459
KW
191=item B<decomposition>
192
193is empty if I<code> has no decomposition; or is one or more codes
a18e976f 194(separated by spaces) that, taken in order, represent a decomposition for
a452d459
KW
195I<code>. Each has at least four hexdigits.
196The codes may be preceded by a word enclosed in angle brackets then a space,
197like C<E<lt>compatE<gt> >, giving the type of decomposition
198
06bba7d5
KW
199This decomposition may be an intermediate one whose components are also
200decomposable. Use L<Unicode::Normalize> to get the final decomposition.
201
a452d459
KW
202=item B<decimal>
203
204if I<code> is a decimal digit this is its integer numeric value
205
206=item B<digit>
207
89e4a205
KW
208if I<code> represents some other digit-like number, this is its integer
209numeric value
a452d459
KW
210
211=item B<numeric>
212
213if I<code> represents a whole or rational number, this is its numeric value.
214Rational values are expressed as a string like C<1/4>.
215
216=item B<mirrored>
217
218C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
219
220=item B<unicode10>
221
222name of I<code> in the Unicode 1.0 standard if one
223existed for this code point and is different from the current name
224
225=item B<comment>
226
89e4a205 227As of Unicode 6.0, this is always empty.
a452d459
KW
228
229=item B<upper>
230
06bba7d5 231is empty if there is no single code point uppercase mapping for I<code>
4f66642e 232(its uppercase mapping is itself);
a452d459
KW
233otherwise it is that mapping expressed as at least four hexdigits.
234(L</casespec()> should be used in addition to B<charinfo()>
235for case mappings when the calling program can cope with multiple code point
236mappings.)
237
238=item B<lower>
239
06bba7d5 240is empty if there is no single code point lowercase mapping for I<code>
4f66642e 241(its lowercase mapping is itself);
a452d459
KW
242otherwise it is that mapping expressed as at least four hexdigits.
243(L</casespec()> should be used in addition to B<charinfo()>
244for case mappings when the calling program can cope with multiple code point
245mappings.)
246
247=item B<title>
248
06bba7d5 249is empty if there is no single code point titlecase mapping for I<code>
4f66642e 250(its titlecase mapping is itself);
a452d459
KW
251otherwise it is that mapping expressed as at least four hexdigits.
252(L</casespec()> should be used in addition to B<charinfo()>
253for case mappings when the calling program can cope with multiple code point
254mappings.)
255
256=item B<block>
257
a18e976f 258the block I<code> belongs to (used in C<\p{Blk=...}>).
a452d459
KW
259See L</Blocks versus Scripts>.
260
261
262=item B<script>
263
a18e976f 264the script I<code> belongs to.
a452d459
KW
265See L</Blocks versus Scripts>.
266
267=back
32c16050
JH
268
269Note that you cannot do (de)composition and casing based solely on the
a452d459
KW
270I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
271you will need also the L</compexcl()>, and L</casespec()> functions.
561c79ed
JH
272
273=cut
274
e10d7780 275# NB: This function is nearly duplicated in charnames.pm
10a6ecd2
JH
276sub _getcode {
277 my $arg = shift;
278
dc0a4417 279 if ($arg =~ /^[1-9]\d*$/) {
10a6ecd2 280 return $arg;
dc0a4417 281 } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
10a6ecd2
JH
282 return hex($1);
283 }
284
285 return;
286}
287
05dbc6f8
KW
288# Populated by _num. Converts real number back to input rational
289my %real_to_rational;
290
291# To store the contents of files found on disk.
292my @BIDIS;
293my @CATEGORIES;
294my @DECOMPOSITIONS;
295my @NUMERIC_TYPES;
5c3b35c9
KW
296my %SIMPLE_LOWER;
297my %SIMPLE_TITLE;
298my %SIMPLE_UPPER;
299my %UNICODE_1_NAMES;
05dbc6f8
KW
300
301sub _charinfo_case {
302
303 # Returns the value to set into one of the case fields in the charinfo
304 # structure.
305 # $char is the character,
306 # $cased is the case-changed character
307 # $file is the file in lib/unicore/To/$file that contains the data
308 # needed for this, in the form that _search() understands.
5c3b35c9 309 # $hash_ref points to the hash holding the contents of $file. It will
05dbc6f8
KW
310 # be populated if empty.
311 # By using the 'uc', etc. functions, we avoid loading more files into
312 # memory except for those rare cases where the simple casing (which has
313 # been what charinfo() has always returned, is different than the full
314 # casing.
5c3b35c9 315 my ($char, $cased, $file, $hash_ref) = @_;
05dbc6f8
KW
316
317 return "" if $cased eq $char;
318
319 return sprintf("%04X", ord $cased) if length($cased) == 1;
320
5c3b35c9
KW
321 %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
322 return $hash_ref->{ord $char} // "";
a6fa416b
TS
323}
324
05dbc6f8 325sub charinfo {
a6fa416b 326
05dbc6f8
KW
327 # This function has traditionally mimicked what is in UnicodeData.txt,
328 # warts and all. This is a re-write that avoids UnicodeData.txt so that
329 # it can be removed to save disk space. Instead, this assembles
330 # information gotten by other methods that get data from various other
331 # files. It uses charnames to get the character name; and various
332 # mktables tables.
324f9e44 333
05dbc6f8 334 use feature 'unicode_strings';
a6fa416b 335
10a6ecd2
JH
336 my $arg = shift;
337 my $code = _getcode($arg);
05dbc6f8
KW
338 croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
339
340 # Non-unicode implies undef.
341 return if $code > 0x10FFFF;
342
343 my %prop;
344 my $char = chr($code);
345
346 @CATEGORIES =_read_table("unicore/To/Gc.pl") unless @CATEGORIES;
347 $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
348 // $utf8::SwashInfo{'ToGc'}{'missing'};
349
350 return if $prop{'category'} eq 'Cn'; # Unassigned code points are undef
351
352 $prop{'code'} = sprintf "%04X", $code;
353 $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
354 : (charnames::viacode($code) // "");
355
356 $prop{'combining'} = getCombinClass($code);
357
358 @BIDIS =_read_table("unicore/To/Bc.pl") unless @BIDIS;
359 $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
360 // $utf8::SwashInfo{'ToBc'}{'missing'};
361
362 # For most code points, we can just read in "unicore/Decomposition.pl", as
363 # its contents are exactly what should be output. But that file doesn't
364 # contain the data for the Hangul syllable decompositions, which can be
94c91ffc
KW
365 # algorithmically computed, and NFD() does that, so we call NFD() for
366 # those. We can't use NFD() for everything, as it does a complete
05dbc6f8 367 # recursive decomposition, and what this function has always done is to
94c91ffc
KW
368 # return what's in UnicodeData.txt which doesn't show that recursiveness.
369 # Fortunately, the NFD() of the Hanguls doesn't have any recursion
370 # issues.
371 # Having no decomposition implies an empty field; otherwise, all but
372 # "Canonical" imply a compatible decomposition, and the type is prefixed
373 # to that, as it is in UnicodeData.txt
05dbc6f8
KW
374 if ($char =~ /\p{Block=Hangul_Syllables}/) {
375 # The code points of the decomposition are output in standard Unicode
376 # hex format, separated by blanks.
377 $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
94c91ffc 378 unpack "U*", NFD($char);
a6fa416b 379 }
05dbc6f8
KW
380 else {
381 @DECOMPOSITIONS = _read_table("unicore/Decomposition.pl")
382 unless @DECOMPOSITIONS;
383 $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
384 $code) // "";
561c79ed 385 }
05dbc6f8
KW
386
387 # Can use num() to get the numeric values, if any.
388 if (! defined (my $value = num($char))) {
389 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
390 }
391 else {
392 if ($char =~ /\d/) {
393 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
394 }
395 else {
396
397 # For non-decimal-digits, we have to read in the Numeric type
398 # to distinguish them. It is not just a matter of integer vs.
399 # rational, as some whole number values are not considered digits,
400 # e.g., TAMIL NUMBER TEN.
401 $prop{'decimal'} = "";
402
403 @NUMERIC_TYPES =_read_table("unicore/To/Nt.pl")
404 unless @NUMERIC_TYPES;
405 if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
406 eq 'Digit')
407 {
408 $prop{'digit'} = $prop{'numeric'} = $value;
409 }
410 else {
411 $prop{'digit'} = "";
412 $prop{'numeric'} = $real_to_rational{$value} // $value;
413 }
414 }
415 }
416
417 $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
418
5c3b35c9
KW
419 %UNICODE_1_NAMES =_read_table("unicore/To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
420 $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
05dbc6f8
KW
421
422 # This is true starting in 6.0, but, num() also requires 6.0, so
423 # don't need to test for version again here.
424 $prop{'comment'} = "";
425
5c3b35c9
KW
426 $prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \%SIMPLE_UPPER);
427 $prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \%SIMPLE_LOWER);
05dbc6f8 428 $prop{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
5c3b35c9 429 \%SIMPLE_TITLE);
05dbc6f8
KW
430
431 $prop{block} = charblock($code);
432 $prop{script} = charscript($code);
433 return \%prop;
561c79ed
JH
434}
435
e882dd67
JH
436sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
437 my ($table, $lo, $hi, $code) = @_;
438
439 return if $lo > $hi;
440
441 my $mid = int(($lo+$hi) / 2);
442
443 if ($table->[$mid]->[0] < $code) {
10a6ecd2 444 if ($table->[$mid]->[1] >= $code) {
e882dd67
JH
445 return $table->[$mid]->[2];
446 } else {
447 _search($table, $mid + 1, $hi, $code);
448 }
449 } elsif ($table->[$mid]->[0] > $code) {
450 _search($table, $lo, $mid - 1, $code);
451 } else {
452 return $table->[$mid]->[2];
453 }
454}
455
cb366075 456sub _read_table ($;$) {
3a12600d
KW
457
458 # Returns the contents of the mktables generated table file located at $1
cb366075
KW
459 # in the form of either an array of arrays or a hash, depending on if the
460 # optional second parameter is true (for hash return) or not. In the case
461 # of a hash return, each key is a code point, and its corresponding value
462 # is what the table gives as the code point's corresponding value. In the
463 # case of an array return, each outer array denotes a range with [0] the
464 # start point of that range; [1] the end point; and [2] the value that
465 # every code point in the range has. The hash return is useful for fast
466 # lookup when the table contains only single code point ranges. The array
467 # return takes much less memory when there are large ranges.
3a12600d 468 #
cb366075 469 # This function has the side effect of setting
3a12600d
KW
470 # $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
471 # table; and
472 # $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
473 # not listed in the table.
474 # where $property is the Unicode property name, preceded by 'To' for map
475 # properties., e.g., 'ToSc'.
476 #
477 # Table entries look like one of:
478 # 0000 0040 Common # [65]
479 # 00AA Latin
480
481 my $table = shift;
cb366075
KW
482 my $return_hash = shift;
483 $return_hash = 0 unless defined $return_hash;
3a12600d 484 my @return;
cb366075 485 my %return;
3a12600d
KW
486 local $_;
487
488 for (split /^/m, do $table) {
489 my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
490 \s* ( \# .* )? # Optional comment
491 $ /x;
83fd1222
KW
492 my $decimal_start = hex $start;
493 my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
cb366075 494 if ($return_hash) {
83fd1222 495 foreach my $i ($decimal_start .. $decimal_end) {
cb366075
KW
496 $return{$i} = $value;
497 }
498 }
9a96c106
KW
499 elsif (@return &&
500 $return[-1][1] == $decimal_start - 1
501 && $return[-1][2] eq $value)
502 {
503 # If this is merely extending the previous range, do just that.
504 $return[-1]->[1] = $decimal_end;
505 }
cb366075 506 else {
83fd1222 507 push @return, [ $decimal_start, $decimal_end, $value ];
cb366075 508 }
3a12600d 509 }
cb366075 510 return ($return_hash) ? %return : @return;
3a12600d
KW
511}
512
10a6ecd2
JH
513sub charinrange {
514 my ($range, $arg) = @_;
515 my $code = _getcode($arg);
516 croak __PACKAGE__, "::charinrange: unknown code '$arg'"
517 unless defined $code;
518 _search($range, 0, $#$range, $code);
519}
520
a452d459 521=head2 B<charblock()>
561c79ed 522
55d7b906 523 use Unicode::UCD 'charblock';
561c79ed
JH
524
525 my $charblock = charblock(0x41);
10a6ecd2 526 my $charblock = charblock(1234);
a452d459 527 my $charblock = charblock(0x263a);
10a6ecd2
JH
528 my $charblock = charblock("U+263a");
529
78bf21c2 530 my $range = charblock('Armenian');
10a6ecd2 531
a452d459
KW
532With a L</code point argument> charblock() returns the I<block> the code point
533belongs to, e.g. C<Basic Latin>.
534If the code point is unassigned, this returns the block it would belong to if
a18e976f 535it were assigned.
10a6ecd2 536
78bf21c2
JH
537See also L</Blocks versus Scripts>.
538
18972f4b 539If supplied with an argument that can't be a code point, charblock() tries to
a18e976f
KW
540do the opposite and interpret the argument as a block name. The return value
541is a I<range set> with one range: an anonymous list with a single element that
542consists of another anonymous list whose first element is the first code point
543in the block, and whose second (and final) element is the final code point in
544the block. (The extra list consisting of just one element is so that the same
545program logic can be used to handle both this return, and the return from
546L</charscript()> which can have multiple ranges.) You can test whether a code
547point is in a range using the L</charinrange()> function. If the argument is
548not a known block, C<undef> is returned.
561c79ed 549
561c79ed
JH
550=cut
551
552my @BLOCKS;
10a6ecd2 553my %BLOCKS;
561c79ed 554
10a6ecd2 555sub _charblocks {
06bba7d5
KW
556
557 # Can't read from the mktables table because it loses the hyphens in the
558 # original.
561c79ed 559 unless (@BLOCKS) {
10a6ecd2 560 if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
6c8d78fb 561 local $_;
10a6ecd2 562 while (<$BLOCKSFH>) {
2796c109 563 if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
10a6ecd2
JH
564 my ($lo, $hi) = (hex($1), hex($2));
565 my $subrange = [ $lo, $hi, $3 ];
566 push @BLOCKS, $subrange;
567 push @{$BLOCKS{$3}}, $subrange;
561c79ed
JH
568 }
569 }
10a6ecd2 570 close($BLOCKSFH);
561c79ed
JH
571 }
572 }
10a6ecd2
JH
573}
574
575sub charblock {
576 my $arg = shift;
577
578 _charblocks() unless @BLOCKS;
579
580 my $code = _getcode($arg);
561c79ed 581
10a6ecd2 582 if (defined $code) {
c707cf8e
KW
583 my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
584 return $result if defined $result;
585 return 'No_Block';
586 }
587 elsif (exists $BLOCKS{$arg}) {
588 return dclone $BLOCKS{$arg};
10a6ecd2 589 }
e882dd67
JH
590}
591
a452d459 592=head2 B<charscript()>
e882dd67 593
55d7b906 594 use Unicode::UCD 'charscript';
e882dd67
JH
595
596 my $charscript = charscript(0x41);
10a6ecd2
JH
597 my $charscript = charscript(1234);
598 my $charscript = charscript("U+263a");
e882dd67 599
78bf21c2 600 my $range = charscript('Thai');
10a6ecd2 601
a452d459
KW
602With a L</code point argument> charscript() returns the I<script> the
603code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
bb2d29dc 604If the code point is unassigned, it returns C<"Unknown">.
78bf21c2 605
eb0cc9e3 606If supplied with an argument that can't be a code point, charscript() tries
a18e976f
KW
607to do the opposite and interpret the argument as a script name. The
608return value is a I<range set>: an anonymous list of lists that contain
eb0cc9e3 609I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a18e976f
KW
610code point is in a range set using the L</charinrange()> function. If the
611argument is not a known script, C<undef> is returned.
a452d459
KW
612
613See also L</Blocks versus Scripts>.
e882dd67 614
e882dd67
JH
615=cut
616
617my @SCRIPTS;
10a6ecd2 618my %SCRIPTS;
e882dd67 619
10a6ecd2 620sub _charscripts {
7bccef0b
KW
621 @SCRIPTS =_read_table("unicore/To/Sc.pl") unless @SCRIPTS;
622 foreach my $entry (@SCRIPTS) {
f3d50ac9 623 $entry->[2] =~ s/(_\w)/\L$1/g; # Preserve old-style casing
7bccef0b 624 push @{$SCRIPTS{$entry->[2]}}, $entry;
e882dd67 625 }
10a6ecd2
JH
626}
627
628sub charscript {
629 my $arg = shift;
630
631 _charscripts() unless @SCRIPTS;
e882dd67 632
10a6ecd2
JH
633 my $code = _getcode($arg);
634
635 if (defined $code) {
7bccef0b
KW
636 my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
637 return $result if defined $result;
8079ad82 638 return $utf8::SwashInfo{'ToSc'}{'missing'};
7bccef0b
KW
639 } elsif (exists $SCRIPTS{$arg}) {
640 return dclone $SCRIPTS{$arg};
10a6ecd2 641 }
7bccef0b
KW
642
643 return;
10a6ecd2
JH
644}
645
a452d459 646=head2 B<charblocks()>
10a6ecd2 647
55d7b906 648 use Unicode::UCD 'charblocks';
10a6ecd2 649
b08cd201 650 my $charblocks = charblocks();
10a6ecd2 651
b08cd201 652charblocks() returns a reference to a hash with the known block names
a452d459 653as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 654
78bf21c2
JH
655See also L</Blocks versus Scripts>.
656
10a6ecd2
JH
657=cut
658
659sub charblocks {
b08cd201 660 _charblocks() unless %BLOCKS;
741297c1 661 return dclone \%BLOCKS;
10a6ecd2
JH
662}
663
a452d459 664=head2 B<charscripts()>
10a6ecd2 665
55d7b906 666 use Unicode::UCD 'charscripts';
10a6ecd2 667
ea508aee 668 my $charscripts = charscripts();
10a6ecd2 669
ea508aee 670charscripts() returns a reference to a hash with the known script
a452d459 671names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 672the values.
10a6ecd2 673
78bf21c2
JH
674See also L</Blocks versus Scripts>.
675
10a6ecd2
JH
676=cut
677
678sub charscripts {
b08cd201 679 _charscripts() unless %SCRIPTS;
741297c1 680 return dclone \%SCRIPTS;
561c79ed
JH
681}
682
a452d459 683=head2 B<charinrange()>
10a6ecd2 684
f200dd12 685In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
10a6ecd2 686can also test whether a code point is in the I<range> as returned by
a452d459
KW
687L</charblock()> and L</charscript()> or as the values of the hash returned
688by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 689
55d7b906 690 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
691
692 $range = charscript('Hiragana');
e145285f 693 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
694
695=cut
696
ea508aee
JH
697my %GENERAL_CATEGORIES =
698 (
699 'L' => 'Letter',
700 'LC' => 'CasedLetter',
701 'Lu' => 'UppercaseLetter',
702 'Ll' => 'LowercaseLetter',
703 'Lt' => 'TitlecaseLetter',
704 'Lm' => 'ModifierLetter',
705 'Lo' => 'OtherLetter',
706 'M' => 'Mark',
707 'Mn' => 'NonspacingMark',
708 'Mc' => 'SpacingMark',
709 'Me' => 'EnclosingMark',
710 'N' => 'Number',
711 'Nd' => 'DecimalNumber',
712 'Nl' => 'LetterNumber',
713 'No' => 'OtherNumber',
714 'P' => 'Punctuation',
715 'Pc' => 'ConnectorPunctuation',
716 'Pd' => 'DashPunctuation',
717 'Ps' => 'OpenPunctuation',
718 'Pe' => 'ClosePunctuation',
719 'Pi' => 'InitialPunctuation',
720 'Pf' => 'FinalPunctuation',
721 'Po' => 'OtherPunctuation',
722 'S' => 'Symbol',
723 'Sm' => 'MathSymbol',
724 'Sc' => 'CurrencySymbol',
725 'Sk' => 'ModifierSymbol',
726 'So' => 'OtherSymbol',
727 'Z' => 'Separator',
728 'Zs' => 'SpaceSeparator',
729 'Zl' => 'LineSeparator',
730 'Zp' => 'ParagraphSeparator',
731 'C' => 'Other',
732 'Cc' => 'Control',
733 'Cf' => 'Format',
734 'Cs' => 'Surrogate',
735 'Co' => 'PrivateUse',
736 'Cn' => 'Unassigned',
737 );
738
739sub general_categories {
740 return dclone \%GENERAL_CATEGORIES;
741}
742
a452d459 743=head2 B<general_categories()>
ea508aee
JH
744
745 use Unicode::UCD 'general_categories';
746
747 my $categories = general_categories();
748
a452d459 749This returns a reference to a hash which has short
ea508aee
JH
750general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
751names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
752C<Symbol>) as values. The hash is reversible in case you need to go
753from the long names to the short names. The general category is the
a452d459
KW
754one returned from
755L</charinfo()> under the C<category> key.
ea508aee 756
7ef25837
KW
757The L</prop_value_aliases()> function can be used to get all the synonyms of
758the category name.
759
ea508aee
JH
760=cut
761
762my %BIDI_TYPES =
763 (
764 'L' => 'Left-to-Right',
765 'LRE' => 'Left-to-Right Embedding',
766 'LRO' => 'Left-to-Right Override',
767 'R' => 'Right-to-Left',
768 'AL' => 'Right-to-Left Arabic',
769 'RLE' => 'Right-to-Left Embedding',
770 'RLO' => 'Right-to-Left Override',
771 'PDF' => 'Pop Directional Format',
772 'EN' => 'European Number',
773 'ES' => 'European Number Separator',
774 'ET' => 'European Number Terminator',
775 'AN' => 'Arabic Number',
776 'CS' => 'Common Number Separator',
777 'NSM' => 'Non-Spacing Mark',
778 'BN' => 'Boundary Neutral',
779 'B' => 'Paragraph Separator',
780 'S' => 'Segment Separator',
781 'WS' => 'Whitespace',
782 'ON' => 'Other Neutrals',
783 );
784
a452d459 785=head2 B<bidi_types()>
ea508aee
JH
786
787 use Unicode::UCD 'bidi_types';
788
789 my $categories = bidi_types();
790
a452d459 791This returns a reference to a hash which has the short
ea508aee
JH
792bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
793names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
794hash is reversible in case you need to go from the long names to the
a452d459
KW
795short names. The bidi type is the one returned from
796L</charinfo()>
ea508aee
JH
797under the C<bidi> key. For the exact meaning of the various bidi classes
798the Unicode TR9 is recommended reading:
a452d459 799L<http://www.unicode.org/reports/tr9/>
ea508aee
JH
800(as of Unicode 5.0.0)
801
7ef25837
KW
802The L</prop_value_aliases()> function can be used to get all the synonyms of
803the bidi type name.
804
ea508aee
JH
805=cut
806
a452d459
KW
807sub bidi_types {
808 return dclone \%BIDI_TYPES;
809}
810
811=head2 B<compexcl()>
b08cd201 812
55d7b906 813 use Unicode::UCD 'compexcl';
b08cd201 814
a452d459 815 my $compexcl = compexcl(0x09dc);
b08cd201 816
71a442a8
KW
817This routine is included for backwards compatibility, but as of Perl 5.12, for
818most purposes it is probably more convenient to use one of the following
819instead:
820
821 my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
822 my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
823
824or even
825
826 my $compexcl = chr(0x09dc) =~ /\p{CE};
827 my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
828
829The first two forms return B<true> if the L</code point argument> should not
76b05678
KW
830be produced by composition normalization. For the final two forms to return
831B<true>, it is additionally required that this fact not otherwise be
832determinable from the Unicode data base.
71a442a8
KW
833
834This routine behaves identically to the final two forms. That is,
835it does not return B<true> if the code point has a decomposition
a452d459
KW
836consisting of another single code point, nor if its decomposition starts
837with a code point whose combining class is non-zero. Code points that meet
838either of these conditions should also not be produced by composition
71a442a8
KW
839normalization, which is probably why you should use the
840C<Full_Composition_Exclusion> property instead, as shown above.
b08cd201 841
71a442a8 842The routine returns B<false> otherwise.
b08cd201
JH
843
844=cut
845
b08cd201
JH
846sub compexcl {
847 my $arg = shift;
848 my $code = _getcode($arg);
74f8133e
JH
849 croak __PACKAGE__, "::compexcl: unknown code '$arg'"
850 unless defined $code;
b08cd201 851
36c2430c 852 no warnings "non_unicode"; # So works on non-Unicode code points
71a442a8 853 return chr($code) =~ /\p{Composition_Exclusion}/;
b08cd201
JH
854}
855
a452d459 856=head2 B<casefold()>
b08cd201 857
55d7b906 858 use Unicode::UCD 'casefold';
b08cd201 859
a452d459
KW
860 my $casefold = casefold(0xDF);
861 if (defined $casefold) {
862 my @full_fold_hex = split / /, $casefold->{'full'};
863 my $full_fold_string =
864 join "", map {chr(hex($_))} @full_fold_hex;
865 my @turkic_fold_hex =
866 split / /, ($casefold->{'turkic'} ne "")
867 ? $casefold->{'turkic'}
868 : $casefold->{'full'};
869 my $turkic_fold_string =
870 join "", map {chr(hex($_))} @turkic_fold_hex;
871 }
872 if (defined $casefold && $casefold->{'simple'} ne "") {
873 my $simple_fold_hex = $casefold->{'simple'};
874 my $simple_fold_string = chr(hex($simple_fold_hex));
875 }
b08cd201 876
a452d459
KW
877This returns the (almost) locale-independent case folding of the
878character specified by the L</code point argument>.
b08cd201 879
a18e976f 880If there is no case folding for that code point, C<undef> is returned.
a452d459
KW
881
882If there is a case folding for that code point, a reference to a hash
b08cd201
JH
883with the following fields is returned:
884
a452d459
KW
885=over
886
887=item B<code>
888
889the input L</code point argument> expressed in hexadecimal, with leading zeros
890added if necessary to make it contain at least four hexdigits
891
892=item B<full>
893
a18e976f 894one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
895code points for the case folding for I<code>.
896Each has at least four hexdigits.
897
898=item B<simple>
899
900is empty, or is exactly one code with at least four hexdigits which can be used
901as an alternative case folding when the calling program cannot cope with the
902fold being a sequence of multiple code points. If I<full> is just one code
903point, then I<simple> equals I<full>. If there is no single code point folding
904defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
905inferior, but still better-than-nothing alternative folding to I<full>.
906
907=item B<mapping>
908
909is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
910otherwise. It can be considered to be the simplest possible folding for
911I<code>. It is defined primarily for backwards compatibility.
912
913=item B<status>
b08cd201 914
a452d459
KW
915is C<C> (for C<common>) if the best possible fold is a single code point
916(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
917folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
a18e976f
KW
918there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
919Note that this
a452d459
KW
920describes the contents of I<mapping>. It is defined primarily for backwards
921compatibility.
b08cd201 922
a452d459
KW
923On versions 3.1 and earlier of Unicode, I<status> can also be
924C<I> which is the same as C<C> but is a special case for dotted uppercase I and
925dotless lowercase i:
b08cd201 926
a452d459 927=over
b08cd201 928
a18e976f 929=item B<*> If you use this C<I> mapping
a452d459 930
a18e976f 931the result is case-insensitive,
a452d459
KW
932but dotless and dotted I's are not distinguished
933
a18e976f 934=item B<*> If you exclude this C<I> mapping
a452d459 935
a18e976f 936the result is not fully case-insensitive, but
a452d459
KW
937dotless and dotted I's are distinguished
938
939=back
940
941=item B<turkic>
942
943contains any special folding for Turkic languages. For versions of Unicode
944starting with 3.2, this field is empty unless I<code> has a different folding
945in Turkic languages, in which case it is one or more codes (separated by
a18e976f 946spaces) that, taken in order, give the code points for the case folding for
a452d459
KW
947I<code> in those languages.
948Each code has at least four hexdigits.
949Note that this folding does not maintain canonical equivalence without
950additional processing.
951
952For versions of Unicode 3.1 and earlier, this field is empty unless there is a
953special folding for Turkic languages, in which case I<status> is C<I>, and
954I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
955
956=back
957
958Programs that want complete generality and the best folding results should use
959the folding contained in the I<full> field. But note that the fold for some
960code points will be a sequence of multiple code points.
961
962Programs that can't cope with the fold mapping being multiple code points can
963use the folding contained in the I<simple> field, with the loss of some
964generality. In Unicode 5.1, about 7% of the defined foldings have no single
965code point folding.
966
967The I<mapping> and I<status> fields are provided for backwards compatibility for
968existing programs. They contain the same values as in previous versions of
969this function.
970
971Locale is not completely independent. The I<turkic> field contains results to
972use when the locale is a Turkic language.
b08cd201
JH
973
974For more information about case mappings see
a452d459 975L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
976
977=cut
978
979my %CASEFOLD;
980
981sub _casefold {
982 unless (%CASEFOLD) {
551b6b6f 983 if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
6c8d78fb 984 local $_;
b08cd201 985 while (<$CASEFOLDFH>) {
a452d459 986 if (/^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
b08cd201 987 my $code = hex($1);
a452d459
KW
988 $CASEFOLD{$code}{'code'} = $1;
989 $CASEFOLD{$code}{'turkic'} = "" unless
990 defined $CASEFOLD{$code}{'turkic'};
991 if ($2 eq 'C' || $2 eq 'I') { # 'I' is only on 3.1 and
992 # earlier Unicodes
993 # Both entries there (I
994 # only checked 3.1) are
995 # the same as C, and
996 # there are no other
997 # entries for those
998 # codepoints, so treat
999 # as if C, but override
1000 # the turkic one for
1001 # 'I'.
1002 $CASEFOLD{$code}{'status'} = $2;
1003 $CASEFOLD{$code}{'full'} = $CASEFOLD{$code}{'simple'} =
1004 $CASEFOLD{$code}{'mapping'} = $3;
1005 $CASEFOLD{$code}{'turkic'} = $3 if $2 eq 'I';
1006 } elsif ($2 eq 'F') {
1007 $CASEFOLD{$code}{'full'} = $3;
1008 unless (defined $CASEFOLD{$code}{'simple'}) {
1009 $CASEFOLD{$code}{'simple'} = "";
1010 $CASEFOLD{$code}{'mapping'} = $3;
1011 $CASEFOLD{$code}{'status'} = $2;
1012 }
1013 } elsif ($2 eq 'S') {
1014
1015
1016 # There can't be a simple without a full, and simple
1017 # overrides all but full
1018
1019 $CASEFOLD{$code}{'simple'} = $3;
1020 $CASEFOLD{$code}{'mapping'} = $3;
1021 $CASEFOLD{$code}{'status'} = $2;
1022 } elsif ($2 eq 'T') {
1023 $CASEFOLD{$code}{'turkic'} = $3;
1024 } # else can't happen because only [CIFST] are possible
b08cd201
JH
1025 }
1026 }
1027 close($CASEFOLDFH);
1028 }
1029 }
1030}
1031
1032sub casefold {
1033 my $arg = shift;
1034 my $code = _getcode($arg);
74f8133e
JH
1035 croak __PACKAGE__, "::casefold: unknown code '$arg'"
1036 unless defined $code;
b08cd201
JH
1037
1038 _casefold() unless %CASEFOLD;
1039
1040 return $CASEFOLD{$code};
1041}
1042
a452d459 1043=head2 B<casespec()>
b08cd201 1044
55d7b906 1045 use Unicode::UCD 'casespec';
b08cd201 1046
a452d459 1047 my $casespec = casespec(0xFB00);
b08cd201 1048
a452d459
KW
1049This returns the potentially locale-dependent case mappings of the L</code point
1050argument>. The mappings may be longer than a single code point (which the basic
1051Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 1052
a452d459
KW
1053If there are no case mappings for the L</code point argument>, or if all three
1054possible mappings (I<lower>, I<title> and I<upper>) result in single code
a18e976f 1055points and are locale independent and unconditional, C<undef> is returned
5d8e6e41
KW
1056(which means that the case mappings, if any, for the code point are those
1057returned by L</charinfo()>).
a452d459
KW
1058
1059Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41
KW
1060of such hashes, explained below) is returned with the following keys and their
1061meanings:
a452d459
KW
1062
1063The keys in the bottom layer hash with the meanings of their values are:
1064
1065=over
1066
1067=item B<code>
1068
1069the input L</code point argument> expressed in hexadecimal, with leading zeros
1070added if necessary to make it contain at least four hexdigits
1071
1072=item B<lower>
1073
a18e976f 1074one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1075code points for the lower case of I<code>.
1076Each has at least four hexdigits.
1077
1078=item B<title>
b08cd201 1079
a18e976f 1080one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1081code points for the title case of I<code>.
1082Each has at least four hexdigits.
b08cd201 1083
d2da20e3 1084=item B<upper>
b08cd201 1085
a18e976f 1086one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1087code points for the upper case of I<code>.
1088Each has at least four hexdigits.
1089
1090=item B<condition>
1091
1092the conditions for the mappings to be valid.
a18e976f 1093If C<undef>, the mappings are always valid.
a452d459
KW
1094When defined, this field is a list of conditions,
1095all of which must be true for the mappings to be valid.
1096The list consists of one or more
1097I<locales> (see below)
1098and/or I<contexts> (explained in the next paragraph),
1099separated by spaces.
1100(Other than as used to separate elements, spaces are to be ignored.)
1101Case distinctions in the condition list are not significant.
82c0b05b 1102Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 1103
a452d459
KW
1104A I<context> is one of those defined in the Unicode standard.
1105For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1106available at
5d8e6e41
KW
1107L<http://www.unicode.org/versions/Unicode5.1.0/>.
1108These are for context-sensitive casing.
f499c386 1109
a452d459
KW
1110=back
1111
5d8e6e41 1112The hash described above is returned for locale-independent casing, where
a18e976f 1113at least one of the mappings has length longer than one. If C<undef> is
5d8e6e41
KW
1114returned, the code point may have mappings, but if so, all are length one,
1115and are returned by L</charinfo()>.
1116Note that when this function does return a value, it will be for the complete
1117set of mappings for a code point, even those whose length is one.
1118
1119If there are additional casing rules that apply only in certain locales,
1120an additional key for each will be defined in the returned hash. Each such key
1121will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1122followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1123and a variant code). You can find the lists of all possible locales, see
1124L<Locale::Country> and L<Locale::Language>.
89e4a205 1125(In Unicode 6.0, the only locales returned by this function
a452d459 1126are C<lt>, C<tr>, and C<az>.)
b08cd201 1127
5d8e6e41
KW
1128Each locale key is a reference to a hash that has the form above, and gives
1129the casing rules for that particular locale, which take precedence over the
1130locale-independent ones when in that locale.
1131
1132If the only casing for a code point is locale-dependent, then the returned
1133hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1134will contain only locale keys.
1135
b08cd201 1136For more information about case mappings see
a452d459 1137L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1138
1139=cut
1140
1141my %CASESPEC;
1142
1143sub _casespec {
1144 unless (%CASESPEC) {
551b6b6f 1145 if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
6c8d78fb 1146 local $_;
b08cd201
JH
1147 while (<$CASESPECFH>) {
1148 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
f499c386
JH
1149 my ($hexcode, $lower, $title, $upper, $condition) =
1150 ($1, $2, $3, $4, $5);
1151 my $code = hex($hexcode);
1152 if (exists $CASESPEC{$code}) {
1153 if (exists $CASESPEC{$code}->{code}) {
1154 my ($oldlower,
1155 $oldtitle,
1156 $oldupper,
1157 $oldcondition) =
1158 @{$CASESPEC{$code}}{qw(lower
1159 title
1160 upper
1161 condition)};
822ebcc8
JH
1162 if (defined $oldcondition) {
1163 my ($oldlocale) =
f499c386 1164 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
f499c386
JH
1165 delete $CASESPEC{$code};
1166 $CASESPEC{$code}->{$oldlocale} =
1167 { code => $hexcode,
1168 lower => $oldlower,
1169 title => $oldtitle,
1170 upper => $oldupper,
1171 condition => $oldcondition };
f499c386
JH
1172 }
1173 }
1174 my ($locale) =
1175 ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
1176 $CASESPEC{$code}->{$locale} =
1177 { code => $hexcode,
1178 lower => $lower,
1179 title => $title,
1180 upper => $upper,
1181 condition => $condition };
1182 } else {
1183 $CASESPEC{$code} =
1184 { code => $hexcode,
1185 lower => $lower,
1186 title => $title,
1187 upper => $upper,
1188 condition => $condition };
1189 }
b08cd201
JH
1190 }
1191 }
1192 close($CASESPECFH);
1193 }
1194 }
1195}
1196
1197sub casespec {
1198 my $arg = shift;
1199 my $code = _getcode($arg);
74f8133e
JH
1200 croak __PACKAGE__, "::casespec: unknown code '$arg'"
1201 unless defined $code;
b08cd201
JH
1202
1203 _casespec() unless %CASESPEC;
1204
741297c1 1205 return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
b08cd201
JH
1206}
1207
a452d459 1208=head2 B<namedseq()>
a2bd7410
JH
1209
1210 use Unicode::UCD 'namedseq';
1211
1212 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1213 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1214 my %namedseq = namedseq();
1215
1216If used with a single argument in a scalar context, returns the string
a18e976f 1217consisting of the code points of the named sequence, or C<undef> if no
a2bd7410 1218named sequence by that name exists. If used with a single argument in
956cae9a
KW
1219a list context, it returns the list of the ordinals of the code points. If used
1220with no
a2bd7410
JH
1221arguments in a list context, returns a hash with the names of the
1222named sequences as the keys and the named sequences as strings as
a18e976f 1223the values. Otherwise, it returns C<undef> or an empty list depending
a2bd7410
JH
1224on the context.
1225
a452d459
KW
1226This function only operates on officially approved (not provisional) named
1227sequences.
a2bd7410 1228
27f853a0
KW
1229Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1230sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1231LETTER AINU P")> will return the same string this function does, but will also
1232operate on character names that aren't named sequences, without you having to
1233know which are which. See L<charnames>.
1234
a2bd7410
JH
1235=cut
1236
1237my %NAMEDSEQ;
1238
1239sub _namedseq {
1240 unless (%NAMEDSEQ) {
98ef7649 1241 if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
a2bd7410
JH
1242 local $_;
1243 while (<$NAMEDSEQFH>) {
98ef7649
KW
1244 if (/^ [0-9A-F]+ \ /x) {
1245 chomp;
1246 my ($sequence, $name) = split /\t/;
1247 my @s = map { chr(hex($_)) } split(' ', $sequence);
1248 $NAMEDSEQ{$name} = join("", @s);
a2bd7410
JH
1249 }
1250 }
1251 close($NAMEDSEQFH);
1252 }
1253 }
1254}
1255
1256sub namedseq {
98ef7649
KW
1257
1258 # Use charnames::string_vianame() which now returns this information,
1259 # unless the caller wants the hash returned, in which case we read it in,
1260 # and thereafter use it instead of calling charnames, as it is faster.
1261
a2bd7410
JH
1262 my $wantarray = wantarray();
1263 if (defined $wantarray) {
1264 if ($wantarray) {
1265 if (@_ == 0) {
98ef7649 1266 _namedseq() unless %NAMEDSEQ;
a2bd7410
JH
1267 return %NAMEDSEQ;
1268 } elsif (@_ == 1) {
98ef7649
KW
1269 my $s;
1270 if (%NAMEDSEQ) {
1271 $s = $NAMEDSEQ{ $_[0] };
1272 }
1273 else {
1274 $s = charnames::string_vianame($_[0]);
1275 }
a2bd7410
JH
1276 return defined $s ? map { ord($_) } split('', $s) : ();
1277 }
1278 } elsif (@_ == 1) {
98ef7649
KW
1279 return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
1280 return charnames::string_vianame($_[0]);
a2bd7410
JH
1281 }
1282 }
1283 return;
1284}
1285
7319f91d
KW
1286my %NUMERIC;
1287
1288sub _numeric {
1289
1290 # Unicode 6.0 instituted the rule that only digits in a consecutive
1291 # block of 10 would be considered decimal digits. Before that, the only
1292 # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
1293 # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
1294 # ONE. The code could be modified to handle that, but not bothering, as
1295 # in TUS 6.0, U+19DA was changed to Nt=Di.
1296 if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
1297 croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
1298 }
98025745
KW
1299 my @numbers = _read_table("unicore/To/Nv.pl");
1300 foreach my $entry (@numbers) {
1301 my ($start, $end, $value) = @$entry;
1302
05dbc6f8
KW
1303 # If value contains a slash, convert to decimal, add a reverse hash
1304 # used by charinfo.
98025745
KW
1305 if ((my @rational = split /\//, $value) == 2) {
1306 my $real = $rational[0] / $rational[1];
05dbc6f8 1307 $real_to_rational{$real} = $value;
98025745
KW
1308 $value = $real;
1309 }
1310
1311 for my $i ($start .. $end) {
1312 $NUMERIC{$i} = $value;
7319f91d 1313 }
7319f91d 1314 }
2dc5eb26
KW
1315
1316 # Decided unsafe to use these that aren't officially part of the Unicode
1317 # standard.
1318 #use Math::Trig;
1319 #my $pi = acos(-1.0);
98025745 1320 #$NUMERIC{0x03C0} = $pi;
7319f91d
KW
1321
1322 # Euler's constant, not to be confused with Euler's number
98025745 1323 #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
7319f91d
KW
1324
1325 # Euler's number
98025745 1326 #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
2dc5eb26 1327
7319f91d
KW
1328 return;
1329}
1330
1331=pod
1332
67592e11 1333=head2 B<num()>
7319f91d 1334
eefd7bc2
KW
1335 use Unicode::UCD 'num';
1336
1337 my $val = num("123");
1338 my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
1339
7319f91d
KW
1340C<num> returns the numeric value of the input Unicode string; or C<undef> if it
1341doesn't think the entire string has a completely valid, safe numeric value.
1342
1343If the string is just one character in length, the Unicode numeric value
1344is returned if it has one, or C<undef> otherwise. Note that this need
1345not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
2dc5eb26
KW
1346example returns -0.5.
1347
1348=cut
7319f91d 1349
2dc5eb26
KW
1350#A few characters to which Unicode doesn't officially
1351#assign a numeric value are considered numeric by C<num>.
1352#These are:
1353
1354# EULER CONSTANT 0.5772... (this is NOT Euler's number)
1355# SCRIPT SMALL E 2.71828... (this IS Euler's number)
1356# GREEK SMALL LETTER PI 3.14159...
1357
1358=pod
7319f91d
KW
1359
1360If the string is more than one character, C<undef> is returned unless
8bb4c8e2 1361all its characters are decimal digits (that is, they would match C<\d+>),
7319f91d
KW
1362from the same script. For example if you have an ASCII '0' and a Bengali
1363'3', mixed together, they aren't considered a valid number, and C<undef>
1364is returned. A further restriction is that the digits all have to be of
1365the same form. A half-width digit mixed with a full-width one will
1366return C<undef>. The Arabic script has two sets of digits; C<num> will
1367return C<undef> unless all the digits in the string come from the same
1368set.
1369
1370C<num> errs on the side of safety, and there may be valid strings of
1371decimal digits that it doesn't recognize. Note that Unicode defines
1372a number of "digit" characters that aren't "decimal digit" characters.
a278d14b 1373"Decimal digits" have the property that they have a positional value, i.e.,
7319f91d
KW
1374there is a units position, a 10's position, a 100's, etc, AND they are
1375arranged in Unicode in blocks of 10 contiguous code points. The Chinese
1376digits, for example, are not in such a contiguous block, and so Unicode
1377doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1378match them. A single-character string containing one of these digits will
1379have its decimal value returned by C<num>, but any longer string containing
1380only these digits will return C<undef>.
1381
a278d14b
KW
1382Strings of multiple sub- and superscripts are not recognized as numbers. You
1383can use either of the compatibility decompositions in Unicode::Normalize to
7319f91d
KW
1384change these into digits, and then call C<num> on the result.
1385
1386=cut
1387
1388# To handle sub, superscripts, this could if called in list context,
1389# consider those, and return the <decomposition> type in the second
1390# array element.
1391
1392sub num {
1393 my $string = $_[0];
1394
1395 _numeric unless %NUMERIC;
1396
1397 my $length = length($string);
98025745 1398 return $NUMERIC{ord($string)} if $length == 1;
7319f91d
KW
1399 return if $string =~ /\D/;
1400 my $first_ord = ord(substr($string, 0, 1));
98025745 1401 my $value = $NUMERIC{$first_ord};
7319f91d
KW
1402 my $zero_ord = $first_ord - $value;
1403
1404 for my $i (1 .. $length -1) {
1405 my $ord = ord(substr($string, $i, 1));
1406 my $digit = $ord - $zero_ord;
1407 return unless $digit >= 0 && $digit <= 9;
1408 $value = $value * 10 + $digit;
1409 }
1410 return $value;
1411}
1412
7ef25837
KW
1413=pod
1414
1415=head2 B<prop_aliases()>
1416
1417 use Unicode::UCD 'prop_aliases';
1418
1419 my ($short_name, $full_name, @other_names) = prop_aliases("space");
1420 my $same_full_name = prop_aliases("Space"); # Scalar context
1421 my ($same_short_name) = prop_aliases("Space"); # gets 0th element
1422 print "The full name is $full_name\n";
1423 print "The short name is $short_name\n";
1424 print "The other aliases are: ", join(", ", @other_names), "\n";
1425
1426 prints:
1427 The full name is White_Space
1428 The short name is WSpace
1429 The other aliases are: Space
1430
1431Most Unicode properties have several synonymous names. Typically, there is at
1432least a short name, convenient to type, and a long name that more fully
1433describes the property, and hence is more easily understood.
1434
1435If you know one name for a Unicode property, you can use C<prop_aliases> to find
1436either the long name (when called in scalar context), or a list of all of the
1437names, somewhat ordered so that the short name is in the 0th element, the long
1438name in the next element, and any other synonyms are in the remaining
1439elements, in no particular order.
1440
1441The long name is returned in a form nicely capitalized, suitable for printing.
1442
1443The input parameter name is loosely matched, which means that white space,
1444hyphens, and underscores are ignored (except for the trailing underscore in
1445the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and
1446both of which mean C<General_Category=Cased Letter>).
1447
1448If the name is unknown, C<undef> is returned (or an empty list in list
1449context). Note that Perl typically recognizes property names in regular
1450expressions with an optional C<"Is_>" (with or without the underscore)
1451prefixed to them, such as C<\p{isgc=punct}>. This function does not recognize
1452those in the input, returning C<undef>. Nor are they included in the output
1453as possible synonyms.
1454
1455C<prop_aliases> does know about the Perl extensions to Unicode properties,
1456such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
1457properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>. The
1458final example demonstrates that the C<"Is_"> prefix is recognized for these
1459extensions; it is needed to resolve ambiguities. For example,
1460C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
1461C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>. This is
1462because C<islc> is a Perl extension which is short for
1463C<General_Category=Cased Letter>. The lists returned for the Perl extensions
1464will not include the C<"Is_"> prefix (whether or not the input had it) unless
1465needed to resolve ambiguities, as shown in the C<"islc"> example, where the
1466returned list had one element containing C<"Is_">, and the other without.
1467
1468It is also possible for the reverse to happen: C<prop_aliases('isc')> returns
1469the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
1470C<(C, Other)> (the latter being a Perl extension meaning
1471C<General_Category=Other>. L<perluniprops> lists the available forms,
1472including which ones are discouraged from use.
1473
1474Those discouraged forms are accepted as input to C<prop_aliases>, but are not
1475returned in the lists. C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
1476which are old synonyms for C<"Is_LC"> and should not be used in new code, are
1477examples of this. These both return C<(Is_LC, Cased_Letter)>. Thus this
1478function allows you to take a discourarged form, and find its acceptable
1479alternatives. The same goes with single-form Block property equivalences.
1480Only the forms that begin with C<"In_"> are not discouraged; if you pass
1481C<prop_aliases> a discouraged form, you will get back the equivalent ones that
1482begin with C<"In_">. It will otherwise look like a new-style block name (see.
1483L</Old-style versus new-style block names>).
1484
1485C<prop_aliases> does not know about any user-defined properties, and will
1486return C<undef> if called with one of those. Likewise for Perl internal
1487properties, with the exception of "Perl_Decimal_Digit" which it does know
1488about (and which is documented below in L</prop_invmap()>).
1489
1490=cut
1491
1492# It may be that there are use cases where the discouraged forms should be
1493# returned. If that comes up, an optional boolean second parameter to the
1494# function could be created, for example.
1495
1496# These are created by mktables for this routine and stored in unicore/UCD.pl
1497# where their structures are described.
1498our %string_property_loose_to_name;
1499our %ambiguous_names;
1500our %loose_perlprop_to_name;
1501our %prop_aliases;
1502
1503sub prop_aliases ($) {
1504 my $prop = $_[0];
1505 return unless defined $prop;
1506
1507 require "unicore/UCD.pl";
1508 require "unicore/Heavy.pl";
1509 require "utf8_heavy.pl";
1510
1511 # The property name may be loosely or strictly matched; we don't know yet.
1512 # But both types use lower-case.
1513 $prop = lc $prop;
1514
1515 # It is loosely matched if its lower case isn't known to be strict.
1516 my $list_ref;
1517 if (! exists $utf8::stricter_to_file_of{$prop}) {
1518 my $loose = utf8::_loose_name($prop);
1519
1520 # There is a hash that converts from any loose name to its standard
1521 # form, mapping all synonyms for a name to one name that can be used
1522 # as a key into another hash. The whole concept is for memory
1523 # savings, as the second hash doesn't have to have all the
1524 # combinations. Actually, there are two hashes that do the
1525 # converstion. One is used in utf8_heavy.pl (stored in Heavy.pl) for
1526 # looking up properties matchable in regexes. This function needs to
1527 # access string properties, which aren't available in regexes, so a
1528 # second conversion hash is made for them (stored in UCD.pl). Look in
1529 # the string one now, as the rest can have an optional 'is' prefix,
1530 # which these don't.
1531 if (exists $string_property_loose_to_name{$loose}) {
1532
1533 # Convert to its standard loose name.
1534 $prop = $string_property_loose_to_name{$loose};
1535 }
1536 else {
1537 my $retrying = 0; # bool. ? Has an initial 'is' been stripped
1538 RETRY:
1539 if (exists $utf8::loose_property_name_of{$loose}
1540 && (! $retrying
1541 || ! exists $ambiguous_names{$loose}))
1542 {
1543 # Found an entry giving the standard form. We don't get here
1544 # (in the test above) when we've stripped off an
1545 # 'is' and the result is an ambiguous name. That is because
1546 # these are official Unicode properties (though Perl can have
1547 # an optional 'is' prefix meaning the official property), and
1548 # all ambiguous cases involve a Perl single-form extension
1549 # for the gc, script, or block properties, and the stripped
1550 # 'is' means that they mean one of those, and not one of
1551 # these
1552 $prop = $utf8::loose_property_name_of{$loose};
1553 }
1554 elsif (exists $loose_perlprop_to_name{$loose}) {
1555
1556 # This hash is specifically for this function to list Perl
1557 # extensions that aren't in the earlier hashes. If there is
1558 # only one element, the short and long names are identical.
1559 # Otherwise the form is already in the same form as
1560 # %prop_aliases, which is handled at the end of the function.
1561 $list_ref = $loose_perlprop_to_name{$loose};
1562 if (@$list_ref == 1) {
1563 my @list = ($list_ref->[0], $list_ref->[0]);
1564 $list_ref = \@list;
1565 }
1566 }
1567 elsif (! exists $utf8::loose_to_file_of{$loose}) {
1568
1569 # loose_to_file_of is a complete list of loose names. If not
1570 # there, the input is unknown.
1571 return;
1572 }
1573 else {
1574
1575 # Here we found the name but not its aliases, so it has to
1576 # exist. This means it must be one of the Perl single-form
1577 # extensions. First see if it is for a property-value
1578 # combination in one of the following properties.
1579 my @list;
1580 foreach my $property ("gc", "script") {
1581 @list = prop_value_aliases($property, $loose);
1582 last if @list;
1583 }
1584 if (@list) {
1585
1586 # Here, it is one of those property-value combination
1587 # single-form synonyms. There are ambiguities with some
1588 # of these. Check against the list for these, and adjust
1589 # if necessary.
1590 for my $i (0 .. @list -1) {
1591 if (exists $ambiguous_names
1592 {utf8::_loose_name(lc $list[$i])})
1593 {
1594 # The ambiguity is resolved by toggling whether or
1595 # not it has an 'is' prefix
1596 $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
1597 }
1598 }
1599 return @list;
1600 }
1601
1602 # Here, it wasn't one of the gc or script single-form
1603 # extensions. It could be a block property single-form
1604 # extension. An 'in' prefix definitely means that, and should
1605 # be looked up without the prefix.
1606 my $began_with_in = $loose =~ s/^in//;
1607 @list = prop_value_aliases("block", $loose);
1608 if (@list) {
1609 map { $_ =~ s/^/In_/ } @list;
1610 return @list;
1611 }
1612
1613 # Here still haven't found it. The last opportunity for it
1614 # being valid is only if it began with 'is'. We retry without
1615 # the 'is', setting a flag to that effect so that we don't
1616 # accept things that begin with 'isis...'
1617 if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
1618 $retrying = 1;
1619 goto RETRY;
1620 }
1621
1622 # Here, didn't find it. Since it was in %loose_to_file_of, we
1623 # should have been able to find it.
1624 carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'. Send bug report to perlbug\@perl.org";
1625 return;
1626 }
1627 }
1628 }
1629
1630 if (! $list_ref) {
1631 # Here, we have set $prop to a standard form name of the input. Look
1632 # it up in the structure created by mktables for this purpose, which
1633 # contains both strict and loosely matched properties. Avoid
1634 # autovivifying.
1635 $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
1636 return unless $list_ref;
1637 }
1638
1639 # The full name is in element 1.
1640 return $list_ref->[1] unless wantarray;
1641
1642 return @{dclone $list_ref};
1643}
1644
1645=pod
1646
1647=head2 B<prop_value_aliases()>
1648
1649 use Unicode::UCD 'prop_value_aliases';
1650
1651 my ($short_name, $full_name, @other_names)
1652 = prop_value_aliases("Gc", "Punct");
1653 my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt
1654 my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
1655 # element
1656 print "The full name is $full_name\n";
1657 print "The short name is $short_name\n";
1658 print "The other aliases are: ", join(", ", @other_names), "\n";
1659
1660 prints:
1661 The full name is Punctuation
1662 The short name is P
1663 The other aliases are: Punct
1664
1665Some Unicode properties have a restricted set of legal values. For example,
1666all binary properties are restricted to just C<true> or C<false>; and there
1667are only a few dozen possible General Categories.
1668
1669For such properties, there are usually several synonyms for each possible
1670value. For example, in binary properties, I<truth> can be represented by any of
1671the strings "Y", "Yes", "T", or "True"; and the General Category
1672"Punctuation" by that string, or "Punct", or simply "P".
1673
1674Like property names, there is typically at least a short name for each such
1675property-value, and a long name. If you know any name of the property-value,
1676you can use C<prop_value_aliases>() to get the long name (when called in
1677scalar context), or a list of all the names, with the short name in the 0th
1678element, the long name in the next element, and any other synonyms in the
1679remaining elements, in no particular order, except that any all-numeric
1680synonyms will be last.
1681
1682The long name is returned in a form nicely capitalized, suitable for printing.
1683
1684Case, white space, hyphens, and underscores are ignored in the input parameters
1685(except for the trailing underscore in the old-form grandfathered-in general
1686category property value C<"L_">, which is better written as C<"LC">).
1687
1688If either name is unknown, C<undef> is returned. Note that Perl typically
1689recognizes property names in regular expressions with an optional C<"Is_>"
1690(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
1691This function does not recognize those in the property parameter, returning
1692C<undef>.
1693
1694If called with a property that doesn't have synonyms for its values, it
1695returns the input value, possibly normalized with capitalization and
1696underscores.
1697
1698For the block property, new-style block names are returned (see
1699L</Old-style versus new-style block names>).
1700
1701To find the synonyms for single-forms, such as C<\p{Any}>, use
1702L</prop_aliases()> instead.
1703
1704C<prop_value_aliases> does not know about any user-defined properties, and
1705will return C<undef> if called with one of those.
1706
1707=cut
1708
1709# These are created by mktables for this routine and stored in unicore/UCD.pl
1710# where their structures are described.
1711our %loose_to_standard_value;
1712our %prop_value_aliases;
1713
1714sub prop_value_aliases ($$) {
1715 my ($prop, $value) = @_;
1716 return unless defined $prop && defined $value;
1717
1718 require "unicore/UCD.pl";
1719 require "utf8_heavy.pl";
1720
1721 # Find the property name synonym that's used as the key in other hashes,
1722 # which is element 0 in the returned list.
1723 ($prop) = prop_aliases($prop);
1724 return if ! $prop;
1725 $prop = utf8::_loose_name(lc $prop);
1726
1727 # Here is a legal property, but the hash below (created by mktables for
1728 # this purpose) only knows about the properties that have a very finite
1729 # number of potential values, that is not ones whose value could be
1730 # anything, like most (if not all) string properties. These don't have
1731 # synonyms anyway. Simply return the input. For example, there is no
1732 # synonym for ('Uppercase_Mapping', A').
1733 return $value if ! exists $prop_value_aliases{$prop};
1734
1735 # The value name may be loosely or strictly matched; we don't know yet.
1736 # But both types use lower-case.
1737 $value = lc $value;
1738
1739 # If the name isn't found under loose matching, it certainly won't be
1740 # found under strict
1741 my $loose_value = utf8::_loose_name($value);
1742 return unless exists $loose_to_standard_value{"$prop=$loose_value"};
1743
1744 # Similarly if the combination under loose matching doesn't exist, it
1745 # won't exist under strict.
1746 my $standard_value = $loose_to_standard_value{"$prop=$loose_value"};
1747 return unless exists $prop_value_aliases{$prop}{$standard_value};
1748
1749 # Here we did find a combination under loose matching rules. But it could
1750 # be that is a strict property match that shouldn't have matched.
1751 # %prop_value_aliases is set up so that the strict matches will appear as
1752 # if they were in loose form. Thus, if the non-loose version is legal,
1753 # we're ok, can skip the further check.
1754 if (! exists $utf8::stricter_to_file_of{"$prop=$value"}
1755
1756 # We're also ok and skip the further check if value loosely matches.
1757 # mktables has verified that no strict name under loose rules maps to
1758 # an existing loose name. This code relies on the very limited
1759 # circumstances that strict names can be here. Strict name matching
1760 # happens under two conditions:
1761 # 1) when the name begins with an underscore. But this function
1762 # doesn't accept those, and %prop_value_aliases doesn't have
1763 # them.
1764 # 2) When the values are numeric, in which case we need to look
1765 # further, but their squeezed-out loose values will be in
1766 # %stricter_to_file_of
1767 && exists $utf8::stricter_to_file_of{"$prop=$loose_value"})
1768 {
1769 # The only thing that's legal loosely under strict is that can have an
1770 # underscore between digit pairs XXX
1771 while ($value =~ s/(\d)_(\d)/$1$2/g) {}
1772 return unless exists $utf8::stricter_to_file_of{"$prop=$value"};
1773 }
1774
1775 # Here, we know that the combination exists. Return it.
1776 my $list_ref = $prop_value_aliases{$prop}{$standard_value};
1777 if (@$list_ref > 1) {
1778 # The full name is in element 1.
1779 return $list_ref->[1] unless wantarray;
1780
1781 return @{dclone $list_ref};
1782 }
1783
1784 return $list_ref->[0] unless wantarray;
1785
1786 # Only 1 element means that it repeats
1787 return ( $list_ref->[0], $list_ref->[0] );
1788}
7319f91d 1789
681d705c
KW
1790# All 1 bits is the largest possible UV.
1791$Unicode::UCD::MAX_CP = ~0;
1792
1793=pod
1794
1795=head2 B<prop_invlist()>
1796
1797C<prop_invlist> returns an inversion list (described below) that defines all the
1798code points for the binary Unicode property (or "property=value" pair) given
1799by the input parameter string:
1800
1801 use feature 'say';
1802 use Unicode::UCD 'prop_invlist';
1803 say join ", ", prop_invlist("Any");
1804
1805 prints:
1806 0, 1114112
1807
1808An empty list is returned if the input is unknown; the number of elements in
1809the list is returned if called in scalar context.
1810
1811L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
1812the list of properties that this function accepts, as well as all the possible
1813forms for them (including with the optional "Is_" prefixes). (Except this
1814function doesn't accept any Perl-internal properties, some of which are listed
1815there.) This function uses the same loose or tighter matching rules for
1816resolving the input property's name as is done for regular expressions. These
1817are also specified in L<perluniprops|perluniprops/Properties accessible
1818through \p{} and \P{}>. Examples of using the "property=value" form are:
1819
1820 say join ", ", prop_invlist("Script=Shavian");
1821
1822 prints:
1823 66640, 66688
1824
1825 say join ", ", prop_invlist("ASCII_Hex_Digit=No");
1826
1827 prints:
1828 0, 48, 58, 65, 71, 97, 103
1829
1830 say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
1831
1832 prints:
1833 48, 58, 65, 71, 97, 103
1834
1835Inversion lists are a compact way of specifying Unicode property-value
1836definitions. The 0th item in the list is the lowest code point that has the
1837property-value. The next item (item [1]) is the lowest code point beyond that
1838one that does NOT have the property-value. And the next item beyond that
1839([2]) is the lowest code point beyond that one that does have the
1840property-value, and so on. Put another way, each element in the list gives
1841the beginning of a range that has the property-value (for even numbered
1842elements), or doesn't have the property-value (for odd numbered elements).
1843The name for this data structure stems from the fact that each element in the
1844list toggles (or inverts) whether the corresponding range is or isn't on the
1845list.
1846
1847In the final example above, the first ASCII Hex digit is code point 48, the
1848character "0", and all code points from it through 57 (a "9") are ASCII hex
1849digits. Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
1850are, as are 97 ("a") through 102 ("f"). 103 starts a range of code points
1851that aren't ASCII hex digits. That range extends to infinity, which on your
1852computer can be found in the variable C<$Unicode::UCD::MAX_CP>. (This
1853variable is as close to infinity as Perl can get on your platform, and may be
1854too high for some operations to work; you may wish to use a smaller number for
1855your purposes.)
1856
1857Note that the inversion lists returned by this function can possibly include
1858non-Unicode code points, that is anything above 0x10FFFF. This is in
1859contrast to Perl regular expression matches on those code points, in which a
1860non-Unicode code point always fails to match. For example, both of these have
1861the same result:
1862
1863 chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails.
1864 chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Fails!
1865
1866And both raise a warning that a Unicode property is being used on a
1867non-Unicode code point. It is arguable as to which is the correct thing to do
1868here. This function has chosen the way opposite to the Perl regular
1869expression behavior. This allows you to easily flip to to the Perl regular
1870expression way (for you to go in the other direction would be far harder).
1871Simply add 0x110000 at the end of the non-empty returned list if it isn't
1872already that value; and pop that value if it is; like:
1873
1874 my @list = prop_invlist("foo");
1875 if (@list) {
1876 if ($list[-1] == 0x110000) {
1877 pop @list; # Defeat the turning on for above Unicode
1878 }
1879 else {
1880 push @list, 0x110000; # Turn off for above Unicode
1881 }
1882 }
1883
1884It is a simple matter to expand out an inversion list to a full list of all
1885code points that have the property-value:
1886
1887 my @invlist = prop_invlist($property_name);
1888 die "empty" unless @invlist;
1889 my @full_list;
1890 for (my $i = 0; $i < @invlist; $i += 2) {
1891 my $upper = ($i + 1) < @invlist
1892 ? $invlist[$i+1] - 1 # In range
1893 : $Unicode::UCD::MAX_CP; # To infinity. You may want
1894 # to stop much much earlier;
1895 # going this high may expose
1896 # perl deficiencies with very
1897 # large numbers.
1898 for my $j ($invlist[$i] .. $upper) {
1899 push @full_list, $j;
1900 }
1901 }
1902
1903C<prop_invlist> does not know about any user-defined nor Perl internal-only
1904properties, and will return C<undef> if called with one of those.
1905
1906=cut
1907
1908# User-defined properties could be handled with some changes to utf8_heavy.pl;
1909# and implementing here of dealing with EXTRAS. If done, consideration should
1910# be given to the fact that the user subroutine could return different results
1911# with each call; security issues need to be thought about.
1912
1913# These are created by mktables for this routine and stored in unicore/UCD.pl
1914# where their structures are described.
1915our %loose_defaults;
1916our $MAX_UNICODE_CODEPOINT;
1917
1918sub prop_invlist ($) {
1919 my $prop = $_[0];
1920 return if ! defined $prop;
1921
1922 require "utf8_heavy.pl";
1923
1924 # Warnings for these are only for regexes, so not applicable to us
1925 no warnings 'deprecated';
1926
1927 # Get the swash definition of the property-value.
1928 my $swash = utf8::SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);
1929
1930 # Fail if not found, or isn't a boolean property-value, or is a
1931 # user-defined property, or is internal-only.
1932 return if ! $swash
1933 || ref $swash eq ""
1934 || $swash->{'BITS'} != 1
1935 || $swash->{'USER_DEFINED'}
1936 || $prop =~ /^\s*_/;
1937
1938 if ($swash->{'EXTRAS'}) {
1939 carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
1940 return;
1941 }
1942 if ($swash->{'SPECIALS'}) {
1943 carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
1944 return;
1945 }
1946
1947 my @invlist;
1948
1949 # The input lines look like:
1950 # 0041\t005A # [26]
1951 # 005F
1952
1953 # Split into lines, stripped of trailing comments
1954 foreach my $range (split "\n",
1955 $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
1956 {
1957 # And find the beginning and end of the range on the line
1958 my ($hex_begin, $hex_end) = split "\t", $range;
1959 my $begin = hex $hex_begin;
1960
1961 # Add the beginning of the range
1962 push @invlist, $begin;
1963
1964 if (defined $hex_end) { # The next item starts with the code point 1
1965 # beyond the end of the range.
1966 push @invlist, hex($hex_end) + 1;
1967 }
1968 else { # No end of range, is a single code point.
1969 push @invlist, $begin + 1;
1970 }
1971 }
1972
1973 require "unicore/UCD.pl";
1974 my $FIRST_NON_UNICODE = $MAX_UNICODE_CODEPOINT + 1;
1975
1976 # Could need to be inverted: add or subtract a 0 at the beginning of the
1977 # list. And to keep it from matching non-Unicode, add or subtract the
1978 # first non-unicode code point.
1979 if ($swash->{'INVERT_IT'}) {
1980 if (@invlist && $invlist[0] == 0) {
1981 shift @invlist;
1982 }
1983 else {
1984 unshift @invlist, 0;
1985 }
1986 if (@invlist && $invlist[-1] == $FIRST_NON_UNICODE) {
1987 pop @invlist;
1988 }
1989 else {
1990 push @invlist, $FIRST_NON_UNICODE;
1991 }
1992 }
1993
1994 # Here, the list is set up to include only Unicode code points. But, if
1995 # the table is the default one for the property, it should contain all
1996 # non-Unicode code points. First calculate the loose name for the
1997 # property. This is done even for strict-name properties, as the data
1998 # structure that mktables generates for us is set up so that we don't have
1999 # to worry about that. The property-value needs to be split if compound,
2000 # as the loose rules need to be independently calculated on each part. We
2001 # know that it is syntactically valid, or SWASHNEW would have failed.
2002
2003 $prop = lc $prop;
2004 my ($prop_only, $table) = split /\s*[:=]\s*/, $prop;
2005 if ($table) {
2006
2007 # May have optional prefixed 'is'
2008 $prop = utf8::_loose_name($prop_only) =~ s/^is//r;
2009 $prop = $utf8::loose_property_name_of{$prop};
2010 $prop .= "=" . utf8::_loose_name($table);
2011 }
2012 else {
2013 $prop = utf8::_loose_name($prop);
2014 }
2015 if (exists $loose_defaults{$prop}) {
2016
2017 # Here, is the default table. If a range ended with 10ffff, instead
2018 # continue that range to infinity, by popping the 110000; otherwise,
2019 # add the range from 11000 to infinity
2020 if (! @invlist || $invlist[-1] != $FIRST_NON_UNICODE) {
2021 push @invlist, $FIRST_NON_UNICODE;
2022 }
2023 else {
2024 pop @invlist;
2025 }
2026 }
2027
2028 return @invlist;
2029}
7319f91d 2030
55d7b906 2031=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 2032
a452d459
KW
2033This returns the version of the Unicode Character Database, in other words, the
2034version of the Unicode standard the database implements. The version is a
2035string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
2036
2037=cut
2038
2039my $UNICODEVERSION;
2040
2041sub UnicodeVersion {
2042 unless (defined $UNICODEVERSION) {
2043 openunicode(\$VERSIONFH, "version");
2044 chomp($UNICODEVERSION = <$VERSIONFH>);
2045 close($VERSIONFH);
2046 croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
2047 unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
2048 }
2049 return $UNICODEVERSION;
2050}
3aa957f9 2051
a452d459
KW
2052=head2 B<Blocks versus Scripts>
2053
2054The difference between a block and a script is that scripts are closer
2055to the linguistic notion of a set of code points required to present
2056languages, while block is more of an artifact of the Unicode code point
2057numbering and separation into blocks of (mostly) 256 code points.
2058
2059For example the Latin B<script> is spread over several B<blocks>, such
2060as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
2061C<Latin Extended-B>. On the other hand, the Latin script does not
2062contain all the characters of the C<Basic Latin> block (also known as
2063ASCII): it includes only the letters, and not, for example, the digits
2064or the punctuation.
2065
2066For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
2067
2068For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
2069
2070=head2 B<Matching Scripts and Blocks>
2071
2072Scripts are matched with the regular-expression construct
2073C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
f200dd12 2074while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
a452d459
KW
2075any of the 256 code points in the Tibetan block).
2076
8b731da2
JH
2077=head1 BUGS
2078
2079Does not yet support EBCDIC platforms.
2080
561c79ed
JH
2081=head1 AUTHOR
2082
a18e976f 2083Jarkko Hietaniemi. Now maintained by perl5 porters.
561c79ed
JH
2084
2085=cut
2086
20871;