This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Unicode::UCD: move common directory to subroutine
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
36c2430c 5no warnings 'surrogate'; # surrogates can be inputs to this
98ef7649 6use charnames ();
94c91ffc 7use Unicode::Normalize qw(getCombinClass NFD);
561c79ed 8
47f2dcf2 9our $VERSION = '0.41';
561c79ed 10
741297c1
JH
11use Storable qw(dclone);
12
561c79ed
JH
13require Exporter;
14
15our @ISA = qw(Exporter);
74f8133e 16
10a6ecd2
JH
17our @EXPORT_OK = qw(charinfo
18 charblock charscript
19 charblocks charscripts
b08cd201 20 charinrange
ea508aee 21 general_categories bidi_types
b08cd201 22 compexcl
a2bd7410 23 casefold casespec
7319f91d
KW
24 namedseq
25 num
7ef25837
KW
26 prop_aliases
27 prop_value_aliases
681d705c 28 prop_invlist
62b3b855 29 prop_invmap
681d705c 30 MAX_CP
7319f91d 31 );
561c79ed
JH
32
33use Carp;
34
35=head1 NAME
36
55d7b906 37Unicode::UCD - Unicode character database
561c79ed
JH
38
39=head1 SYNOPSIS
40
55d7b906 41 use Unicode::UCD 'charinfo';
b08cd201 42 my $charinfo = charinfo($codepoint);
561c79ed 43
956cae9a
KW
44 use Unicode::UCD 'casefold';
45 my $casefold = casefold(0xFB00);
46
5d8e6e41
KW
47 use Unicode::UCD 'casespec';
48 my $casespec = casespec(0xFB00);
49
55d7b906 50 use Unicode::UCD 'charblock';
e882dd67
JH
51 my $charblock = charblock($codepoint);
52
55d7b906 53 use Unicode::UCD 'charscript';
65044554 54 my $charscript = charscript($codepoint);
561c79ed 55
55d7b906 56 use Unicode::UCD 'charblocks';
e145285f
JH
57 my $charblocks = charblocks();
58
55d7b906 59 use Unicode::UCD 'charscripts';
ea508aee 60 my $charscripts = charscripts();
e145285f 61
55d7b906 62 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
63 my $range = charscript($script);
64 print "looks like $script\n" if charinrange($range, $codepoint);
65
ea508aee
JH
66 use Unicode::UCD qw(general_categories bidi_types);
67 my $categories = general_categories();
68 my $types = bidi_types();
69
7ef25837
KW
70 use Unicode::UCD 'prop_aliases';
71 my @space_names = prop_aliases("space");
72
73 use Unicode::UCD 'prop_value_aliases';
74 my @gc_punct_names = prop_value_aliases("Gc", "Punct");
75
681d705c
KW
76 use Unicode::UCD 'prop_invlist';
77 my @puncts = prop_invlist("gc=punctuation");
78
62b3b855
KW
79 use Unicode::UCD 'prop_invmap';
80 my ($list_ref, $map_ref, $format, $missing)
81 = prop_invmap("General Category");
82
55d7b906 83 use Unicode::UCD 'compexcl';
e145285f
JH
84 my $compexcl = compexcl($codepoint);
85
a2bd7410
JH
86 use Unicode::UCD 'namedseq';
87 my $namedseq = namedseq($named_sequence_name);
88
55d7b906 89 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 90
7319f91d 91 my $convert_to_numeric =
62a8c8c2 92 Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
7319f91d 93
561c79ed
JH
94=head1 DESCRIPTION
95
a452d459
KW
96The Unicode::UCD module offers a series of functions that
97provide a simple interface to the Unicode
8b731da2 98Character Database.
561c79ed 99
a452d459
KW
100=head2 code point argument
101
102Some of the functions are called with a I<code point argument>, which is either
103a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
104followed by hexadecimals designating a Unicode code point. In other words, if
105you want a code point to be interpreted as a hexadecimal number, you must
106prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
f200dd12
KW
107interpreted as a decimal code point. Note that the largest code point in
108Unicode is U+10FFFF.
c3e5bc54 109
561c79ed
JH
110=cut
111
10a6ecd2 112my $BLOCKSFH;
10a6ecd2 113my $VERSIONFH;
b08cd201
JH
114my $CASEFOLDFH;
115my $CASESPECFH;
a2bd7410 116my $NAMEDSEQFH;
561c79ed
JH
117
# Locate and open a file that lives in a "unicore" directory under one of
# the @INC entries.
#
# Arguments:
#   $rfh  - reference to a scalar that receives the opened (read-only)
#           filehandle
#   @path - path components of the wanted file, relative to "unicore"
#
# Returns the full path name of the file that was opened.  If $$rfh is
# already defined no search is made and undef is returned.  Croaks when
# the file cannot be opened under any @INC directory.
sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            use File::Spec;
            $f = File::Spec->catfile($d, "unicore", @path);

            # Three-argument open: the path is taken literally, so mode
            # characters or surrounding whitespace in an @INC entry or in
            # @path can never be interpreted as an open mode or a pipe.
            last if open($$rfh, '<', $f);
            undef $f;
        }
        croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
            unless defined $f;
    }
    return $f;
}
134
a452d459 135=head2 B<charinfo()>
561c79ed 136
55d7b906 137 use Unicode::UCD 'charinfo';
561c79ed 138
b08cd201 139 my $charinfo = charinfo(0x41);
561c79ed 140
a452d459
KW
141This returns information about the input L</code point argument>
142as a reference to a hash of fields as defined by the Unicode
143standard. If the L</code point argument> is not assigned in the standard
144(i.e., has the general category C<Cn> meaning C<Unassigned>)
145or is a non-character (meaning it is guaranteed to never be assigned in
146the standard),
a18e976f 147C<undef> is returned.
a452d459
KW
148
149Fields that aren't applicable to the particular code point argument exist in the
150returned hash, and are empty.
151
152The keys in the hash with the meanings of their values are:
153
154=over
155
156=item B<code>
157
158the input L</code point argument> expressed in hexadecimal, with leading zeros
159added if necessary to make it contain at least four hexdigits
160
161=item B<name>
162
163name of I<code>, all IN UPPER CASE.
164Some control-type code points do not have names.
165This field will be empty for C<Surrogate> and C<Private Use> code points,
166and for the others without a name,
167it will contain a description enclosed in angle brackets, like
168C<E<lt>controlE<gt>>.
169
170
171=item B<category>
172
173The short name of the general category of I<code>.
174This will match one of the keys in the hash returned by L</general_categories()>.
175
7ef25837
KW
176The L</prop_value_aliases()> function can be used to get all the synonyms
177of the category name.
178
a452d459
KW
179=item B<combining>
180
181the combining class number for I<code> used in the Canonical Ordering Algorithm.
182For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
183available at
184L<http://www.unicode.org/versions/Unicode5.1.0/>
185
7ef25837
KW
186The L</prop_value_aliases()> function can be used to get all the synonyms
187of the combining class number.
188
a452d459
KW
189=item B<bidi>
190
191bidirectional type of I<code>.
192This will match one of the keys in the hash returned by L</bidi_types()>.
193
7ef25837
KW
194The L</prop_value_aliases()> function can be used to get all the synonyms
195of the bidi type name.
196
a452d459
KW
197=item B<decomposition>
198
199is empty if I<code> has no decomposition; or is one or more codes
a18e976f 200(separated by spaces) that, taken in order, represent a decomposition for
a452d459
KW
201I<code>. Each has at least four hexdigits.
202The codes may be preceded by a word enclosed in angle brackets then a space,
203like C<E<lt>compatE<gt> >, giving the type of decomposition
204
06bba7d5
KW
205This decomposition may be an intermediate one whose components are also
206decomposable. Use L<Unicode::Normalize> to get the final decomposition.
207
a452d459
KW
208=item B<decimal>
209
210if I<code> is a decimal digit this is its integer numeric value
211
212=item B<digit>
213
89e4a205
KW
214if I<code> represents some other digit-like number, this is its integer
215numeric value
a452d459
KW
216
217=item B<numeric>
218
219if I<code> represents a whole or rational number, this is its numeric value.
220Rational values are expressed as a string like C<1/4>.
221
222=item B<mirrored>
223
224C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
225
226=item B<unicode10>
227
228name of I<code> in the Unicode 1.0 standard if one
229existed for this code point and is different from the current name
230
231=item B<comment>
232
89e4a205 233As of Unicode 6.0, this is always empty.
a452d459
KW
234
235=item B<upper>
236
06bba7d5 237is empty if there is no single code point uppercase mapping for I<code>
4f66642e 238(its uppercase mapping is itself);
a452d459
KW
239otherwise it is that mapping expressed as at least four hexdigits.
240(L</casespec()> should be used in addition to B<charinfo()>
241for case mappings when the calling program can cope with multiple code point
242mappings.)
243
244=item B<lower>
245
06bba7d5 246is empty if there is no single code point lowercase mapping for I<code>
4f66642e 247(its lowercase mapping is itself);
a452d459
KW
248otherwise it is that mapping expressed as at least four hexdigits.
249(L</casespec()> should be used in addition to B<charinfo()>
250for case mappings when the calling program can cope with multiple code point
251mappings.)
252
253=item B<title>
254
06bba7d5 255is empty if there is no single code point titlecase mapping for I<code>
4f66642e 256(its titlecase mapping is itself);
a452d459
KW
257otherwise it is that mapping expressed as at least four hexdigits.
258(L</casespec()> should be used in addition to B<charinfo()>
259for case mappings when the calling program can cope with multiple code point
260mappings.)
261
262=item B<block>
263
a18e976f 264the block I<code> belongs to (used in C<\p{Blk=...}>).
a452d459
KW
265See L</Blocks versus Scripts>.
266
267
268=item B<script>
269
a18e976f 270the script I<code> belongs to.
a452d459
KW
271See L</Blocks versus Scripts>.
272
273=back
32c16050
JH
274
275Note that you cannot do (de)composition and casing based solely on the
a452d459
KW
276I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
277you will also need the L</compexcl()> and L</casespec()> functions.
561c79ed
JH
278
279=cut
280
e10d7780 281# NB: This function is nearly duplicated in charnames.pm
10a6ecd2
JH
sub _getcode {

    # Turn a "code point argument" into a number.
    #   - a string of decimal digits not starting with 0 is returned as-is;
    #   - a string of hex digits, optionally prefixed by "U+", "u+", "0x"
    #     or "0X", is converted with hex();
    #   - anything else yields an empty return (undef in scalar context).
    my ($input) = @_;

    return $input  if $input =~ /^[1-9]\d*$/;
    return hex($1) if $input =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;

    return;
}
293
05dbc6f8
KW
294# Populated by _num. Converts real number back to input rational
295my %real_to_rational;
296
297# To store the contents of files found on disk.
298my @BIDIS;
299my @CATEGORIES;
300my @DECOMPOSITIONS;
301my @NUMERIC_TYPES;
5c3b35c9
KW
302my %SIMPLE_LOWER;
303my %SIMPLE_TITLE;
304my %SIMPLE_UPPER;
305my %UNICODE_1_NAMES;
05dbc6f8 306
# charinfo($arg)
#
# Builds and returns a reference to a hash describing one code point.
# $arg is converted to a number by _getcode(); the function croaks with an
# "unknown code" message when that conversion fails.
#
# A bare return (undef in scalar context) is made when the code point is
# above 0x10FFFF or when its general category is 'Cn'.
#
# Keys set in the returned hash: category, code, name, combining, bidi,
# decomposition, decimal, digit, numeric, mirrored, unicode10, comment,
# upper, lower, title, block and script.
#
# Each table is read with _read_table() only while the file-level variable
# that caches it is still empty, so repeated calls reuse the data.
05dbc6f8 307sub charinfo {
a6fa416b 308
05dbc6f8
KW
309 # This function has traditionally mimicked what is in UnicodeData.txt,
310 # warts and all. This is a re-write that avoids UnicodeData.txt so that
311 # it can be removed to save disk space. Instead, this assembles
312 # information gotten by other methods that get data from various other
313 # files. It uses charnames to get the character name; and various
314 # mktables tables.
324f9e44 315
05dbc6f8 316 use feature 'unicode_strings';
a6fa416b 317
10a6ecd2
JH
318 my $arg = shift;
319 my $code = _getcode($arg);
05dbc6f8
KW
320 croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
321
322 # Non-unicode implies undef.
323 return if $code > 0x10FFFF;
324
325 my %prop;
326 my $char = chr($code);
327
35a865d4 328 @CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
05dbc6f8
KW
329 $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
330 // $utf8::SwashInfo{'ToGc'}{'missing'};
331
332 return if $prop{'category'} eq 'Cn'; # Unassigned code points are undef
333
334 $prop{'code'} = sprintf "%04X", $code;
335 $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
336 : (charnames::viacode($code) // "");
337
338 $prop{'combining'} = getCombinClass($code);
339
35a865d4 340 @BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
05dbc6f8
KW
341 $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
342 // $utf8::SwashInfo{'ToBc'}{'missing'};
343
344 # For most code points, we can just read in "unicore/Decomposition.pl", as
345 # its contents are exactly what should be output. But that file doesn't
346 # contain the data for the Hangul syllable decompositions, which can be
94c91ffc
KW
347 # algorithmically computed, and NFD() does that, so we call NFD() for
348 # those. We can't use NFD() for everything, as it does a complete
05dbc6f8 349 # recursive decomposition, and what this function has always done is to
94c91ffc
KW
350 # return what's in UnicodeData.txt which doesn't show that recursiveness.
351 # Fortunately, the NFD() of the Hanguls doesn't have any recursion
352 # issues.
353 # Having no decomposition implies an empty field; otherwise, all but
354 # "Canonical" imply a compatible decomposition, and the type is prefixed
355 # to that, as it is in UnicodeData.txt
05dbc6f8
KW
356 if ($char =~ /\p{Block=Hangul_Syllables}/) {
357 # The code points of the decomposition are output in standard Unicode
358 # hex format, separated by blanks.
359 $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
94c91ffc 360 unpack "U*", NFD($char);
a6fa416b 361 }
05dbc6f8 362 else {
35a865d4 363 @DECOMPOSITIONS = _read_table("Decomposition.pl")
05dbc6f8
KW
364 unless @DECOMPOSITIONS;
365 $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
366 $code) // "";
561c79ed 367 }
05dbc6f8
KW
368
369 # Can use num() to get the numeric values, if any.
370 if (! defined (my $value = num($char))) {
371 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
372 }
373 else {
374 if ($char =~ /\d/) {
375 $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
376 }
377 else {
378
379 # For non-decimal-digits, we have to read in the Numeric type
380 # to distinguish them. It is not just a matter of integer vs.
381 # rational, as some whole number values are not considered digits,
382 # e.g., TAMIL NUMBER TEN.
383 $prop{'decimal'} = "";
384
35a865d4 385 @NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
05dbc6f8
KW
386 if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
387 eq 'Digit')
388 {
389 $prop{'digit'} = $prop{'numeric'} = $value;
390 }
391 else {
392 $prop{'digit'} = "";
393 $prop{'numeric'} = $real_to_rational{$value} // $value;
394 }
395 }
396 }
397
398 $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
399
35a865d4 400 %UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
5c3b35c9 401 $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
05dbc6f8
KW
402
403 # This is true starting in 6.0, but, num() also requires 6.0, so
404 # don't need to test for version again here.
405 $prop{'comment'} = "";
406
# For upper, lower and title below: when the table has an entry for $code,
# that entry is added to $code and the sum is formatted as at least four
# hex digits; with no entry the field is the empty string.
35a865d4 407 %SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
bf7fe2df
KW
408 $prop{'upper'} = (defined $SIMPLE_UPPER{$code})
409 ? sprintf("%04X", $SIMPLE_UPPER{$code} + $code)
410 : "";
75e7c50b 411
35a865d4 412 %SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
bf7fe2df
KW
413 $prop{'lower'} = (defined $SIMPLE_LOWER{$code})
414 ? sprintf("%04X", $SIMPLE_LOWER{$code} + $code)
415 : "";
75e7c50b 416
35a865d4 417 %SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
bf7fe2df
KW
418 $prop{'title'} = (defined $SIMPLE_TITLE{$code})
419 ? sprintf("%04X", $SIMPLE_TITLE{$code} + $code)
420 : "";
05dbc6f8
KW
421
422 $prop{block} = charblock($code);
423 $prop{script} = charscript($code);
424 return \%prop;
561c79ed
JH
425}
426
e882dd67
JH
sub _search {

    # Binary search of a table of the form [[lo, hi, value], ...] that is
    # sorted by range start.  Only the slice $lo .. $hi of the table is
    # examined.  Returns the value ([2]) of the range that contains $code,
    # or an empty return when no range in the slice does.
    my ($table, $lo, $hi, $code) = @_;

    while ($lo <= $hi) {
        my $mid = int(($lo + $hi) / 2);

        # The range starts beyond $code: continue in the lower half.
        if ($table->[$mid]->[0] > $code) {
            $hi = $mid - 1;
            next;
        }

        # The range starts at or before $code; it is a hit when it starts
        # exactly there, or when it extends at least as far as $code.
        return $table->[$mid]->[2]
            if $table->[$mid]->[0] == $code
            || $table->[$mid]->[1] >= $code;

        # The range ends before $code: continue in the upper half.
        $lo = $mid + 1;
    }

    return;
}
446
sub _read_table ($;$) {

    # Loads the mktables-generated file "unicore/$table" with 'do' and
    # returns its contents as a list.
    #
    # Parameters:
    #   $table       - file name relative to "unicore", e.g. "To/Sc.pl"
    #   $return_hash - optional; when true a hash is returned, otherwise
    #                  an array
    #
    # Hash return: one key per code point, whose value is the value the
    # table gives for that code point.  Handy for fast lookup when the
    # table consists of single code point entries.
    #
    # Array return: a list of [ start, end, value ] ranges.  An entry that
    # begins immediately after the previous range and has the same value
    # is folded into that range.  Uses much less memory when the table has
    # large ranges.
    #
    # Callers also rely on the loaded file having set
    # $utf8::SwashInfo{$property}{'format'} (the mktables format of the
    # table) and $utf8::SwashInfo{$property}{'missing'} (the value of all
    # entries not listed), where $property is the Unicode property name,
    # preceded by 'To' for map properties, e.g. 'ToSc'.
    #
    # Table entries look like one of:
    # 0000	0040	Common	# [65]
    # 00AA		Latin

    my ($table, $return_hash) = @_;
    $return_hash = 0 unless defined $return_hash;

    my @ranges;
    my %value_of;
    local $_;

    foreach my $line (split /^/m, do "unicore/$table") {
        my ($start, $end, $value) = $line =~ / ^ (.+?) \t (.*?) \t (.+?)
                                        \s* ( \# .* )?  # Optional comment
                                        $ /x;
        my $first = hex $start;
        my $last  = ($end eq "") ? $first : hex $end;

        if ($return_hash) {
            $value_of{$_} = $value for $first .. $last;
            next;
        }

        my $previous = @ranges ? $ranges[-1] : undef;
        if (   $previous
            && $previous->[1] == $first - 1
            && $previous->[2] eq $value)
        {
            # Merely a continuation of the previous range: extend it.
            $previous->[1] = $last;
        }
        else {
            push @ranges, [ $first, $last, $value ];
        }
    }

    return %value_of if $return_hash;
    return @ranges;
}
503
10a6ecd2
JH
# charinrange($range, $arg)
#
# Looks up the code point argument $arg in the range set $range (a
# reference to a list of ranges) using _search() and returns whatever
# _search() returns.  Croaks when $arg cannot be turned into a code point.
sub charinrange {
    my ($range, $arg) = @_;

    my $code = _getcode($arg);
    if (! defined $code) {
        croak __PACKAGE__, "::charinrange: unknown code '$arg'";
    }

    return _search($range, 0, $#$range, $code);
}
511
a452d459 512=head2 B<charblock()>
561c79ed 513
55d7b906 514 use Unicode::UCD 'charblock';
561c79ed
JH
515
516 my $charblock = charblock(0x41);
10a6ecd2 517 my $charblock = charblock(1234);
a452d459 518 my $charblock = charblock(0x263a);
10a6ecd2
JH
519 my $charblock = charblock("U+263a");
520
78bf21c2 521 my $range = charblock('Armenian');
10a6ecd2 522
a452d459 523With a L</code point argument> charblock() returns the I<block> the code point
430fe03d
KW
524belongs to, e.g. C<Basic Latin>. The old-style block name is returned (see
525L</Old-style versus new-style block names>).
a452d459 526If the code point is unassigned, this returns the block it would belong to if
a18e976f 527it were assigned.
10a6ecd2 528
78bf21c2
JH
529See also L</Blocks versus Scripts>.
530
18972f4b 531If supplied with an argument that can't be a code point, charblock() tries to
430fe03d
KW
532do the opposite and interpret the argument as an old-style block name. The
533return value
a18e976f
KW
534is a I<range set> with one range: an anonymous list with a single element that
535consists of another anonymous list whose first element is the first code point
536in the block, and whose second (and final) element is the final code point in
537the block. (The extra list consisting of just one element is so that the same
538program logic can be used to handle both this return, and the return from
539L</charscript()> which can have multiple ranges.) You can test whether a code
540point is in a range using the L</charinrange()> function. If the argument is
541not a known block, C<undef> is returned.
561c79ed 542
561c79ed
JH
543=cut
544
545my @BLOCKS;
10a6ecd2 546my %BLOCKS;
561c79ed 547
sub _charblocks {

    # Fills @BLOCKS (list of [ first, last, name ] ranges in file order)
    # and %BLOCKS (block name => list of those same ranges) from
    # Blocks.txt.  Nothing is done when @BLOCKS is already populated.
    #
    # Can't read from the mktables table because it loses the hyphens in
    # the original.
    return if @BLOCKS;
    return unless openunicode(\$BLOCKSFH, "Blocks.txt");

    local $_;
    local $/ = "\n";
    while (<$BLOCKSFH>) {

        # Only lines of the form "XXXX..YYYY; Block Name" are of interest.
        next unless /^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/;

        my $name     = $3;
        my $subrange = [ hex($1), hex($2), $name ];
        push @BLOCKS, $subrange;
        push @{ $BLOCKS{$name} }, $subrange;
    }
    close($BLOCKSFH);
}
568
# charblock($arg)
#
# With a code point argument, returns the name of the block that contains
# the code point, or 'No_Block' when no block in the table contains it.
#
# With any other argument, treats it as a block name and returns a deep
# copy of that block's range set (a reference to a list of
# [ first, last, name ] ranges).  When the name is not a known block,
# returns undef in scalar context and the empty list in list context.
sub charblock {
    my $arg = shift;

    _charblocks() unless @BLOCKS;

    my $code = _getcode($arg);

    if (defined $code) {
        my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
        return $result if defined $result;
        return 'No_Block';
    }
    elsif (exists $BLOCKS{$arg}) {
        return dclone $BLOCKS{$arg};
    }

    # Without this explicit return the sub would yield the false value of
    # the 'exists' test above (an empty string) rather than the documented
    # undef.  This also matches what charscript() does.
    return;
}
585
a452d459 586=head2 B<charscript()>
e882dd67 587
55d7b906 588 use Unicode::UCD 'charscript';
e882dd67
JH
589
590 my $charscript = charscript(0x41);
10a6ecd2
JH
591 my $charscript = charscript(1234);
592 my $charscript = charscript("U+263a");
e882dd67 593
78bf21c2 594 my $range = charscript('Thai');
10a6ecd2 595
a452d459
KW
596With a L</code point argument> charscript() returns the I<script> the
597code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
bb2d29dc 598If the code point is unassigned, it returns C<"Unknown">.
78bf21c2 599
eb0cc9e3 600If supplied with an argument that can't be a code point, charscript() tries
a18e976f
KW
601to do the opposite and interpret the argument as a script name. The
602return value is a I<range set>: an anonymous list of lists that contain
eb0cc9e3 603I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a18e976f
KW
604code point is in a range set using the L</charinrange()> function. If the
605argument is not a known script, C<undef> is returned.
a452d459
KW
606
607See also L</Blocks versus Scripts>.
e882dd67 608
e882dd67
JH
609=cut
610
611my @SCRIPTS;
10a6ecd2 612my %SCRIPTS;
e882dd67 613
sub _charscripts {

    # Reads the script table into @SCRIPTS when that is still empty, then
    # indexes every range by its script name in %SCRIPTS.
    if (! @SCRIPTS) {
        @SCRIPTS = _read_table("To/Sc.pl");
    }

    for my $range (@SCRIPTS) {

        # Preserve old-style casing: lowercase whatever follows an
        # underscore in the script name.
        $range->[2] =~ s/(_\w)/\L$1/g;

        my $script = $range->[2];
        push @{ $SCRIPTS{$script} }, $range;
    }
}
621
# charscript($arg)
#
# With a code point argument, returns the script of the range that
# contains it, or $utf8::SwashInfo{'ToSc'}{'missing'} when no range does.
#
# With any other argument, treats it as a script name and returns a deep
# copy of that script's range set; an unknown name gives an empty return.
sub charscript {
    my ($arg) = @_;

    if (! @SCRIPTS) {
        _charscripts();
    }

    my $code = _getcode($arg);

    if (! defined $code) {

        # Not a code point, so it has to be a script name.
        return dclone($SCRIPTS{$arg}) if exists $SCRIPTS{$arg};
        return;
    }

    return _search(\@SCRIPTS, 0, $#SCRIPTS, $code)
           // $utf8::SwashInfo{'ToSc'}{'missing'};
}
639
a452d459 640=head2 B<charblocks()>
10a6ecd2 641
55d7b906 642 use Unicode::UCD 'charblocks';
10a6ecd2 643
b08cd201 644 my $charblocks = charblocks();
10a6ecd2 645
b08cd201 646charblocks() returns a reference to a hash with the known block names
a452d459 647as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 648
430fe03d
KW
649The names are in the old-style (see L</Old-style versus new-style block
650names>).
651
62b3b855
KW
652L<prop_invmap("block")|/prop_invmap()> can be used to get this same data in a
653different type of data structure.
654
78bf21c2
JH
655See also L</Blocks versus Scripts>.
656
10a6ecd2
JH
657=cut
658
# Returns a reference to a deep copy of %BLOCKS (block name => range set),
# loading the block data first if %BLOCKS is still empty.
sub charblocks {
    if (! %BLOCKS) {
        _charblocks();
    }
    return dclone(\%BLOCKS);
}
663
a452d459 664=head2 B<charscripts()>
10a6ecd2 665
55d7b906 666 use Unicode::UCD 'charscripts';
10a6ecd2 667
ea508aee 668 my $charscripts = charscripts();
10a6ecd2 669
ea508aee 670charscripts() returns a reference to a hash with the known script
a452d459 671names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 672the values.
10a6ecd2 673
62b3b855
KW
674L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
675different type of data structure.
676
78bf21c2
JH
677See also L</Blocks versus Scripts>.
678
10a6ecd2
JH
679=cut
680
# Returns a reference to a deep copy of %SCRIPTS (script name => range
# set), loading the script data first if %SCRIPTS is still empty.
sub charscripts {
    if (! %SCRIPTS) {
        _charscripts();
    }
    return dclone(\%SCRIPTS);
}
685
a452d459 686=head2 B<charinrange()>
10a6ecd2 687
f200dd12 688In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
10a6ecd2 689can also test whether a code point is in the I<range> as returned by
a452d459
KW
690L</charblock()> and L</charscript()> or as the values of the hash returned
691by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 692
55d7b906 693 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
694
695 $range = charscript('Hiragana');
e145285f 696 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
697
698=cut
699
ea508aee
JH
700my %GENERAL_CATEGORIES =
701 (
702 'L' => 'Letter',
703 'LC' => 'CasedLetter',
704 'Lu' => 'UppercaseLetter',
705 'Ll' => 'LowercaseLetter',
706 'Lt' => 'TitlecaseLetter',
707 'Lm' => 'ModifierLetter',
708 'Lo' => 'OtherLetter',
709 'M' => 'Mark',
710 'Mn' => 'NonspacingMark',
711 'Mc' => 'SpacingMark',
712 'Me' => 'EnclosingMark',
713 'N' => 'Number',
714 'Nd' => 'DecimalNumber',
715 'Nl' => 'LetterNumber',
716 'No' => 'OtherNumber',
717 'P' => 'Punctuation',
718 'Pc' => 'ConnectorPunctuation',
719 'Pd' => 'DashPunctuation',
720 'Ps' => 'OpenPunctuation',
721 'Pe' => 'ClosePunctuation',
722 'Pi' => 'InitialPunctuation',
723 'Pf' => 'FinalPunctuation',
724 'Po' => 'OtherPunctuation',
725 'S' => 'Symbol',
726 'Sm' => 'MathSymbol',
727 'Sc' => 'CurrencySymbol',
728 'Sk' => 'ModifierSymbol',
729 'So' => 'OtherSymbol',
730 'Z' => 'Separator',
731 'Zs' => 'SpaceSeparator',
732 'Zl' => 'LineSeparator',
733 'Zp' => 'ParagraphSeparator',
734 'C' => 'Other',
735 'Cc' => 'Control',
736 'Cf' => 'Format',
737 'Cs' => 'Surrogate',
738 'Co' => 'PrivateUse',
739 'Cn' => 'Unassigned',
740 );
741
# Returns a reference to a deep copy of %GENERAL_CATEGORIES, so that a
# caller modifying the result cannot alter the module's own table.
sub general_categories {
    my $copy = dclone(\%GENERAL_CATEGORIES);
    return $copy;
}
745
a452d459 746=head2 B<general_categories()>
ea508aee
JH
747
748 use Unicode::UCD 'general_categories';
749
750 my $categories = general_categories();
751
a452d459 752This returns a reference to a hash which has short
ea508aee
JH
753general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
754names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
755C<Symbol>) as values. The hash is reversible in case you need to go
756from the long names to the short names. The general category is the
a452d459
KW
757one returned from
758L</charinfo()> under the C<category> key.
ea508aee 759
7ef25837
KW
760The L</prop_value_aliases()> function can be used to get all the synonyms of
761the category name.
762
ea508aee
JH
763=cut
764
765my %BIDI_TYPES =
766 (
767 'L' => 'Left-to-Right',
768 'LRE' => 'Left-to-Right Embedding',
769 'LRO' => 'Left-to-Right Override',
770 'R' => 'Right-to-Left',
771 'AL' => 'Right-to-Left Arabic',
772 'RLE' => 'Right-to-Left Embedding',
773 'RLO' => 'Right-to-Left Override',
774 'PDF' => 'Pop Directional Format',
775 'EN' => 'European Number',
776 'ES' => 'European Number Separator',
777 'ET' => 'European Number Terminator',
778 'AN' => 'Arabic Number',
779 'CS' => 'Common Number Separator',
780 'NSM' => 'Non-Spacing Mark',
781 'BN' => 'Boundary Neutral',
782 'B' => 'Paragraph Separator',
783 'S' => 'Segment Separator',
784 'WS' => 'Whitespace',
785 'ON' => 'Other Neutrals',
786 );
787
a452d459 788=head2 B<bidi_types()>
ea508aee
JH
789
790 use Unicode::UCD 'bidi_types';
791
792 my $types = bidi_types();
793
a452d459 794This returns a reference to a hash which has the short
ea508aee
JH
795bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
796names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
797hash is reversible in case you need to go from the long names to the
a452d459
KW
798short names. The bidi type is the one returned from
799L</charinfo()>
ea508aee
JH
800under the C<bidi> key. For the exact meaning of the various bidi classes
801the Unicode TR9 is recommended reading:
a452d459 802L<http://www.unicode.org/reports/tr9/>
ea508aee
JH
803(as of Unicode 5.0.0)
804
7ef25837
KW
805The L</prop_value_aliases()> function can be used to get all the synonyms of
806the bidi type name.
807
ea508aee
JH
808=cut
809
a452d459
KW
# Returns a reference to a deep copy of %BIDI_TYPES, so that a caller
# modifying the result cannot alter the module's own table.
sub bidi_types {
    my $copy = dclone(\%BIDI_TYPES);
    return $copy;
}
813
814=head2 B<compexcl()>
b08cd201 815
55d7b906 816 use Unicode::UCD 'compexcl';
b08cd201 817
a452d459 818 my $compexcl = compexcl(0x09dc);
b08cd201 819
71a442a8
KW
820This routine is included for backwards compatibility, but as of Perl 5.12, for
821most purposes it is probably more convenient to use one of the following
822instead:
823
824 my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex}/;
825 my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion}/;
826
827or even
828
829 my $compexcl = chr(0x09dc) =~ /\p{CE}/;
830 my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion}/;
831
832The first two forms return B<true> if the L</code point argument> should not
76b05678
KW
833be produced by composition normalization. For the final two forms to return
834B<true>, it is additionally required that this fact not otherwise be
835determinable from the Unicode data base.
71a442a8
KW
836
837This routine behaves identically to the final two forms. That is,
838it does not return B<true> if the code point has a decomposition
a452d459
KW
839consisting of another single code point, nor if its decomposition starts
840with a code point whose combining class is non-zero. Code points that meet
841either of these conditions should also not be produced by composition
71a442a8
KW
842normalization, which is probably why you should use the
843C<Full_Composition_Exclusion> property instead, as shown above.
b08cd201 844
71a442a8 845The routine returns B<false> otherwise.
b08cd201
JH
846
847=cut
848
b08cd201
JH
# compexcl($arg)
#
# Returns the result of matching the character for the code point argument
# against \p{Composition_Exclusion}.  Croaks when $arg cannot be turned
# into a code point.
sub compexcl {
    my ($arg) = @_;

    my $code = _getcode($arg);
    if (! defined $code) {
        croak __PACKAGE__, "::compexcl: unknown code '$arg'";
    }

    no warnings "non_unicode";     # So works on non-Unicode code points
    my $char = chr($code);
    return $char =~ /\p{Composition_Exclusion}/;
}
858
a452d459 859=head2 B<casefold()>
b08cd201 860
55d7b906 861 use Unicode::UCD 'casefold';
b08cd201 862
a452d459
KW
863 my $casefold = casefold(0xDF);
864 if (defined $casefold) {
865 my @full_fold_hex = split / /, $casefold->{'full'};
866 my $full_fold_string =
867 join "", map {chr(hex($_))} @full_fold_hex;
868 my @turkic_fold_hex =
869 split / /, ($casefold->{'turkic'} ne "")
870 ? $casefold->{'turkic'}
871 : $casefold->{'full'};
872 my $turkic_fold_string =
873 join "", map {chr(hex($_))} @turkic_fold_hex;
874 }
875 if (defined $casefold && $casefold->{'simple'} ne "") {
876 my $simple_fold_hex = $casefold->{'simple'};
877 my $simple_fold_string = chr(hex($simple_fold_hex));
878 }
b08cd201 879
a452d459
KW
880This returns the (almost) locale-independent case folding of the
881character specified by the L</code point argument>.
b08cd201 882
a18e976f 883If there is no case folding for that code point, C<undef> is returned.
a452d459
KW
884
885If there is a case folding for that code point, a reference to a hash
b08cd201
JH
886with the following fields is returned:
887
a452d459
KW
888=over
889
890=item B<code>
891
892the input L</code point argument> expressed in hexadecimal, with leading zeros
893added if necessary to make it contain at least four hexdigits
894
895=item B<full>
896
a18e976f 897one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
898code points for the case folding for I<code>.
899Each has at least four hexdigits.
900
901=item B<simple>
902
903is empty, or is exactly one code with at least four hexdigits which can be used
904as an alternative case folding when the calling program cannot cope with the
905fold being a sequence of multiple code points. If I<full> is just one code
906point, then I<simple> equals I<full>. If there is no single code point folding
907defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
908inferior, but still better-than-nothing alternative folding to I<full>.
909
910=item B<mapping>
911
912is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
913otherwise. It can be considered to be the simplest possible folding for
914I<code>. It is defined primarily for backwards compatibility.
915
916=item B<status>
b08cd201 917
a452d459
KW
918is C<C> (for C<common>) if the best possible fold is a single code point
919(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
920folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
a18e976f
KW
921there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
922Note that this
a452d459
KW
923describes the contents of I<mapping>. It is defined primarily for backwards
924compatibility.
b08cd201 925
a452d459
KW
926On versions 3.1 and earlier of Unicode, I<status> can also be
927C<I> which is the same as C<C> but is a special case for dotted uppercase I and
928dotless lowercase i:
b08cd201 929
a452d459 930=over
b08cd201 931
a18e976f 932=item B<*> If you use this C<I> mapping
a452d459 933
a18e976f 934the result is case-insensitive,
a452d459
KW
935but dotless and dotted I's are not distinguished
936
a18e976f 937=item B<*> If you exclude this C<I> mapping
a452d459 938
a18e976f 939the result is not fully case-insensitive, but
a452d459
KW
940dotless and dotted I's are distinguished
941
942=back
943
944=item B<turkic>
945
946contains any special folding for Turkic languages. For versions of Unicode
947starting with 3.2, this field is empty unless I<code> has a different folding
948in Turkic languages, in which case it is one or more codes (separated by
a18e976f 949spaces) that, taken in order, give the code points for the case folding for
a452d459
KW
950I<code> in those languages.
951Each code has at least four hexdigits.
952Note that this folding does not maintain canonical equivalence without
953additional processing.
954
955For versions of Unicode 3.1 and earlier, this field is empty unless there is a
956special folding for Turkic languages, in which case I<status> is C<I>, and
957I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
958
959=back
960
961Programs that want complete generality and the best folding results should use
962the folding contained in the I<full> field. But note that the fold for some
963code points will be a sequence of multiple code points.
964
965Programs that can't cope with the fold mapping being multiple code points can
966use the folding contained in the I<simple> field, with the loss of some
967generality. In Unicode 5.1, about 7% of the defined foldings have no single
968code point folding.
969
970The I<mapping> and I<status> fields are provided for backwards compatibility for
971existing programs. They contain the same values as in previous versions of
972this function.
973
974Locale is not completely independent. The I<turkic> field contains results to
975use when the locale is a Turkic language.
b08cd201
JH
976
977For more information about case mappings see
a452d459 978L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
979
980=cut
981
# Cache filled from CaseFolding.txt by _casefold(), keyed by numeric code
# point.  Each value is a hash holding the fields assigned below ('code',
# 'turkic', and, depending on the line's status letter, 'status', 'full',
# 'simple' and 'mapping').
my %CASEFOLD;

# Populate %CASEFOLD on first use.  Does nothing when the cache is already
# filled or when CaseFolding.txt cannot be opened.
sub _casefold {
    return if %CASEFOLD;
    return unless openunicode(\$CASEFOLDFH, "CaseFolding.txt");

    local $_;
    local $/ = "\n";
    while (<$CASEFOLDFH>) {
        next unless /^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/;
        my ($hex, $status, $fold) = ($1, $2, $3);

        # All lines for one code point accumulate into the same entry.
        my $entry = $CASEFOLD{ hex($hex) } ||= {};
        $entry->{'code'} = $hex;
        $entry->{'turkic'} = "" unless defined $entry->{'turkic'};

        if ($status eq 'C' || $status eq 'I') {

            # 'I' is only on 3.1 and earlier Unicodes.  Both entries there
            # (I only checked 3.1) are the same as C, and there are no other
            # entries for those codepoints, so treat as if C, but override
            # the turkic one for 'I'.
            $entry->{'status'} = $status;
            @{$entry}{ 'mapping', 'simple', 'full' } = ($fold) x 3;
            $entry->{'turkic'} = $fold if $status eq 'I';
        }
        elsif ($status eq 'F') {
            $entry->{'full'} = $fold;

            # Only fill in the remaining fields if no simple fold has been
            # recorded for this code point so far.
            unless (defined $entry->{'simple'}) {
                $entry->{'simple'}  = "";
                $entry->{'mapping'} = $fold;
                $entry->{'status'}  = $status;
            }
        }
        elsif ($status eq 'S') {

            # There can't be a simple without a full, and simple
            # overrides all but full
            @{$entry}{ 'simple', 'mapping' } = ($fold) x 2;
            $entry->{'status'} = $status;
        }
        elsif ($status eq 'T') {
            $entry->{'turkic'} = $fold;
        }    # else can't happen because only [CIFST] are possible
    }
    close($CASEFOLDFH);
}
1035
# Return the case folding information for the given code point argument,
# as a reference to a hash with the fields described in the pod above, or
# undef if the code point has no case folding.  Croaks when _getcode()
# cannot turn the argument into a code point.
#
# The hash handed back is a copy, as casespec() also does, so that a caller
# that modifies the result cannot corrupt the %CASEFOLD cache used by later
# calls.
sub casefold {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::casefold: unknown code '$arg'"
        unless defined $code;

    _casefold() unless %CASEFOLD;

    # A plain rvalue lookup, so no entry is autovivified for a miss.
    my $entry = $CASEFOLD{$code};
    return $entry unless ref $entry;

    return dclone($entry);
}
1046
a452d459 1047=head2 B<casespec()>
b08cd201 1048
55d7b906 1049 use Unicode::UCD 'casespec';
b08cd201 1050
a452d459 1051 my $casespec = casespec(0xFB00);
b08cd201 1052
a452d459
KW
1053This returns the potentially locale-dependent case mappings of the L</code point
1054argument>. The mappings may be longer than a single code point (which the basic
1055Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 1056
a452d459
KW
1057If there are no case mappings for the L</code point argument>, or if all three
1058possible mappings (I<lower>, I<title> and I<upper>) result in single code
a18e976f 1059points and are locale independent and unconditional, C<undef> is returned
5d8e6e41
KW
1060(which means that the case mappings, if any, for the code point are those
1061returned by L</charinfo()>).
a452d459
KW
1062
1063Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41
KW
1064of such hashes, explained below) is returned with the following keys and their
1065meanings:
a452d459
KW
1066
1067The keys in the bottom layer hash with the meanings of their values are:
1068
1069=over
1070
1071=item B<code>
1072
1073the input L</code point argument> expressed in hexadecimal, with leading zeros
1074added if necessary to make it contain at least four hexdigits
1075
1076=item B<lower>
1077
a18e976f 1078one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1079code points for the lower case of I<code>.
1080Each has at least four hexdigits.
1081
1082=item B<title>
b08cd201 1083
a18e976f 1084one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1085code points for the title case of I<code>.
1086Each has at least four hexdigits.
b08cd201 1087
d2da20e3 1088=item B<upper>
b08cd201 1089
a18e976f 1090one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1091code points for the upper case of I<code>.
1092Each has at least four hexdigits.
1093
1094=item B<condition>
1095
1096the conditions for the mappings to be valid.
a18e976f 1097If C<undef>, the mappings are always valid.
a452d459
KW
1098When defined, this field is a list of conditions,
1099all of which must be true for the mappings to be valid.
1100The list consists of one or more
1101I<locales> (see below)
1102and/or I<contexts> (explained in the next paragraph),
1103separated by spaces.
1104(Other than as used to separate elements, spaces are to be ignored.)
1105Case distinctions in the condition list are not significant.
82c0b05b 1106Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 1107
a452d459
KW
1108A I<context> is one of those defined in the Unicode standard.
1109For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1110available at
5d8e6e41
KW
1111L<http://www.unicode.org/versions/Unicode5.1.0/>.
1112These are for context-sensitive casing.
f499c386 1113
a452d459
KW
1114=back
1115
5d8e6e41 1116The hash described above is returned for locale-independent casing, where
a18e976f 1117at least one of the mappings has length longer than one. If C<undef> is
5d8e6e41
KW
1118returned, the code point may have mappings, but if so, all are length one,
1119and are returned by L</charinfo()>.
1120Note that when this function does return a value, it will be for the complete
1121set of mappings for a code point, even those whose length is one.
1122
1123If there are additional casing rules that apply only in certain locales,
1124an additional key for each will be defined in the returned hash. Each such key
1125will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1126followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1127and a variant code). For the lists of all possible locales, see
1128L<Locale::Country> and L<Locale::Language>.
89e4a205 1129(In Unicode 6.0, the only locales returned by this function
a452d459 1130are C<lt>, C<tr>, and C<az>.)
b08cd201 1131
5d8e6e41
KW
1132Each locale key is a reference to a hash that has the form above, and gives
1133the casing rules for that particular locale, which take precedence over the
1134locale-independent ones when in that locale.
1135
1136If the only casing for a code point is locale-dependent, then the returned
1137hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1138will contain only locale keys.
1139
b08cd201 1140For more information about case mappings see
a452d459 1141L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1142
1143=cut
1144
# Cache filled from SpecialCasing.txt by _casespec(), keyed by numeric code
# point.  A value is either a flat hash with the keys code, lower, title,
# upper and condition, or a hash that (also) contains such hashes under
# locale-name keys.
my %CASESPEC;

# Populate %CASESPEC on first use.  Does nothing when the cache is already
# filled or when SpecialCasing.txt cannot be opened.
sub _casespec {
    return if %CASESPEC;
    return unless openunicode(\$CASESPECFH, "SpecialCasing.txt");

    local $_;
    local $/ = "\n";
    while (<$CASESPECFH>) {
        next unless /^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/;

        my $hexcode = $1;
        my $when    = $5;
        my %mapping = (
            code      => $hexcode,
            lower     => $2,
            title     => $3,
            upper     => $4,
            condition => $when,
        );
        my $code = hex($hexcode);

        # First line seen for this code point: store it flat.
        unless (exists $CASESPEC{$code}) {
            $CASESPEC{$code} = \%mapping;
            next;
        }

        # A later line for the same code point.  If what is stored so far is
        # a flat entry that itself has a condition, move that entry under a
        # key taken from the start of its condition (a two-lowercase-letter
        # name, optionally followed by "_" and more).
        my $stored = $CASESPEC{$code};
        if (exists $stored->{code} && defined $stored->{condition}) {
            my %previous;
            @previous{qw(lower title upper condition)}
                                = @{$stored}{qw(lower title upper condition)};
            $previous{code} = $hexcode;

            my ($oldlocale) =
                       ($previous{condition} =~ /^([a-z][a-z](?:_\S+)?)/);
            delete $CASESPEC{$code};
            $CASESPEC{$code}->{$oldlocale} = \%previous;
        }

        # The new line always goes under the key extracted from its own
        # condition in the same way.
        my ($locale) = ($when =~ /^([a-z][a-z](?:_\S+)?)/);
        $CASESPEC{$code}->{$locale} = \%mapping;
    }
    close($CASESPECFH);
}
1201
# Return the special casing information for the given code point argument,
# or undef if %CASESPEC has no entry for it.  A found entry is returned as
# a deep copy, so callers cannot alter the cache.  Croaks when _getcode()
# cannot turn the argument into a code point.
sub casespec {
    my $arg  = shift;
    my $code = _getcode($arg);
    defined $code
        or croak __PACKAGE__, "::casespec: unknown code '$arg'";

    _casespec() unless %CASESPEC;

    my $entry = $CASESPEC{$code};
    return $entry unless ref $entry;
    return dclone($entry);
}
1212
a452d459 1213=head2 B<namedseq()>
a2bd7410
JH
1214
1215 use Unicode::UCD 'namedseq';
1216
1217 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1218 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1219 my %namedseq = namedseq();
1220
1221If used with a single argument in a scalar context, returns the string
a18e976f 1222consisting of the code points of the named sequence, or C<undef> if no
a2bd7410 1223named sequence by that name exists. If used with a single argument in
956cae9a
KW
1224a list context, it returns the list of the ordinals of the code points. If used
1225with no
a2bd7410
JH
1226arguments in a list context, returns a hash with the names of the
1227named sequences as the keys and the named sequences as strings as
a18e976f 1228the values. Otherwise, it returns C<undef> or an empty list depending
a2bd7410
JH
1229on the context.
1230
a452d459
KW
1231This function only operates on officially approved (not provisional) named
1232sequences.
a2bd7410 1233
27f853a0
KW
1234Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1235sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1236LETTER AINU P")> will return the same string this function does, but will also
1237operate on character names that aren't named sequences, without you having to
1238know which are which. See L<charnames>.
1239
a2bd7410
JH
1240=cut
1241
# Cache filled from Name.pl by _namedseq(): maps the name of each sequence
# read to the string made of its code points.
my %NAMEDSEQ;

# Populate %NAMEDSEQ on first use.  Does nothing when the cache is already
# filled or when Name.pl cannot be opened.
sub _namedseq {
    return if %NAMEDSEQ;
    return unless openunicode(\$NAMEDSEQFH, "Name.pl");

    local $_;
    local $/ = "\n";
    while (<$NAMEDSEQFH>) {

        # Only lines that begin with a hex number followed by a space are
        # of interest.
        next unless /^ [0-9A-F]+ \ /x;
        chomp;

        # The code points and the name are separated by a tab; the code
        # points themselves by white space.
        my ($sequence, $name) = split /\t/;
        my @chars;
        foreach my $hex (split(' ', $sequence)) {
            push @chars, chr(hex($hex));
        }
        $NAMEDSEQ{$name} = join("", @chars);
    }
    close($NAMEDSEQFH);
}
1261
# Look up named sequences.  With no arguments in list context, returns the
# whole name => string hash.  With one argument, returns the sequence's
# string (scalar context) or the list of its code point ordinals (list
# context).  Any other combination returns undef or the empty list,
# depending on context.
sub namedseq {

    # Use charnames::string_vianame() which now returns this information,
    # unless the caller wants the hash returned, in which case we read it in,
    # and thereafter use it instead of calling charnames, as it is faster.

    my $list_context = wantarray();
    return unless defined $list_context;    # Void context

    if ($list_context && @_ == 0) {
        _namedseq() unless %NAMEDSEQ;
        return %NAMEDSEQ;
    }

    return unless @_ == 1;

    my $string = %NAMEDSEQ
                 ? $NAMEDSEQ{ $_[0] }
                 : charnames::string_vianame($_[0]);

    return $string unless $list_context;
    return unless defined $string;
    return map { ord } split(//, $string);
}
1291
7319f91d
KW
# Cache filled by _numeric(): maps a code point to the numeric value read
# for it from the To/Nv.pl table.
my %NUMERIC;

# Populate %NUMERIC from To/Nv.pl, and record in %real_to_rational the
# original "n/d" text of every value that is given as a fraction.  Croaks
# if the Unicode version in use is earlier than 6.0.
sub _numeric {

    # Unicode 6.0 instituted the rule that only digits in a consecutive
    # block of 10 would be considered decimal digits.  Before that, the only
    # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
    # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
    # ONE.  The code could be modified to handle that, but not bothering, as
    # in TUS 6.0, U+19DA was changed to Nt=Di.
    my $version = pack "C*", split /\./, UnicodeVersion();
    if ($version lt 6.0.0) {
        croak __PACKAGE__, "::num requires Unicode 6.0 or greater";
    }

    foreach my $range (_read_table("To/Nv.pl")) {
        my ($start, $end, $value) = @$range;

        # If value contains a slash, convert to decimal, add a reverse hash
        # used by charinfo.
        my @rational = split /\//, $value;
        if (@rational == 2) {
            my ($numerator, $denominator) = @rational;
            my $real = $numerator / $denominator;
            $real_to_rational{$real} = $value;
            $value = $real;
        }

        $NUMERIC{$_} = $value for $start .. $end;
    }

    # Decided unsafe to use these that aren't officially part of the Unicode
    # standard.
    #use Math::Trig;
    #my $pi = acos(-1.0);
    #$NUMERIC{0x03C0} = $pi;

    # Euler's constant, not to be confused with Euler's number
    #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;

    # Euler's number
    #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;

    return;
}
1336
1337=pod
1338
67592e11 1339=head2 B<num()>
7319f91d 1340
eefd7bc2
KW
1341 use Unicode::UCD 'num';
1342
1343 my $val = num("123");
1344 my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
1345
7319f91d
KW
1346C<num> returns the numeric value of the input Unicode string; or C<undef> if it
1347doesn't think the entire string has a completely valid, safe numeric value.
1348
1349If the string is just one character in length, the Unicode numeric value
1350is returned if it has one, or C<undef> otherwise. Note that this need
1351not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
2dc5eb26
KW
1352example returns -0.5.
1353
1354=cut
7319f91d 1355
2dc5eb26
KW
1356#A few characters to which Unicode doesn't officially
1357#assign a numeric value are considered numeric by C<num>.
1358#These are:
1359
1360# EULER CONSTANT 0.5772... (this is NOT Euler's number)
1361# SCRIPT SMALL E 2.71828... (this IS Euler's number)
1362# GREEK SMALL LETTER PI 3.14159...
1363
1364=pod
7319f91d
KW
1365
1366If the string is more than one character, C<undef> is returned unless
8bb4c8e2 1367all its characters are decimal digits (that is, they would match C<\d+>),
7319f91d
KW
1368from the same script. For example if you have an ASCII '0' and a Bengali
1369'3', mixed together, they aren't considered a valid number, and C<undef>
1370is returned. A further restriction is that the digits all have to be of
1371the same form. A half-width digit mixed with a full-width one will
1372return C<undef>. The Arabic script has two sets of digits; C<num> will
1373return C<undef> unless all the digits in the string come from the same
1374set.
1375
1376C<num> errs on the side of safety, and there may be valid strings of
1377decimal digits that it doesn't recognize. Note that Unicode defines
1378a number of "digit" characters that aren't "decimal digit" characters.
a278d14b 1379"Decimal digits" have the property that they have a positional value, i.e.,
7319f91d
KW
1380there is a units position, a 10's position, a 100's, etc, AND they are
1381arranged in Unicode in blocks of 10 contiguous code points. The Chinese
1382digits, for example, are not in such a contiguous block, and so Unicode
1383doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1384match them. A single-character string containing one of these digits will
1385have its decimal value returned by C<num>, but any longer string containing
1386only these digits will return C<undef>.
1387
a278d14b
KW
1388Strings of multiple sub- and superscripts are not recognized as numbers. You
1389can use either of the compatibility decompositions in Unicode::Normalize to
7319f91d
KW
1390change these into digits, and then call C<num> on the result.
1391
1392=cut
1393
1394# To handle sub, superscripts, this could if called in list context,
1395# consider those, and return the <decomposition> type in the second
1396# array element.
1397
# Return the numeric value of the input string, or undef (an empty list in
# list context for most failures) if the string as a whole has no safe
# numeric value.
#
# A one-character string yields whatever %NUMERIC holds for that character.
# A longer string must consist solely of characters matching \d that all
# come from the same run of ten consecutive code points; its value is then
# the decimal number those digits spell.
sub num {
    my $string = $_[0];

    _numeric() unless %NUMERIC;

    # An undefined or empty string has no numeric value.  Bail out here
    # rather than fall through to the arithmetic below, which would operate
    # on undefined values and raise "uninitialized" warnings.
    return unless defined $string && length $string;

    my $length = length($string);
    return $NUMERIC{ord($string)} if $length == 1;
    return if $string =~ /\D/;

    # Subtracting the first digit's value from its code point gives the code
    # point of the zero of that digit's block of ten.
    my $first_ord = ord(substr($string, 0, 1));
    my $value = $NUMERIC{$first_ord};
    my $zero_ord = $first_ord - $value;

    for my $i (1 .. $length - 1) {
        my $ord = ord(substr($string, $i, 1));

        # Every later digit has to lie in the same block of ten as the
        # first one; otherwise the string mixes digit sets.
        my $digit = $ord - $zero_ord;
        return unless $digit >= 0 && $digit <= 9;
        $value = $value * 10 + $digit;
    }
    return $value;
}
1418
7ef25837
KW
1419=pod
1420
1421=head2 B<prop_aliases()>
1422
1423 use Unicode::UCD 'prop_aliases';
1424
1425 my ($short_name, $full_name, @other_names) = prop_aliases("space");
1426 my $same_full_name = prop_aliases("Space"); # Scalar context
1427 my ($same_short_name) = prop_aliases("Space"); # gets 0th element
1428 print "The full name is $full_name\n";
1429 print "The short name is $short_name\n";
1430 print "The other aliases are: ", join(", ", @other_names), "\n";
1431
1432 prints:
1433 The full name is White_Space
1434 The short name is WSpace
1435 The other aliases are: Space
1436
1437Most Unicode properties have several synonymous names. Typically, there is at
1438least a short name, convenient to type, and a long name that more fully
1439describes the property, and hence is more easily understood.
1440
1441If you know one name for a Unicode property, you can use C<prop_aliases> to find
1442either the long name (when called in scalar context), or a list of all of the
1443names, somewhat ordered so that the short name is in the 0th element, the long
1444name in the next element, and any other synonyms are in the remaining
1445elements, in no particular order.
1446
1447The long name is returned in a form nicely capitalized, suitable for printing.
1448
1449The input parameter name is loosely matched, which means that white space,
1450hyphens, and underscores are ignored (except for the trailing underscore in
1451the old-form grandfathered-in C<"L_">, which is better written as C<"LC">, and
1452both of which mean C<General_Category=Cased Letter>).
1453
1454If the name is unknown, C<undef> is returned (or an empty list in list
1455context). Note that Perl typically recognizes property names in regular
1456expressions with an optional C<"Is_"> (with or without the underscore)
1457prefixed to them, such as C<\p{isgc=punct}>. This function does not recognize
1458those in the input, returning C<undef>. Nor are they included in the output
1459as possible synonyms.
1460
1461C<prop_aliases> does know about the Perl extensions to Unicode properties,
1462such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
1463properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>. The
1464final example demonstrates that the C<"Is_"> prefix is recognized for these
1465extensions; it is needed to resolve ambiguities. For example,
1466C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
1467C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>. This is
1468because C<islc> is a Perl extension which is short for
1469C<General_Category=Cased Letter>. The lists returned for the Perl extensions
1470will not include the C<"Is_"> prefix (whether or not the input had it) unless
1471needed to resolve ambiguities, as shown in the C<"islc"> example, where the
1472returned list had one element containing C<"Is_">, and the other without.
1473
1474It is also possible for the reverse to happen: C<prop_aliases('isc')> returns
1475the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
1476C<(C, Other)> (the latter being a Perl extension meaning
ee94c7d1
KW
1477C<General_Category=Other>).
1478L<perluniprops/Properties accessible through Unicode::UCD> lists the available
1479forms, including which ones are discouraged from use.
7ef25837
KW
1480
1481Those discouraged forms are accepted as input to C<prop_aliases>, but are not
1482returned in the lists. C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
1483which are old synonyms for C<"Is_LC"> and should not be used in new code, are
1484examples of this. These both return C<(Is_LC, Cased_Letter)>. Thus this
1485function allows you to take a discouraged form, and find its acceptable
1486alternatives. The same goes with single-form Block property equivalences.
1487Only the forms that begin with C<"In_"> are not discouraged; if you pass
1488C<prop_aliases> a discouraged form, you will get back the equivalent ones that
1489begin with C<"In_">. It will otherwise look like a new-style block name (see
1490L</Old-style versus new-style block names>).
1491
1492C<prop_aliases> does not know about any user-defined properties, and will
1493return C<undef> if called with one of those. Likewise for Perl internal
1494properties, with the exception of "Perl_Decimal_Digit" which it does know
1495about (and which is documented below in L</prop_invmap()>).
1496
1497=cut
1498
1499# It may be that there are use cases where the discouraged forms should be
1500# returned. If that comes up, an optional boolean second parameter to the
1501# function could be created, for example.
1502
# These are created by mktables for this routine and stored in unicore/UCD.pl
# where their structures are described.
our %string_property_loose_to_name;
our %ambiguous_names;
our %loose_perlprop_to_name;
our %prop_aliases;

# Given one name of a property, return its long name (scalar context) or
# the list (short name, long name, other synonyms...) (list context).
# Returns undef / the empty list if the input is undefined or unknown.
sub prop_aliases ($) {
    my $prop = $_[0];
    return unless defined $prop;

    require "unicore/UCD.pl";
    require "unicore/Heavy.pl";
    require "utf8_heavy.pl";

    # The property name may be loosely or strictly matched; we don't know yet.
    # But both types use lower-case.
    $prop = lc $prop;

    # It is loosely matched if its lower case isn't known to be strict.
    my $list_ref;
    if (! exists $utf8::stricter_to_file_of{$prop}) {
        my $loose = utf8::_loose_name($prop);

        # There is a hash that converts from any loose name to its standard
        # form, mapping all synonyms for a name to one name that can be used
        # as a key into another hash.  The whole concept is for memory
        # savings, as the second hash doesn't have to have all the
        # combinations.  Actually, there are two hashes that do the
        # conversion.  One is used in utf8_heavy.pl (stored in Heavy.pl) for
        # looking up properties matchable in regexes.  This function needs to
        # access string properties, which aren't available in regexes, so a
        # second conversion hash is made for them (stored in UCD.pl).  Look in
        # the string one now, as the rest can have an optional 'is' prefix,
        # which these don't.
        if (exists $string_property_loose_to_name{$loose}) {

            # Convert to its standard loose name.
            $prop = $string_property_loose_to_name{$loose};
        }
        else {
            my $retrying = 0;   # bool.  ? Has an initial 'is' been stripped
          RETRY:
            if (exists $utf8::loose_property_name_of{$loose}
                && (! $retrying
                    || ! exists $ambiguous_names{$loose}))
            {
                # Found an entry giving the standard form.  We don't get here
                # (in the test above) when we've stripped off an
                # 'is' and the result is an ambiguous name.  That is because
                # these are official Unicode properties (though Perl can have
                # an optional 'is' prefix meaning the official property), and
                # all ambiguous cases involve a Perl single-form extension
                # for the gc, script, or block properties, and the stripped
                # 'is' means that they mean one of those, and not one of
                # these
                $prop = $utf8::loose_property_name_of{$loose};
            }
            elsif (exists $loose_perlprop_to_name{$loose}) {

                # This hash is specifically for this function to list Perl
                # extensions that aren't in the earlier hashes.  If there is
                # only one element, the short and long names are identical.
                # Otherwise the form is already in the same form as
                # %prop_aliases, which is handled at the end of the function.
                $list_ref = $loose_perlprop_to_name{$loose};
                if (@$list_ref == 1) {
                    my @list = ($list_ref->[0], $list_ref->[0]);
                    $list_ref = \@list;
                }
            }
            elsif (! exists $utf8::loose_to_file_of{$loose}) {

                # loose_to_file_of is a complete list of loose names.  If not
                # there, the input is unknown.
                return;
            }
            else {

                # Here we found the name but not its aliases, so it has to
                # exist.  This means it must be one of the Perl single-form
                # extensions.  First see if it is for a property-value
                # combination in one of the following properties.
                my @list;
                foreach my $property ("gc", "script") {
                    @list = prop_value_aliases($property, $loose);
                    last if @list;
                }
                if (@list) {

                    # Here, it is one of those property-value combination
                    # single-form synonyms.  There are ambiguities with some
                    # of these.  Check against the list for these, and adjust
                    # if necessary.
                    foreach my $alias (@list) {
                        if (exists $ambiguous_names
                                   {utf8::_loose_name(lc $alias)})
                        {
                            # The ambiguity is resolved by toggling whether or
                            # not it has an 'is' prefix
                            $alias =~ s/^Is_// or $alias =~ s/^/Is_/;
                        }
                    }

                    # As at the end of the function, scalar context gets the
                    # full name, which is in element 1; returning the bare
                    # array there would yield its element count instead.
                    return $list[1] unless wantarray;
                    return @list;
                }

                # Here, it wasn't one of the gc or script single-form
                # extensions.  It could be a block property single-form
                # extension.  An 'in' prefix definitely means that, and should
                # be looked up without the prefix.  However, starting in
                # Unicode 6.1, we have to special case 'indic...', as there
                # is a property that begins with that name.   We shouldn't
                # strip the 'in' from that.   I'm (khw) generalizing this to
                # 'indic' instead of the single property, because I suspect
                # that others of this class may come along in the future.
                # However, this could backfire and a block created whose name
                # begins with 'dic...', and we would want to strip the 'in'.
                # At which point this would have to be tweaked.
                my $began_with_in = $loose =~ s/^in(?!dic)//;
                @list = prop_value_aliases("block", $loose);
                if (@list) {
                    $_ =~ s/^/In_/ for @list;

                    # Scalar context gets the full name, as above.
                    return $list[1] unless wantarray;
                    return @list;
                }

                # Here still haven't found it.  The last opportunity for it
                # being valid is only if it began with 'is'.  We retry without
                # the 'is', setting a flag to that effect so that we don't
                # accept things that begin with 'isis...'
                if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
                    $retrying = 1;
                    goto RETRY;
                }

                # Here, didn't find it.  Since it was in %loose_to_file_of, we
                # should have been able to find it.
                carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'. Send bug report to perlbug\@perl.org";
                return;
            }
        }
    }

    if (! $list_ref) {
        # Here, we have set $prop to a standard form name of the input.  Look
        # it up in the structure created by mktables for this purpose, which
        # contains both strict and loosely matched properties.  Avoid
        # autovivifying.
        $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
        return unless $list_ref;
    }

    # The full name is in element 1.
    return $list_ref->[1] unless wantarray;

    # Hand back a copy so the caller can't alter the mktables structure.
    return @{dclone $list_ref};
}
1659
1660=pod
1661
1662=head2 B<prop_value_aliases()>
1663
1664 use Unicode::UCD 'prop_value_aliases';
1665
1666 my ($short_name, $full_name, @other_names)
1667 = prop_value_aliases("Gc", "Punct");
1668 my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt
1669 my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
1670 # element
1671 print "The full name is $full_name\n";
1672 print "The short name is $short_name\n";
1673 print "The other aliases are: ", join(", ", @other_names), "\n";
1674
1675 prints:
1676 The full name is Punctuation
1677 The short name is P
1678 The other aliases are: Punct
1679
1680Some Unicode properties have a restricted set of legal values. For example,
1681all binary properties are restricted to just C<true> or C<false>; and there
1682are only a few dozen possible General Categories.
1683
1684For such properties, there are usually several synonyms for each possible
1685value. For example, in binary properties, I<truth> can be represented by any of
1686the strings "Y", "Yes", "T", or "True"; and the General Category
1687"Punctuation" by that string, or "Punct", or simply "P".
1688
1689Like property names, there is typically at least a short name for each such
1690property-value, and a long name. If you know any name of the property-value,
1691you can use C<prop_value_aliases>() to get the long name (when called in
1692scalar context), or a list of all the names, with the short name in the 0th
1693element, the long name in the next element, and any other synonyms in the
1694remaining elements, in no particular order, except that any all-numeric
1695synonyms will be last.
1696
1697The long name is returned in a form nicely capitalized, suitable for printing.
1698
1699Case, white space, hyphens, and underscores are ignored in the input parameters
1700(except for the trailing underscore in the old-form grandfathered-in general
1701category property value C<"L_">, which is better written as C<"LC">).
1702
If either name is unknown, C<undef> is returned.  Note that Perl typically
recognizes property names in regular expressions with an optional C<"Is_">
(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
This function does not recognize those in the property parameter, returning
C<undef>.
1708
1709If called with a property that doesn't have synonyms for its values, it
1710returns the input value, possibly normalized with capitalization and
1711underscores.
1712
1713For the block property, new-style block names are returned (see
1714L</Old-style versus new-style block names>).
1715
1716To find the synonyms for single-forms, such as C<\p{Any}>, use
1717L</prop_aliases()> instead.
1718
1719C<prop_value_aliases> does not know about any user-defined properties, and
1720will return C<undef> if called with one of those.
1721
1722=cut
1723
# Both of these hashes are generated by mktables and live in unicore/UCD.pl,
# which is also where their layouts are documented.
our %loose_to_standard_value;
our %prop_value_aliases;

sub prop_value_aliases ($$) {
    # Given a property name and one of its values, return the value's
    # aliases: the full name in scalar context; (short name, full name, other
    # synonyms ...) in list context.  Nothing is returned (undef, or the empty
    # list) if either argument is undefined or not recognized.

    my ($prop, $value) = @_;
    return if ! defined $prop || ! defined $value;

    require "unicore/UCD.pl";
    require "utf8_heavy.pl";

    # The hashes used below are keyed by the loose form of the property's 0th
    # alias, so canonicalize the property name to that first.
    my ($key_name) = prop_aliases($prop);
    return unless $key_name;
    my $loose_prop = utf8::_loose_name(lc $key_name);

    # The property is legal, but mktables only records value aliases for
    # properties that have a small closed set of values.  Open-ended ones
    # (most, if not all, string properties) have no synonyms, so the caller
    # just gets the input value back.  For instance, there is no synonym for
    # ('Uppercase_Mapping', 'A').
    return $value unless exists $prop_value_aliases{$loose_prop};

    # Whether the value is matched loosely or strictly isn't known yet, but
    # both kinds are stored in lower case.
    my $lc_value    = lc $value;
    my $loose_value = utf8::_loose_name($lc_value);
    my $loose_pair  = "$loose_prop=$loose_value";

    # Anything that fails to match loosely cannot match strictly either; the
    # same goes for the standard value the loose one translates to.
    return unless exists $loose_to_standard_value{$loose_pair};
    my $canonical = $loose_to_standard_value{$loose_pair};
    return unless exists $prop_value_aliases{$loose_prop}{$canonical};

    # A loose match was found, but the value may be one that is supposed to be
    # matched strictly, and the input may have taken liberties that strict
    # matching forbids.  %prop_value_aliases stores strict names as if they
    # were loose.  No further check is needed when the input as given (only
    # lower-cased) is itself a legal strict name, nor when the squeezed loose
    # form is not a strict name at all.  Strict matching arises only for
    #   1) names beginning with an underscore, which this function doesn't
    #      accept and which %prop_value_aliases doesn't contain; and
    #   2) numeric values, whose squeezed forms are in %stricter_to_file_of.
    # mktables has verified that no strict name under loose rules maps to an
    # existing loose name.
    if (exists $utf8::stricter_to_file_of{$loose_pair}
        && ! exists $utf8::stricter_to_file_of{"$loose_prop=$lc_value"})
    {
        # The one liberty permitted under strict rules is an underscore
        # between a pair of digits.  Remove those (repeating to catch
        # overlapping pairs), and insist on an exact match of what is left.
        1 while $lc_value =~ s/(\d)_(\d)/$1$2/g;
        return
            unless exists $utf8::stricter_to_file_of{"$loose_prop=$lc_value"};
    }

    # The combination is known to exist.  A single-element list means the
    # short and full names are the same.
    my $names              = $prop_value_aliases{$loose_prop}{$canonical};
    my $has_separate_names = @$names > 1;

    if (! wantarray) {

        # Scalar context wants the full name, which is element 1 when there
        # is one, else the sole name.
        return $has_separate_names ? $names->[1] : $names->[0];
    }

    # Hand back a copy so the caller can't alter our tables.
    return @{ dclone($names) } if $has_separate_names;

    # Only 1 element means that it repeats
    return ( $names->[0], $names->[0] );
}
7319f91d 1804
681d705c
KW
1805# All 1 bits is the largest possible UV.
1806$Unicode::UCD::MAX_CP = ~0;
1807
1808=pod
1809
1810=head2 B<prop_invlist()>
1811
1812C<prop_invlist> returns an inversion list (described below) that defines all the
1813code points for the binary Unicode property (or "property=value" pair) given
1814by the input parameter string:
1815
1816 use feature 'say';
1817 use Unicode::UCD 'prop_invlist';
1818 say join ", ", prop_invlist("Any");
1819
1820 prints:
1821 0, 1114112
1822
1823An empty list is returned if the input is unknown; the number of elements in
1824the list is returned if called in scalar context.
1825
1826L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
1827the list of properties that this function accepts, as well as all the possible
1828forms for them (including with the optional "Is_" prefixes). (Except this
1829function doesn't accept any Perl-internal properties, some of which are listed
1830there.) This function uses the same loose or tighter matching rules for
1831resolving the input property's name as is done for regular expressions. These
1832are also specified in L<perluniprops|perluniprops/Properties accessible
1833through \p{} and \P{}>. Examples of using the "property=value" form are:
1834
1835 say join ", ", prop_invlist("Script=Shavian");
1836
1837 prints:
1838 66640, 66688
1839
1840 say join ", ", prop_invlist("ASCII_Hex_Digit=No");
1841
1842 prints:
1843 0, 48, 58, 65, 71, 97, 103
1844
1845 say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
1846
1847 prints:
1848 48, 58, 65, 71, 97, 103
1849
1850Inversion lists are a compact way of specifying Unicode property-value
1851definitions. The 0th item in the list is the lowest code point that has the
1852property-value. The next item (item [1]) is the lowest code point beyond that
1853one that does NOT have the property-value. And the next item beyond that
1854([2]) is the lowest code point beyond that one that does have the
1855property-value, and so on. Put another way, each element in the list gives
1856the beginning of a range that has the property-value (for even numbered
1857elements), or doesn't have the property-value (for odd numbered elements).
1858The name for this data structure stems from the fact that each element in the
1859list toggles (or inverts) whether the corresponding range is or isn't on the
1860list.
1861
1862In the final example above, the first ASCII Hex digit is code point 48, the
1863character "0", and all code points from it through 57 (a "9") are ASCII hex
1864digits. Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
1865are, as are 97 ("a") through 102 ("f"). 103 starts a range of code points
1866that aren't ASCII hex digits. That range extends to infinity, which on your
1867computer can be found in the variable C<$Unicode::UCD::MAX_CP>. (This
1868variable is as close to infinity as Perl can get on your platform, and may be
1869too high for some operations to work; you may wish to use a smaller number for
1870your purposes.)
1871
1872Note that the inversion lists returned by this function can possibly include
1873non-Unicode code points, that is anything above 0x10FFFF. This is in
1874contrast to Perl regular expression matches on those code points, in which a
1875non-Unicode code point always fails to match. For example, both of these have
1876the same result:
1877
 chr(0x110000) =~ /\p{ASCII_Hex_Digit=True}/      # Fails.
 chr(0x110000) =~ /\p{ASCII_Hex_Digit=False}/     # Fails!
1880
And both raise a warning that a Unicode property is being used on a
non-Unicode code point.  It is arguable as to which is the correct thing to do
here.  This function has chosen the way opposite to the Perl regular
expression behavior.  This allows you to easily flip to the Perl regular
expression way (for you to go in the other direction would be far harder).
Simply add 0x110000 at the end of the non-empty returned list if it isn't
already that value; and pop that value if it is; like:
1888
1889 my @list = prop_invlist("foo");
1890 if (@list) {
1891 if ($list[-1] == 0x110000) {
1892 pop @list; # Defeat the turning on for above Unicode
1893 }
1894 else {
1895 push @list, 0x110000; # Turn off for above Unicode
1896 }
1897 }
1898
1899It is a simple matter to expand out an inversion list to a full list of all
1900code points that have the property-value:
1901
1902 my @invlist = prop_invlist($property_name);
1903 die "empty" unless @invlist;
1904 my @full_list;
1905 for (my $i = 0; $i < @invlist; $i += 2) {
1906 my $upper = ($i + 1) < @invlist
1907 ? $invlist[$i+1] - 1 # In range
1908 : $Unicode::UCD::MAX_CP; # To infinity. You may want
1909 # to stop much much earlier;
1910 # going this high may expose
1911 # perl deficiencies with very
1912 # large numbers.
1913 for my $j ($invlist[$i] .. $upper) {
1914 push @full_list, $j;
1915 }
1916 }
1917
C<prop_invlist> does not know about any user-defined nor Perl internal-only
properties, and will return an empty list (C<undef> in scalar context) if
called with one of those.
1920
1921=cut
1922
1923# User-defined properties could be handled with some changes to utf8_heavy.pl;
1924# and implementing here of dealing with EXTRAS. If done, consideration should
1925# be given to the fact that the user subroutine could return different results
1926# with each call; security issues need to be thought about.
1927
# These are created by mktables for this routine and stored in unicore/UCD.pl
# where their structures are described.
our %loose_defaults;
our $MAX_UNICODE_CODEPOINT;

sub prop_invlist ($) {
    # Returns the inversion list for the binary property, or "property=value"
    # pair, named by the single argument.  In scalar context the number of
    # elements in that list is returned.  Nothing (the empty list, or undef in
    # scalar context) is returned if the argument is undefined, is not found,
    # doesn't name a boolean property-value, or names a user-defined or
    # Perl-internal property.

    my $prop = $_[0];
    return if ! defined $prop;

    require "utf8_heavy.pl";

    # Warnings for these are only for regexes, so not applicable to us
    no warnings 'deprecated';

    # Get the swash definition of the property-value.
    my $swash = utf8::SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);

    # Fail if not found, or isn't a boolean property-value, or is a
    # user-defined property, or is internal-only.
    return if ! $swash
              || ref $swash eq ""
              || $swash->{'BITS'} != 1
              || $swash->{'USER_DEFINED'}
              || $prop =~ /^\s*_/;

    if ($swash->{'EXTRAS'}) {
        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
        return;
    }
    if ($swash->{'SPECIALS'}) {
        carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
        return;
    }

    my @invlist;

    # The input lines look like:
    # 0041\t005A   # [26]
    # 005F

    # Split into lines, stripped of trailing comments
    foreach my $range (split "\n",
                            $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
    {
        # And find the beginning and end of the range on the line
        my ($hex_begin, $hex_end) = split "\t", $range;
        my $begin = hex $hex_begin;

        # If the new range merely extends the old, we remove the marker
        # created the last time through the loop for the old's end, which
        # causes the new one's end to be used instead.
        if (@invlist && $begin == $invlist[-1]) {
            pop @invlist;
        }
        else {
            # Add the beginning of the range
            push @invlist, $begin;
        }

        if (defined $hex_end) { # The next item starts with the code point 1
                                # beyond the end of the range.
            push @invlist, hex($hex_end) + 1;
        }
        else {  # No end of range, is a single code point.
            push @invlist, $begin + 1;
        }
    }

    require "unicore/UCD.pl";
    my $FIRST_NON_UNICODE = $MAX_UNICODE_CODEPOINT + 1;

    # Could need to be inverted: add or subtract a 0 at the beginning of the
    # list.  And to keep it from matching non-Unicode, add or subtract the
    # first non-unicode code point.
    if ($swash->{'INVERT_IT'}) {
        if (@invlist && $invlist[0] == 0) {
            shift @invlist;
        }
        else {
            unshift @invlist, 0;
        }
        if (@invlist && $invlist[-1] == $FIRST_NON_UNICODE) {
            pop @invlist;
        }
        else {
            push @invlist, $FIRST_NON_UNICODE;
        }
    }

    # Here, the list is set up to include only Unicode code points.  But, if
    # the table is the default one for the property, it should contain all
    # non-Unicode code points.  First calculate the loose name for the
    # property.  This is done even for strict-name properties, as the data
    # structure that mktables generates for us is set up so that we don't have
    # to worry about that.  The property-value needs to be split if compound,
    # as the loose rules need to be independently calculated on each part.  We
    # know that it is syntactically valid, or SWASHNEW would have failed.

    $prop = lc $prop;
    my ($prop_only, $table) = split /\s*[:=]\s*/, $prop;

    # Test for definedness, not truth: "0" is a legal value (as in
    # "Canonical_Combining_Class=0"), and it must take this branch so that the
    # property part gets converted to its standard name like any other
    # compound form.
    if (defined $table) {

        # May have optional prefixed 'is'
        $prop = utf8::_loose_name($prop_only) =~ s/^is//r;
        $prop = $utf8::loose_property_name_of{$prop};
        $prop .= "=" . utf8::_loose_name($table);
    }
    else {
        $prop = utf8::_loose_name($prop);
    }
    if (exists $loose_defaults{$prop}) {

        # Here, is the default table.  If a range ended with 10ffff, instead
        # continue that range to infinity, by popping the 110000; otherwise,
        # add the range from 110000 to infinity
        if (! @invlist || $invlist[-1] != $FIRST_NON_UNICODE) {
            push @invlist, $FIRST_NON_UNICODE;
        }
        else {
            pop @invlist;
        }
    }

    return @invlist;
}
7319f91d 2053
62b3b855
KW
sub _search_invlist {
    # Find the range in the inversion list which contains a code point; that
    # is, find i such that l[i] <= code_point < l[i+1].  Returns undef if
    # there is no such i, which happens when the list is empty or the code
    # point is below the list's first element.
    #
    # Parameters:
    #   1) a reference to the inversion list, an array of range starts in
    #      ascending order
    #   2) the code point to look up

    # If this is ever made public, could use to speed up .t specials.  Would
    # need to use code point argument, as in other functions in this pm

    my $list_ref = shift;
    my $code_point = shift;
    # Verify non-neg numeric  XXX

    my $max_element = @$list_ref - 1;

    # Undef if list is empty.  (This must not be written with a leading '!',
    # which binds more tightly than '<' and so makes the test always false.)
    return if $max_element < 0;

    # Undef if the code point precedes every range in the list; no element
    # satisfies l[i] <= code_point in that case.
    return if $code_point < $list_ref->[0];

    # Short cut something at the far-end of the table.  This also allows us to
    # refer to element [$i+1] without fear of being out-of-bounds in the loop
    # below.
    return $max_element if $code_point >= $list_ref->[$max_element];

    use integer;        # want integer division

    my $i = $max_element / 2;

    my $lower = 0;
    my $upper = $max_element;
    while (1) {

        if ($code_point >= $list_ref->[$i]) {

            # Here we have met the lower constraint.  We can quit if we
            # also meet the upper one.
            last if $code_point < $list_ref->[$i+1];

            $lower = $i;        # Still too low.

        }
        else {

            # Here, $code_point < $list_ref->[$i], so look lower down.
            $upper = $i;
        }

        # Split search domain in half to try again.
        my $temp = ($upper + $lower) / 2;

        # No point in continuing unless $i changes for next time
        # in the loop.
        return $i if $temp == $i;
        $i = $temp;
    } # End of while loop

    # Here we have found the offset
    return $i;
}
2108
2109=pod
2110
2111=head2 B<prop_invmap()>
2112
2113 use Unicode::UCD 'prop_invmap';
2114 my ($list_ref, $map_ref, $format, $missing)
2115 = prop_invmap("General Category");
2116
2117C<prop_invmap> is used to get the complete mapping definition for a property,
2118in the form of an inversion map. An inversion map consists of two parallel
2119arrays. One is an ordered list of code points that mark range beginnings, and
2120the other gives the value (or mapping) that all code points in the
2121corresponding range have.
2122
2123C<prop_invmap> is called with the name of the desired property. The name is
2124loosely matched, meaning that differences in case, white-space, hyphens, and
2125underscores are not meaningful (except for the trailing underscore in the
2126old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
2127or even better, C<"Gc=LC">).
2128
Many Unicode properties have more than one name (or alias).  C<prop_invmap>
understands all of these, including Perl extensions to them.  Ambiguities are
resolved as described above for L</prop_aliases()>.  The Perl internal
property "Perl_Decimal_Digit", described below, is also accepted.  C<undef> is
returned if the property name is unknown.
See L<perluniprops/Properties accessible through Unicode::UCD> for the
properties acceptable as inputs to this function.
2136
2137It is a fatal error to call this function except in list context.
2138
In addition to the two arrays that form the inversion map, C<prop_invmap>
returns two other values; one is a scalar that gives some details as to the
format of the entries of the map array; the other is used for specialized
purposes, described at the end of this section.
2143
2144This means that C<prop_invmap> returns a 4 element list. For example,
2145
2146 my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
2147 = prop_invmap("Block");
2148
In this call, the two arrays will be populated as shown below (for Unicode
6.0):
2151
2152 Index @blocks_ranges @blocks_maps
2153 0 0x0000 Basic Latin
2154 1 0x0080 Latin-1 Supplement
2155 2 0x0100 Latin Extended-A
2156 3 0x0180 Latin Extended-B
2157 4 0x0250 IPA Extensions
2158 5 0x02B0 Spacing Modifier Letters
2159 6 0x0300 Combining Diacritical Marks
2160 7 0x0370 Greek and Coptic
2161 8 0x0400 Cyrillic
2162 ...
2163 233 0x2B820 No_Block
2164 234 0x2F800 CJK Compatibility Ideographs Supplement
2165 235 0x2FA20 No_Block
2166 236 0xE0000 Tags
2167 237 0xE0080 No_Block
2168 238 0xE0100 Variation Selectors Supplement
2169 239 0xE01F0 No_Block
2170 240 0xF0000 Supplementary Private Use Area-A
2171 241 0x100000 Supplementary Private Use Area-B
2172 242 0x110000 No_Block
2173
The first line (with Index [0]) means that the value for code point 0 is "Basic
Latin".  The entry "0x0080" in the @blocks_ranges column in the second line
means that the value from the first line, "Basic Latin", extends to all code
points in the range from 0 up to but not including 0x0080, that is, through
127.  In other words, the code points from 0 to 127 are all in the "Basic
Latin" block.  Similarly, all code points in the range from 0x0080 up to (but
not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
(Notice that the return is the old-style block names; see L</Old-style versus
new-style block names>).
2183
2184The final line (with Index [242]) means that the value for all code points above
2185the legal Unicode maximum code point have the value "No_Block", which is the
2186term Unicode uses for a non-existing block.
2187
The arrays completely specify the mappings for all possible code points.
The final element in an inversion map returned by this function will always be
for the range that consists of all the code points that aren't legal Unicode,
but that are expressible on the platform.  (That is, it starts with code point
0x110000, the first code point above the legal Unicode maximum, and extends to
infinity.)  The value for that range will be the same that any typical
unassigned code point has for the specified property.  (Certain unassigned
code points are not "typical"; for example the non-character code points, or
those in blocks that are to be written right-to-left.  The above-Unicode
range's value is not based on these atypical code points.)  It could be argued
that, instead of treating these as unassigned Unicode code points, the value
for this range should be C<undef>.  If you wish, you can change the returned
arrays accordingly.
2201
2202The maps are almost always simple scalars that should be interpreted as-is.
2203These values are those given in the Unicode-supplied data files, which may be
2204inconsistent as to capitalization and as to which synonym for a property-value
2205is given. The results may be normalized by using the L</prop_value_aliases()>
2206function.
2207
2208There are exceptions to the simple scalar maps. Some properties have some
2209elements in their map list that are themselves lists of scalars; and some
2210special strings are returned that are not to be interpreted as-is. Element
2211[2] (placed into C<$format> in the example above) of the returned four element
647396da 2212list tells you if the map has any of these special elements or not, as follows:
62b3b855
KW
2213
2214=over
2215
dc8d8ea6 2216=item B<C<s>>
62b3b855
KW
2217
2218means all the elements of the map array are simple scalars, with no special
2219elements. Almost all properties are like this, like the C<block> example
2220above.
2221
dc8d8ea6 2222=item B<C<sl>>
62b3b855 2223
647396da 2224means that some of the map array elements have the form given by C<"s">, and
62b3b855
KW
2225the rest are lists of scalars. For example, here is a portion of the output
2226of calling C<prop_invmap>() with the "Script Extensions" property:
2227
 @scripts_ranges  @scripts_maps
      ...
      0x0953      Devanagari
      0x0964      [ Bengali, Devanagari, Gurmukhi, Oriya ]
      0x0966      Devanagari
      0x0970      Common
2234
647396da
KW
2235Here, the code points 0x964 and 0x965 are both used in Bengali,
2236Devanagari, Gurmukhi, and Oriya, but no other scripts.
62b3b855 2237
647396da 2238The Name_Alias property is also of this form. But each scalar consists of two
58b75e36 2239components: 1) the name, and 2) the type of alias this is. They are
7620cb10
KW
2240separated by a colon and a space. In Unicode 6.1, there are several alias types:
2241
2242=over
2243
2244=item C<correction>
2245
2246indicates that the name is a corrected form for the
2247original name (which remains valid) for the same code point.
2248
2249=item C<control>
2250
2251adds a new name for a control character.
2252
2253=item C<alternate>
2254
2255is an alternate name for a character
2256
2257=item C<figment>
2258
2259is a name for a character that has been documented but was never in any
2260actual standard.
2261
2262=item C<abbreviation>
2263
2264is a common abbreviation for a character
2265
2266=back
2267
2268The lists are ordered (roughly) so the most preferred names come before less
2269preferred ones.
58b75e36
KW
2270
2271For example,
2272
7620cb10
KW
2273 @aliases_ranges @alias_maps
2274 ...
2275 0x009E [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
2276 0x009F [ 'APPLICATION PROGRAM COMMAND: control',
2277 'APC: abbreviation'
2278 ]
2279 0x00A0 'NBSP: abbreviation'
2280 0x00A1 ""
2281 0x00AD 'SHY: abbreviation'
2282 0x00AE ""
2283 0x01A2 'LATIN CAPITAL LETTER GHA: correction'
2284 0x01A3 'LATIN SMALL LETTER GHA: correction'
2285 0x01A4 ""
58b75e36 2286 ...
58b75e36 2287
7620cb10
KW
2288A map to the empty string means that there is no alias defined for the code
2289point.
58b75e36 2290
dc8d8ea6 2291=item B<C<c>>
62b3b855 2292
647396da 2293is like C<"s"> in that all the map array elements are scalars, but here they are
bf7fe2df
KW
2294restricted to all being integers, and each has to be tweaked to get the correct
2295result by adding the code point number to it. For example, in:
62b3b855
KW
2296
2297 my ($uppers_ranges_ref, $uppers_maps_ref, $format)
2298 = prop_invmap("Simple_Uppercase_Mapping");
2299
2300the returned arrays look like this:
2301
2302 @$uppers_ranges_ref @$uppers_maps_ref Note
bf7fe2df
KW
2303 0 0
2304 97 -32 'a' maps to 'A', b => B ...
2305 123 0
2306 181 743 MICRO SIGN => Greek Cap MU
2307 182 0
62b3b855
KW
2308 ...
2309
The first line means that the uppercase of code point 0 is 0+0; the uppercase
of code point 1 is 1+0; ...  of code point 96 is 96+0.  In other words, the
uppercase of each of the first 0..96 code points is itself.  The second line
means that code point 97 maps to 97-32 (=65) or the uppercase of 'a' is 'A';
98 => 98-32 (=66) or the uppercase of 'b' is 'B'; ... 122 => 122-32 (=90) or
the uppercase of 'z' is 'Z'.
2316
2317By requiring adding the code point to the returned result, the arrays are made
647396da 2318significantly smaller, which speeds up searching them.
62b3b855 2319
dc8d8ea6 2320=item B<C<cl>>
62b3b855 2321
647396da 2322means that some of the map array elements have the form given by C<"c">, and
62b3b855
KW
2323the rest are ordered lists of code points.
2324For example, in:
2325
2326 my ($uppers_ranges_ref, $uppers_maps_ref, $format)
2327 = prop_invmap("Uppercase_Mapping");
2328
2329the returned arrays look like this:
2330
2331 @$uppers_ranges_ref @$uppers_maps_ref
bf7fe2df
KW
2332 0 0
2333 97 -32
2334 123 0
2335 181 743
2336 182 0
62b3b855
KW
2337 ...
2338 0x0149 [ 0x02BC 0x004E ]
bf7fe2df
KW
2339 0x014A 0
2340 0x014B -1
62b3b855
KW
2341 ...
2342
This is the full Uppercase_Mapping property (as opposed to the
Simple_Uppercase_Mapping given in the example for format C<"c">).  The only
difference between the two in the ranges shown is that the code point at
0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
CAPITAL LETTER N).
2349
bf7fe2df
KW
2350Yes, there is an inconsistency here. When the map is a single element the
2351correct value must be derived by adding the code point number to it; when the
2352map is a list of code points, they are the final correct values. The reason
2353for forcing the addition is to make the returned map array significantly more
2354compact. There is no such advantage to doing the same thing to the elements
2355that are lists, and the addition is extra work.
2356
b0b13ada
KW
2357=item B<C<ce>>
2358
647396da 2359This is like C<"c">, but some elements are the empty string, so not all are
b0b13ada
KW
2360integers.
2361The one internal Perl property accessible by C<prop_invmap> is of this type:
2362"Perl_Decimal_Digit" returns an inversion map which gives the numeric values
2363that are represented by the Unicode decimal digit characters. Characters that
2364don't represent decimal digits map to the empty string, like so:
2365
   @digits    @values
   0x0000       ""
   0x0030       -48
   0x003A       ""
   0x0660       -1632
   0x066A       ""
   0x06F0       -1776
   0x06FA       ""
   0x07C0       -1984
   0x07CA       ""
   0x0966       -2406
   ...
2378
This means that the code points from 0 to 0x2F do not represent decimal digits;
the code point 0x30 (DIGIT ZERO, =48 decimal) represents 48-48 = 0; code
point 0x31, (DIGIT ONE), represents 49-48 = 1; ... code point 0x39, (DIGIT
NINE), represents 57-48 = 9; ... code points 0x3A through 0x65F do not
represent decimal digits; 0x660 (ARABIC-INDIC DIGIT ZERO, =1632 decimal),
represents 1632-1632 = 0; ... 0x07C1 (NKO DIGIT ONE, = 1985), represents
1985-1984 = 1 ...
2386
dc8d8ea6 2387=item B<C<cle>>
62b3b855 2388
647396da
KW
2389is a combination of the C<"cl"> type and the C<"ce"> type. Some of
2390the map array elements have the forms given by C<"cl">, and
62b3b855
KW
2391the rest are the empty string. The property C<NFKC_Casefold> has this form.
2392An example slice is:
2393
2394 @$ranges_ref @$maps_ref Note
2395 ...
34132297
KW
2396 0x00AA -73 FEMININE ORDINAL INDICATOR => 'a'
2397 0x00AB 0
62b3b855 2398 0x00AD SOFT HYPHEN => ""
34132297 2399 0x00AE 0
62b3b855 2400 0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON
34132297 2401 0x00B0 0
62b3b855
KW
2402 ...
2403
6cc45523
KW
2404=item B<C<r>>
2405
2406means that all the elements of the map array are either rational numbers or
2407the string C<"NaN">, meaning "Not a Number". A rational number is either an
2408integer, or two integers separated by a solidus (C<"/">). The second integer
2409represents the denominator of the division implied by the solidus, and is
2410guaranteed not to be 0. If you want to convert them to scalar numbers, you
2411can use something like this:
2412
2413 my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
2414 if ($format && $format eq "r") {
2415 map { $_ = eval $_ } @$invmap_ref;
2416 }
2417
2418Here's some entries from the output of the property "Nv", which has format
2419C<"r">.
2420
 @numerics_ranges  @numerics_maps       Note
        0x00           "NaN"
        0x30             0           DIGIT 0
        0x31             1
        0x32             2
        ...
        0x37             7
        0x38             8
        0x39             9           DIGIT 9
        0x3A           "NaN"
        0xB2             2           SUPERSCRIPT 2
        0xB3             3           SUPERSCRIPT 3
        0xB4           "NaN"
        0xB9             1           SUPERSCRIPT 1
        0xBA           "NaN"
        0xBC            1/4          VULGAR FRACTION 1/4
        0xBD            1/2          VULGAR FRACTION 1/2
        0xBE            3/4          VULGAR FRACTION 3/4
        0xBF           "NaN"
        0x660            0           ARABIC-INDIC DIGIT ZERO
2441
dc8d8ea6 2442=item B<C<n>>
62b3b855
KW
2443
2444means the Name property. All the elements of the map array are simple
2445scalars, but some of them contain special strings that require more work to
2446get the actual name.
2447
2448Entries such as:
2449
2450 CJK UNIFIED IDEOGRAPH-<code point>
2451
2452mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
2453with the code point (expressed in hexadecimal) appended to it, like "CJK
647396da
KW
2454UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
2455pointE<gt>>>).
62b3b855
KW
2456
2457Also, entries like
2458
2459 <hangul syllable>
2460
2461means that the name is algorithmically calculated. This is easily done by
2462the function L<charnames/charnames::viacode(code)>.
2463
2464Note that for control characters (C<Gc=cc>), Unicode's data files have the
2465string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
7620cb10 2466string. This function returns that real name, the empty string. (There are
647396da
KW
2467names for these characters, but they are considered aliases, not the Name
2468property name, and are contained in the C<Name_Alias> property.)
62b3b855 2469
dc8d8ea6 2470=item B<C<d>>
62b3b855 2471
647396da 2472means the Decomposition_Mapping property. This property is like C<"cl">
bea2c146 2473properties, except that one of the scalar elements is of the form:
62b3b855
KW
2474
2475 <hangul syllable>
2476
bea2c146
KW
2477This signifies that this entry should be replaced by the decompositions for
2478all the code points whose decomposition is algorithmically calculated. (All
647396da 2479of them are currently in one range and likely to remain so; the C<"n"> format
bea2c146 2480has this same entry.) These can be generated via the function
62b3b855
KW
2481L<Unicode::Normalize::NFD()|Unicode::Normalize>.
2482
62b3b855
KW
2483Note that the mapping is the one that is specified in the Unicode data files,
2484and to get the final decomposition, it may need to be applied recursively.
2485
2486=back
2487
2488A binary search can be used to quickly find a code point in the inversion
2489list, and hence its corresponding mapping.
2490
2491The final element (index [3], assigned to C<$default> in the "block" example) in
2492the four element list returned by this function may be useful for applications
2493that wish to convert the returned inversion map data structure into some
2494other, such as a hash. It gives the mapping that most code points map to
2495under the property. If you establish the convention that any code point not
2496explicitly listed in your data structure maps to this value, you can
2497potentially make your data structure much smaller. As you construct your data
2498structure from the one returned by this function, simply ignore those ranges
2499that map to this value, generally called the "default" value. For example, to
2500convert to the data structure searchable by L</charinrange()>, you can follow
2501this recipe:
2502
2503 my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
2504 my @range_list;
2505 for my $i (0 .. @$list_ref - 2) {
2506 next if $map_ref->[$i] eq $missing;
2507 push @range_list, [ $list_ref->[$i],
2508 $list_ref->[$i+1],
2509 $map_ref->[$i]
2510 ];
2511 }
2512
2513 print charinrange(\@range_list, $code_point), "\n";
2514
2515
2516With this, C<charinrange()> will return C<undef> if its input code point maps
2517to C<$missing>. You can avoid this by omitting the C<next> statement, and adding
2518a line after the loop to handle the final element of the inversion map.
2519
62b3b855
KW
2520
2521Note that the inversion maps returned for the C<Case_Folding> and
2522C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
2523Use L</casefold()> for these.
2524
62b3b855
KW
2525C<prop_invmap> does not know about any user-defined properties, and will
2526return C<undef> if called with one of those.
2527
2528=cut
2529
2530# User-defined properties could be handled with some changes to utf8_heavy.pl;
2531# if done, consideration should be given to the fact that the user subroutine
2532# could return different results with each call, which could lead to some
2533# security issues.
2534
2535# One could store things in memory so they don't have to be recalculated, but
2536# it is unlikely this will be called often, and some properties would take up
2537# significant memory.
2538
# These are created by mktables for this routine and stored in unicore/UCD.pl
# where their structures are described.
our @algorithmic_named_code_points;
our $HANGUL_BEGIN;
our $HANGUL_COUNT;

# prop_invmap($property_name)
#
# Builds an inversion map for the given property.  Must be called in list
# context (croaks otherwise).
#
# Returns a four element list:
#   [0] reference to the inversion list (range starts, ascending)
#   [1] reference to the parallel array of what each range maps to
#   [2] a format code describing the map entries
#   [3] the default ("missing") value for code points not otherwise listed
#
# Returns the empty list if the property name is undefined, begins with an
# underscore (internal), is not known, or if the swash found for it
# unexpectedly carries 'EXTRAS' (in which case a warning is also issued).
sub prop_invmap ($) {

    croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;

    my $prop = $_[0];
    return unless defined $prop;

    # Fail internal properties
    return if $prop =~ /^_/;

    # The values returned by this function.
    my (@invlist, @invmap, $format, $missing);

    # The swash has two components we look at, the base list, and a hash,
    # named 'SPECIALS', containing any additional members whose mappings don't
    # fit into the base list scheme of things.  These generally 'override'
    # any value in the base list for the same code point.
    my $overrides;

    require "utf8_heavy.pl";
    require "unicore/UCD.pl";

RETRY:

    # If there are multiple entries for a single code point
    my $has_multiples = 0;

    # Try to get the map swash for the property.  They have 'To' prepended to
    # the property name, and 32 means we will accept 32 bit return values.
    # The 0 means we aren't calling this from tr///.
    my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);

    # If didn't find it, could be because needs a proxy.  And if was the
    # 'Block' or 'Name' property, use a proxy even if did find it.  Finding it
    # in these cases would be the result of the installation changing mktables
    # to output the Block or Name tables.  The Block table gives block names
    # in the new-style, and this routine is supposed to return old-style block
    # names.  The Name table is valid, but we need to execute the special code
    # below to add in the algorithmic-defined name entries.
    # And NFKCCF needs conversion, so handle that here too.
    if (ref $swash eq ""
        || $swash->{'TYPE'} =~ / ^ To (?: Blk | Na | NFKCCF ) $ /x)
    {

        # Get the short name of the input property, in standard form
        my ($second_try) = prop_aliases($prop);
        return unless $second_try;
        $second_try = utf8::_loose_name(lc $second_try);

        if ($second_try eq "in") {

            # This property is identical to age for inversion map purposes
            $prop = "age";
            goto RETRY;
        }
        elsif ($second_try =~ / ^ s ( cf | [ltu] c ) $ /x) {

            # These properties use just the LIST part of the full mapping,
            # which includes the simple maps that are otherwise overridden by
            # the SPECIALS.  So all we need do is to not look at the SPECIALS;
            # set $overrides to indicate that
            $overrides = -1;

            # The full name is the simple name stripped of its initial 's'
            $prop = $second_try =~ s/^s//r;
            goto RETRY;
        }
        elsif ($second_try eq "blk") {

            # We use the old block names.  Just create a fake swash from its
            # data.
            _charblocks();
            my %blocks;
            $blocks{'LIST'} = "";
            $blocks{'TYPE'} = "ToBlk";
            $utf8::SwashInfo{ToBlk}{'missing'} = "No_Block";
            $utf8::SwashInfo{ToBlk}{'format'} = "s";

            foreach my $block (@BLOCKS) {
                $blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
                                           $block->[0],
                                           $block->[1],
                                           $block->[2];
            }
            $swash = \%blocks;
        }
        elsif ($second_try eq "na") {

            # Use the combo file that has all the Name-type properties in it,
            # extracting just the ones that are for the actual 'Name'
            # property.  And create a fake swash from it.
            my %names;
            $names{'LIST'} = "";
            my $original = do "unicore/Name.pl";
            my $algorithm_names = \@algorithmic_named_code_points;

            # We need to remove the names from it that are aliases.  For that
            # we need to also read in that table.  Create a hash with the keys
            # being the code points, and the values being a list of the
            # aliases for the code point key.
            #
            # The '&' form of the call is deliberate: this sub's own prototype
            # is not yet usable while its body is being compiled.
            my ($aliases_code_points, $aliases_maps, undef, undef) =
                                                &prop_invmap('Name_Alias');
            my %aliases;
            for (my $i = 0; $i < @$aliases_code_points; $i++) {
                my $code_point = $aliases_code_points->[$i];
                $aliases{$code_point} = $aliases_maps->[$i];

                # If not already a list, make it into one, so that later we
                # can treat things uniformly
                if (! ref $aliases{$code_point}) {
                    $aliases{$code_point} = [ $aliases{$code_point} ];
                }

                # Remove the alias type from the entry, retaining just the
                # name.
                s/:.*// for @{$aliases{$code_point}};
            }

            my $i = 0;
            foreach my $line (split "\n", $original) {
                my ($hex_code_point, $name) = split "\t", $line;

                # Weeds out all comments, blank lines, and named sequences.
                # (An empty line splits into an empty list, leaving the code
                # point undefined.)
                next if ! defined $hex_code_point
                        || $hex_code_point =~ /\P{ASCII_HEX_DIGIT}/;

                my $code_point = hex $hex_code_point;

                # The name of all controls is the default: the empty string.
                # The set of controls is immutable, so these hard-coded
                # constants work.
                next if $code_point <= 0x9F
                        && ($code_point <= 0x1F || $code_point >= 0x7F);

                # If this is a name_alias, it isn't a name
                next if grep { $_ eq $name } @{$aliases{$code_point}};

                # If we are beyond where one of the special lines needs to
                # be inserted ...
                while ($i < @$algorithm_names
                    && $code_point > $algorithm_names->[$i]->{'low'})
                {

                    # ... then insert it, ahead of what we were about to
                    # output
                    $names{'LIST'} .= sprintf "%x\t%x\t%s\n",
                                            $algorithm_names->[$i]->{'low'},
                                            $algorithm_names->[$i]->{'high'},
                                            $algorithm_names->[$i]->{'name'};

                    # Done with this range.
                    $i++;

                    # We loop until all special lines that precede the next
                    # regular one are output.
                }

                # Here, is a normal name.
                $names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
            } # End of loop through all the names

            $names{'TYPE'} = "ToNa";
            $utf8::SwashInfo{ToNa}{'missing'} = "";
            $utf8::SwashInfo{ToNa}{'format'} = "n";
            $swash = \%names;
        }
        elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {

            # The file is a combination of dt and dm properties.  Create a
            # fake swash from the portion that we want.
            my $original = do "unicore/Decomposition.pl";
            my %decomps;

            if ($second_try eq 'dt') {
                $decomps{'TYPE'} = "ToDt";
                $utf8::SwashInfo{'ToDt'}{'missing'} = "None";
                $utf8::SwashInfo{'ToDt'}{'format'} = "s";
            }
            else {
                $decomps{'TYPE'} = "ToDm";
                $utf8::SwashInfo{'ToDm'}{'missing'} = "0";

                # Use a special internal-to-this_routine format, 'dm', to
                # distinguish from 'd', meaning decimal.
                $utf8::SwashInfo{'ToDm'}{'format'} = "dm";
            }

            $decomps{'LIST'} = "";

            # This property has one special range not in the file: for the
            # hangul syllables
            my $done_hangul = 0;    # Have we done the hangul range.
            foreach my $line (split "\n", $original) {
                my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
                my $code_point = hex $hex_lower;
                my $value;
                my $redo = 0;

                # The type, enclosed in <...>, precedes the mapping separated
                # by blanks
                if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
                    $value = ($second_try eq 'dt') ? $1 : $2
                }
                else {  # If there is no type specified, it's canonical
                    $value = ($second_try eq 'dt')
                             ? "Canonical" :
                             $type_and_map;
                }
                if ($second_try eq 'dm') {
                    my @map = map { hex } split " ", $value;

                    if (@map == 1) {

                        # Single character maps are converted to deltas, as
                        # this file is stored, for backwards compatibility,
                        # not using them.
                        $value = $map[0] - $code_point;

                        # If this is a multi-char range, process the rest of
                        # it by doing a 'redo' after this line is done.  Fix
                        # up the line to contain the rest of the range for
                        # that redo.
                        if ($hex_upper ne "" && hex $hex_upper != $code_point) {
                            $line = sprintf("%04X\t%s\t%s",
                                             $code_point + 1,
                                             $hex_upper,
                                             $type_and_map);
                            $redo = 1;

                            # Pretend that this is a single element range.
                            $hex_upper = $hex_lower;
                        }
                    }
                    else {
                        $value = join " ", @map;
                    }
                }

                # Insert the hangul range at the appropriate spot.
                if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
                    $done_hangul = 1;
                    $decomps{'LIST'} .=
                                sprintf "%x\t%x\t%s\n",
                                        $HANGUL_BEGIN,
                                        $HANGUL_BEGIN + $HANGUL_COUNT - 1,
                                        ($second_try eq 'dt')
                                        ? "Canonical"
                                        : "<hangul syllable>";
                }

                # And append this to our constructed LIST.
                $decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";

                redo if $redo;
            }
            $swash = \%decomps;
        }
        elsif ($second_try eq 'nfkccf') {

            # This property is stored in the old format for backwards
            # compatibility for any applications that read its file directly.
            # So here we convert it to delta format for compatibility with the
            # other properties similar to it.
            my %nfkccf;

            # Create a new LIST with deltas instead of code points.
            my $list = "";
            foreach my $range (split "\n", $swash->{'LIST'}) {
                my ($hex_begin, $hex_end, $map) = split "\t", $range;
                my $begin = hex $hex_begin;
                my $end = (defined $hex_end && $hex_end ne "")
                        ? hex $hex_end
                        : $begin;
                my $decimal_map = hex $map;
                foreach my $code_point ($begin .. $end) {
                    $list .= sprintf("%04X\t\t%d\n", $code_point, $decimal_map - $code_point);
                }
            }

            $nfkccf{'LIST'} = $list;
            $nfkccf{'TYPE'} = "ToNFKCCF";
            $nfkccf{'SPECIALS'} = $swash->{'SPECIALS'};
            $swash = \%nfkccf;
            $utf8::SwashInfo{'ToNFKCCF'}{'missing'} = 0;
            $utf8::SwashInfo{'ToNFKCCF'}{'format'} = 'i';
        }
        else {  # Don't know this property. Fail.
            return;
        }
    }

    if ($swash->{'EXTRAS'}) {
        carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
        return;
    }

    # Here, have a valid swash return.  Examine it.
    my $returned_prop = $swash->{'TYPE'};

    # All properties but binary ones should have 'missing' and 'format'
    # entries
    $missing = $utf8::SwashInfo{$returned_prop}{'missing'};
    $missing = 'N' unless defined $missing;

    $format = $utf8::SwashInfo{$returned_prop}{'format'};
    $format = 'b' unless defined $format;

    # The LIST input lines look like:
    # ...
    # 0374\t\tCommon
    # 0375\t0377\tGreek   # [3]
    # 037A\t037D\tGreek   # [4]
    # 037E\t\tCommon
    # 0384\t\tGreek
    # ...
    #
    # Convert them to like
    # 0374 => Common
    # 0375 => Greek
    # 0378 => $missing
    # 037A => Greek
    # 037E => Common
    # 037F => $missing
    # 0384 => Greek
    #
    # For binary properties, the final non-comment column is absent, and
    # assumed to be 'Y'.

    foreach my $range (split "\n", $swash->{'LIST'}) {
        $range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments

        # Find the beginning and end of the range on the line
        my ($hex_begin, $hex_end, $map) = split "\t", $range;
        my $begin = hex $hex_begin;
        my $end = (defined $hex_end && $hex_end ne "")
                  ? hex $hex_end
                  : $begin;

        # Each time through the loop (after the first):
        # $invlist[-2] contains the beginning of the previous range processed
        # $invlist[-1] contains the end+1 of the previous range processed
        # $invmap[-2] contains the value of the previous range processed
        # $invmap[-1] contains the default value for missing ranges ($missing)
        #
        # Thus, things are set up for the typical case of a new non-adjacent
        # range of non-missings to be added.  But, if the new range is
        # adjacent, it needs to replace the [-1] element; and if the new
        # range is a multiple value of the previous one, it needs to be added
        # to the [-2] map element.

        # The first time through, everything will be empty.  If the property
        # doesn't have a range that begins at 0, add one that maps to $missing
        if (! @invlist) {
            if ($begin != 0) {
                push @invlist, 0;
                push @invmap, $missing;
            }
        }
        elsif (@invlist > 1 && $invlist[-2] == $begin) {

            # Here we handle the case where the input has multiple entries for
            # each code point.  mktables should have made sure that each such
            # range contains only one code point.  At this point, $invlist[-1]
            # is the $missing that was added at the end of the last loop
            # iteration, and [-2] is the last real input code point, and that
            # code point is the same as the one we are adding now, making the
            # new one a multiple entry.  Add it to the existing entry, either
            # by pushing it to the existing list of multiple entries, or
            # converting the single current entry into a list with both on it.
            # This is all we need do for this iteration.

            if ($end != $begin) {
                croak __PACKAGE__, "::prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
            }
            if (! ref $invmap[-2]) {
                $invmap[-2] = [ $invmap[-2], $map ];
            }
            else {
                push @{$invmap[-2]}, $map;
            }
            $has_multiples = 1;
            next;
        }
        elsif ($invlist[-1] == $begin) {

            # If the input isn't in the most compact form, so that there are
            # two adjacent ranges that map to the same thing, they should be
            # combined.  This happens in our constructed dt mapping.
            # Element [-2] is the map for the latest range so far processed.
            # Just set the beginning point of the map to $missing (in
            # invlist[-1]) to 1 beyond where this range ends.  For example, in
            # 12\t13\tXYZ
            # 14\t17\tXYZ
            # we have set it up so that it looks like
            # 12 => XYZ
            # 14 => $missing
            #
            # We now see that it should be
            # 12 => XYZ
            # 18 => $missing
            if (@invlist > 1 && ( (defined $map)
                                  ? $invmap[-2] eq $map
                                  : $invmap[-2] eq 'Y'))
            {
                $invlist[-1] = $end + 1;
                next;
            }

            # Here, the range started in the previous iteration that maps to
            # $missing starts at the same code point as this range.  That
            # means there is no gap to fill that that range was intended for,
            # so we just pop it off the parallel arrays.
            pop @invlist;
            pop @invmap;
        }

        # Add the range beginning, and the range's map.
        push @invlist, $begin;
        if ($format eq 'dm') {

            # The decomposition maps are either a line like <hangul syllable>
            # which are to be taken as is; or a sequence of code points
            # separated by blanks (already converted to decimal when the fake
            # swash was built above).  If there is more than one, use an
            # anonymous array as the map.
            if ($map =~ /^ < /x) {
                push @invmap, $map;
            }
            else {
                my @map = split " ", $map;
                if (@map == 1) {
                    push @invmap, $map[0];
                }
                else {
                    push @invmap, \@map;
                }
            }
        }
        else {

            # Otherwise, convert hex formatted list entries to decimal; add a
            # 'Y' map for the missing value in binary properties, or
            # otherwise, use the input map unchanged.
            $map = ($format eq 'x')
                   ? hex $map
                   : $format eq 'b'
                     ? 'Y'
                     : $map;
            push @invmap, $map;
        }

        # We just started a range.  It ends with $end.  The gap between it and
        # the next element in the list must be filled with a range that maps
        # to the default value.  If there is no gap, the next iteration will
        # pop this, unless there is no next iteration, and we have filled all
        # of the Unicode code space, so check for that and skip.
        if ($end < $MAX_UNICODE_CODEPOINT) {
            push @invlist, $end + 1;
            push @invmap, $missing;
        }
    }

    # If the property is empty, make all code points use the value for missing
    # ones.
    if (! @invlist) {
        push @invlist, 0;
        push @invmap, $missing;
    }

    # And add in standard element that all non-Unicode code points map to:
    # $missing
    push @invlist, $MAX_UNICODE_CODEPOINT + 1;
    push @invmap, $missing;

    # The second component of the map are those values that require
    # non-standard specification, stored in SPECIALS.  These override any
    # duplicate code points in LIST.  If we are using a proxy, we may have
    # already set $overrides based on the proxy.
    $overrides = $swash->{'SPECIALS'} unless defined $overrides;
    if ($overrides) {

        # A negative $overrides implies that the SPECIALS should be ignored,
        # and a simple 'c' list is the value.
        if ($overrides < 0) {
            $format = 'c';
        }
        else {

            # Currently, all overrides are for properties that normally map to
            # single code points, but now some will map to lists of code
            # points (but there is an exception case handled below).
            $format = 'cl';

            # Look through the overrides.
            foreach my $cp_maybe_utf8 (keys %$overrides) {
                my $cp;
                my @map;

                # If the overrides came from SPECIALS, the code point keys are
                # packed UTF-8.
                if ($overrides == $swash->{'SPECIALS'}) {
                    $cp = unpack("C0U", $cp_maybe_utf8);
                    @map = unpack "U0U*", $swash->{'SPECIALS'}{$cp_maybe_utf8};

                    # The empty string will show up unpacked as an empty
                    # array.
                    $format = 'cle' if @map == 0;
                }
                else {

                    # But if we generated the overrides, we didn't bother to
                    # pack them, and we, so far, do this only for properties
                    # that are 'c' ones.
                    $cp = $cp_maybe_utf8;
                    @map = hex $overrides->{$cp};
                    $format = 'c';
                }

                # Find the range that the override applies to.
                my $i = _search_invlist(\@invlist, $cp);
                if ($cp < $invlist[$i] || $cp >= $invlist[$i + 1]) {
                    croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
                }

                # And what that range currently maps to
                my $cur_map = $invmap[$i];

                # If there is a gap between the next range and the code point
                # we are overriding, we have to add elements to both arrays to
                # fill that gap, using the map that applies to it, which is
                # $cur_map, since it is part of the current range.
                if ($invlist[$i + 1] > $cp + 1) {
                    splice @invlist, $i + 1, 0, $cp + 1;
                    splice @invmap, $i + 1, 0, $cur_map;
                }

                # If the remaining portion of the range is multiple code
                # points (ending with the one we are replacing, guaranteed by
                # the earlier splice), we must split it into two.  The map
                # spliced in here is only a placeholder; it is overwritten
                # just below.
                if ($invlist[$i] < $cp) {
                    $i++;   # Compensate for the new element

                    splice @invlist, $i, 0, $cp;
                    splice @invmap, $i, 0, 'dummy';
                }

                # Here, the range we are overriding contains a single code
                # point.  The result could be the empty string, a single
                # value, or a list.  If the last case, we use an anonymous
                # array.
                $invmap[$i] = (scalar @map == 0)
                               ? ""
                               : (scalar @map > 1)
                                  ? \@map
                                  : $map[0];
            }
        }
    }
    elsif ($format eq 'x') {

        # All hex-valued properties are really to code points, and have been
        # converted to decimal.
        $format = 'i';
    }
    elsif ($format eq 'dm') {
        $format = 'd';
    }
    elsif ($format eq 'sw') { # blank-separated elements to form a list.
        for (@invmap) {
            $_ = [ split " ", $_ ] if $_ =~ / /;
        }
        $format = 'sl';
    }
    elsif ($returned_prop eq 'ToNameAlias') {

        # This property currently doesn't have any lists, but theoretically
        # could
        $format = 'sl';
    }
    elsif ($returned_prop eq 'ToPerlDecimalDigit') {
        $format = 'ce';
    }
    elsif ($format ne 'n' && $format ne 'r') {

        # All others are simple scalars
        $format = 's';
    }
    if ($has_multiples &&  $format !~ /l/) {
        croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
    }

    return (\@invlist, \@invmap, $format, $missing);
}
3165
55d7b906 3166=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 3167
a452d459
KW
3168This returns the version of the Unicode Character Database, in other words, the
3169version of the Unicode standard the database implements. The version is a
3170string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
3171
3172=cut
3173
# Cache for UnicodeVersion(); set only after the version string has been
# read and validated.
my $UNICODEVERSION;

# Returns the Unicode version string read from the first line of the
# "version" file, which is opened through openunicode().  The result is
# cached, so the file is read at most once per successful call.
#
# Croaks if the line is not of the form digits(.digits)+ .  The value is
# validated before it is cached, so that a malformed version is reported on
# every call instead of being silently returned on the second one.
sub UnicodeVersion {
    return $UNICODEVERSION if defined $UNICODEVERSION;

    openunicode(\$VERSIONFH, "version");

    # Read exactly one newline-terminated line no matter what the caller has
    # set the input record separator to.
    local $/ = "\n";
    my $version = <$VERSIONFH>;
    close($VERSIONFH);

    # An empty file yields undef; treat it as an (invalid) empty version so
    # the croak below is reached without "uninitialized" warnings.
    $version = "" unless defined $version;
    chomp $version;

    croak __PACKAGE__, "::VERSION: strange version '$version'"
        unless $version =~ /^\d+(?:\.\d+)+$/;

    $UNICODEVERSION = $version;
    return $UNICODEVERSION;
}
3aa957f9 3187
a452d459
KW
3188=head2 B<Blocks versus Scripts>
3189
3190The difference between a block and a script is that scripts are closer
3191to the linguistic notion of a set of code points required to present
3192languages, while block is more of an artifact of the Unicode code point
3193numbering and separation into blocks of (mostly) 256 code points.
3194
3195For example the Latin B<script> is spread over several B<blocks>, such
3196as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
3197C<Latin Extended-B>. On the other hand, the Latin script does not
3198contain all the characters of the C<Basic Latin> block (also known as
3199ASCII): it includes only the letters, and not, for example, the digits
3200or the punctuation.
3201
3202For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
3203
3204For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
3205
3206=head2 B<Matching Scripts and Blocks>
3207
3208Scripts are matched with the regular-expression construct
3209C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
f200dd12 3210while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
a452d459
KW
3211any of the 256 code points in the Tibetan block).
3212
430fe03d
KW
3213=head2 Old-style versus new-style block names
3214
3215Unicode publishes the names of blocks in two different styles, though the two
3216are equivalent under Unicode's loose matching rules.
3217
3218The original style uses blanks and hyphens in the block names (except for
3219C<No_Block>), like so:
3220
3221 Miscellaneous Mathematical Symbols-B
3222
3223The newer style replaces these with underscores, like this:
3224
3225 Miscellaneous_Mathematical_Symbols_B
3226
3227This newer style is consistent with the values of other Unicode properties.
3228To preserve backward compatibility, all the functions in Unicode::UCD that
3229return block names (except one) return the old-style ones. That one function,
3230L</prop_value_aliases()> can be used to convert from old-style to new-style:
3231
3232 my $new_style = prop_value_aliases("block", $old_style);
3233
3234Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>,
3235meaning C<Block=Cyrillic>. These have always been written in the new style.
3236
3237To convert from new-style to old-style, follow this recipe:
3238
3239 $old_style = charblock((prop_invlist("block=$new_style"))[0]);
3240
3241(which finds the range of code points in the block using C<prop_invlist>,
3242gets the lower end of the range (0th element) and then looks up the old name
3243for its block using C<charblock>).
3244
7620cb10
KW
3245Note that starting in Unicode 6.1, many of the block names have shorter
3246synonyms. These are always given in the new style.
3247
8b731da2
JH
3248=head1 BUGS
3249
3250Does not yet support EBCDIC platforms.
3251
561c79ed
JH
3252=head1 AUTHOR
3253
a18e976f 3254Jarkko Hietaniemi. Now maintained by perl5 porters.
561c79ed
JH
3255
3256=cut
3257
32581;