This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Add %loose_defaults to UCD.pl
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
55d7b906 1package Unicode::UCD;
561c79ed
JH
2
3use strict;
4use warnings;
36c2430c 5no warnings 'surrogate'; # surrogates can be inputs to this
98ef7649 6use charnames ();
94c91ffc 7use Unicode::Normalize qw(getCombinClass NFD);
561c79ed 8
7ef25837 9our $VERSION = '0.37';
561c79ed 10
741297c1
JH
11use Storable qw(dclone);
12
561c79ed
JH
13require Exporter;
14
15our @ISA = qw(Exporter);
74f8133e 16
10a6ecd2
JH
17our @EXPORT_OK = qw(charinfo
18 charblock charscript
19 charblocks charscripts
b08cd201 20 charinrange
ea508aee 21 general_categories bidi_types
b08cd201 22 compexcl
a2bd7410 23 casefold casespec
7319f91d
KW
24 namedseq
25 num
7ef25837
KW
26 prop_aliases
27 prop_value_aliases
7319f91d 28 );
561c79ed
JH
29
30use Carp;
31
32=head1 NAME
33
55d7b906 34Unicode::UCD - Unicode character database
561c79ed
JH
35
36=head1 SYNOPSIS
37
55d7b906 38 use Unicode::UCD 'charinfo';
b08cd201 39 my $charinfo = charinfo($codepoint);
561c79ed 40
956cae9a
KW
41 use Unicode::UCD 'casefold';
42 my $casefold = casefold(0xFB00);
43
5d8e6e41
KW
44 use Unicode::UCD 'casespec';
45 my $casespec = casespec(0xFB00);
46
55d7b906 47 use Unicode::UCD 'charblock';
e882dd67
JH
48 my $charblock = charblock($codepoint);
49
55d7b906 50 use Unicode::UCD 'charscript';
65044554 51 my $charscript = charscript($codepoint);
561c79ed 52
55d7b906 53 use Unicode::UCD 'charblocks';
e145285f
JH
54 my $charblocks = charblocks();
55
55d7b906 56 use Unicode::UCD 'charscripts';
ea508aee 57 my $charscripts = charscripts();
e145285f 58
55d7b906 59 use Unicode::UCD qw(charscript charinrange);
e145285f
JH
60 my $range = charscript($script);
61 print "looks like $script\n" if charinrange($range, $codepoint);
62
ea508aee
JH
63 use Unicode::UCD qw(general_categories bidi_types);
64 my $categories = general_categories();
65 my $types = bidi_types();
66
7ef25837
KW
67 use Unicode::UCD 'prop_aliases';
68 my @space_names = prop_aliases("space");
69
70 use Unicode::UCD 'prop_value_aliases';
71 my @gc_punct_names = prop_value_aliases("Gc", "Punct");
72
55d7b906 73 use Unicode::UCD 'compexcl';
e145285f
JH
74 my $compexcl = compexcl($codepoint);
75
a2bd7410
JH
76 use Unicode::UCD 'namedseq';
77 my $namedseq = namedseq($named_sequence_name);
78
55d7b906 79 my $unicode_version = Unicode::UCD::UnicodeVersion();
e145285f 80
7319f91d 81 my $convert_to_numeric =
62a8c8c2 82 Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
7319f91d 83
561c79ed
JH
84=head1 DESCRIPTION
85
a452d459
KW
86The Unicode::UCD module offers a series of functions that
87provide a simple interface to the Unicode
8b731da2 88Character Database.
561c79ed 89
a452d459
KW
90=head2 code point argument
91
92Some of the functions are called with a I<code point argument>, which is either
93a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
94followed by hexadecimals designating a Unicode code point. In other words, if
95you want a code point to be interpreted as a hexadecimal number, you must
96prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
f200dd12
KW
97interpreted as a decimal code point. Note that the largest code point in
98Unicode is U+10FFFF.
561c79ed
JH
99=cut
100
10a6ecd2 101my $BLOCKSFH;
10a6ecd2 102my $VERSIONFH;
b08cd201
JH
103my $CASEFOLDFH;
104my $CASESPECFH;
a2bd7410 105my $NAMEDSEQFH;
561c79ed
JH
106
# Locate and open a file that lives in a "unicore" directory somewhere on
# @INC.
#
#   $rfh   reference to the scalar that is to hold the filehandle
#   @path  path components of the wanted file, relative to "unicore"
#
# Returns the full name of the file that was opened.  If $$rfh is already
# defined nothing is searched for or opened, and undef is returned.
# Croaks if the file cannot be opened read-only under any @INC entry.
sub openunicode {
    my ($rfh, @path) = @_;
    my $f;

    use File::Spec;

    unless (defined $$rfh) {
        for my $d (@INC) {
            $f = File::Spec->catfile($d, "unicore", @path);

            # Three-argument open: the file name is taken literally, so
            # nothing in an @INC entry or in @path can be read as a mode
            # or a pipe.
            last if open($$rfh, '<', $f);
            undef $f;
        }

        unless (defined $f) {

            # open() vivifies the handle scalar even when it fails.  Reset
            # it so that a later call searches again instead of taking
            # the "already open" shortcut and returning undef.
            undef $$rfh;
            croak __PACKAGE__, ": failed to find ",
                  File::Spec->catfile(@path), " in @INC";
        }
    }

    return $f;
}
123
a452d459 124=head2 B<charinfo()>
561c79ed 125
55d7b906 126 use Unicode::UCD 'charinfo';
561c79ed 127
b08cd201 128 my $charinfo = charinfo(0x41);
561c79ed 129
a452d459
KW
130This returns information about the input L</code point argument>
131as a reference to a hash of fields as defined by the Unicode
132standard. If the L</code point argument> is not assigned in the standard
133(i.e., has the general category C<Cn> meaning C<Unassigned>)
134or is a non-character (meaning it is guaranteed to never be assigned in
135the standard),
a18e976f 136C<undef> is returned.
a452d459
KW
137
138Fields that aren't applicable to the particular code point argument exist in the
139returned hash, and are empty.
140
141The keys in the hash with the meanings of their values are:
142
143=over
144
145=item B<code>
146
147the input L</code point argument> expressed in hexadecimal, with leading zeros
148added if necessary to make it contain at least four hexdigits
149
150=item B<name>
151
152name of I<code>, all IN UPPER CASE.
153Some control-type code points do not have names.
154This field will be empty for C<Surrogate> and C<Private Use> code points,
155and for the others without a name,
156it will contain a description enclosed in angle brackets, like
157C<E<lt>controlE<gt>>.
158
159
160=item B<category>
161
162The short name of the general category of I<code>.
163This will match one of the keys in the hash returned by L</general_categories()>.
164
7ef25837
KW
165The L</prop_value_aliases()> function can be used to get all the synonyms
166of the category name.
167
a452d459
KW
168=item B<combining>
169
170the combining class number for I<code> used in the Canonical Ordering Algorithm.
171For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
172available at
173L<http://www.unicode.org/versions/Unicode5.1.0/>
174
7ef25837
KW
175The L</prop_value_aliases()> function can be used to get all the synonyms
176of the combining class number.
177
a452d459
KW
178=item B<bidi>
179
180bidirectional type of I<code>.
181This will match one of the keys in the hash returned by L</bidi_types()>.
182
7ef25837
KW
183The L</prop_value_aliases()> function can be used to get all the synonyms
184of the bidi type name.
185
a452d459
KW
186=item B<decomposition>
187
188is empty if I<code> has no decomposition; or is one or more codes
a18e976f 189(separated by spaces) that, taken in order, represent a decomposition for
a452d459
KW
190I<code>. Each has at least four hexdigits.
191The codes may be preceded by a word enclosed in angle brackets then a space,
192like C<E<lt>compatE<gt> >, giving the type of decomposition
193
06bba7d5
KW
194This decomposition may be an intermediate one whose components are also
195decomposable. Use L<Unicode::Normalize> to get the final decomposition.
196
a452d459
KW
197=item B<decimal>
198
199if I<code> is a decimal digit this is its integer numeric value
200
201=item B<digit>
202
89e4a205
KW
203if I<code> represents some other digit-like number, this is its integer
204numeric value
a452d459
KW
205
206=item B<numeric>
207
208if I<code> represents a whole or rational number, this is its numeric value.
209Rational values are expressed as a string like C<1/4>.
210
211=item B<mirrored>
212
213C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
214
215=item B<unicode10>
216
217name of I<code> in the Unicode 1.0 standard if one
218existed for this code point and is different from the current name
219
220=item B<comment>
221
89e4a205 222As of Unicode 6.0, this is always empty.
a452d459
KW
223
224=item B<upper>
225
06bba7d5 226is empty if there is no single code point uppercase mapping for I<code>
4f66642e 227(its uppercase mapping is itself);
a452d459
KW
228otherwise it is that mapping expressed as at least four hexdigits.
229(L</casespec()> should be used in addition to B<charinfo()>
230for case mappings when the calling program can cope with multiple code point
231mappings.)
232
233=item B<lower>
234
06bba7d5 235is empty if there is no single code point lowercase mapping for I<code>
4f66642e 236(its lowercase mapping is itself);
a452d459
KW
237otherwise it is that mapping expressed as at least four hexdigits.
238(L</casespec()> should be used in addition to B<charinfo()>
239for case mappings when the calling program can cope with multiple code point
240mappings.)
241
242=item B<title>
243
06bba7d5 244is empty if there is no single code point titlecase mapping for I<code>
4f66642e 245(its titlecase mapping is itself);
a452d459
KW
246otherwise it is that mapping expressed as at least four hexdigits.
247(L</casespec()> should be used in addition to B<charinfo()>
248for case mappings when the calling program can cope with multiple code point
249mappings.)
250
251=item B<block>
252
a18e976f 253the block I<code> belongs to (used in C<\p{Blk=...}>).
a452d459
KW
254See L</Blocks versus Scripts>.
255
256
257=item B<script>
258
a18e976f 259the script I<code> belongs to.
a452d459
KW
260See L</Blocks versus Scripts>.
261
262=back
32c16050
JH
263
Note that you cannot do (de)composition and casing based solely on the
I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
you will also need the L</compexcl()> and L</casespec()> functions.
561c79ed
JH
267
268=cut
269
e10d7780 270# NB: This function is nearly duplicated in charnames.pm
10a6ecd2
JH
# Convert a "code point argument" into its numeric value.
#
# A string of decimal digits that does not start with 0 is returned
# unchanged.  Otherwise a string of hex digits, optionally prefixed by
# "U+", "u+", "0x" or "0X", is converted with hex().  Anything else
# yields an empty return (undef in scalar context).
sub _getcode {
    my ($input) = @_;

    return $input if $input =~ /^[1-9]\d*$/;

    return hex($1) if $input =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;

    return;
}
282
05dbc6f8
KW
283# Populated by _num. Converts real number back to input rational
284my %real_to_rational;
285
286# To store the contents of files found on disk.
287my @BIDIS;
288my @CATEGORIES;
289my @DECOMPOSITIONS;
290my @NUMERIC_TYPES;
5c3b35c9
KW
291my %SIMPLE_LOWER;
292my %SIMPLE_TITLE;
293my %SIMPLE_UPPER;
294my %UNICODE_1_NAMES;
05dbc6f8
KW
295
sub _charinfo_case {

    # Computes the value for one of the case fields ('upper', 'lower',
    # 'title') of the charinfo structure.
    #   $char     is the character itself
    #   $cased    is $char after the case change was applied to it
    #   $file     names the file under unicore/To/ holding the simple
    #             mappings for this case
    #   $hash_ref refers to the hash that caches the contents of $file; it
    #             is filled in here the first time it is needed
    #
    # The caller passes in the result of 'uc', 'lc' or 'ucfirst', so the
    # table only has to be loaded when that result is longer than one
    # character.  In that case it differs from the simple mapping, which is
    # what charinfo() reports.
    my ($char, $cased, $file, $hash_ref) = @_;

    # An unchanged character yields an empty field.
    if ($cased eq $char) {
        return "";
    }

    # A single-character result is itself the answer.
    if (length($cased) == 1) {
        return sprintf("%04X", ord $cased);
    }

    # Multi-character result: look the character up in the table instead.
    unless (%$hash_ref) {
        %$hash_ref = _read_table("unicore/To/$file", 'use_hash');
    }

    my $simple = $hash_ref->{ord $char};
    return defined $simple ? $simple : "";
}
319
sub charinfo {

    # Historically this function mirrored UnicodeData.txt, warts and all.
    # It is written so that UnicodeData.txt is not needed (and can be
    # removed to save disk space): each field is assembled from another
    # source, namely charnames for the name and various mktables tables for
    # the rest.
    #
    # Takes a code point argument.  Returns a reference to a hash of
    # fields, or nothing if the code point is above 0x10FFFF or has the
    # general category 'Cn'.  Croaks if the argument is not a code point.

    use feature 'unicode_strings';

    my ($arg) = @_;
    my $code = _getcode($arg);
    if (! defined $code) {
        croak __PACKAGE__, "::charinfo: unknown code '$arg'";
    }

    # Non-Unicode code points have no entry.
    return if $code > 0x10FFFF;

    my $char = chr($code);
    my %info;

    # General category; unassigned code points have no entry either.
    unless (@CATEGORIES) {
        @CATEGORIES = _read_table("unicore/To/Gc.pl");
    }
    my $category = _search(\@CATEGORIES, 0, $#CATEGORIES, $code);
    $category = $utf8::SwashInfo{'ToGc'}{'missing'} unless defined $category;
    $info{'category'} = $category;
    return if $category eq 'Cn';

    $info{'code'} = sprintf "%04X", $code;

    if ($char =~ /\p{Cntrl}/) {
        $info{'name'} = '<control>';
    }
    else {
        my $name = charnames::viacode($code);
        $info{'name'} = defined $name ? $name : "";
    }

    $info{'combining'} = getCombinClass($code);

    unless (@BIDIS) {
        @BIDIS = _read_table("unicore/To/Bc.pl");
    }
    my $bidi = _search(\@BIDIS, 0, $#BIDIS, $code);
    $bidi = $utf8::SwashInfo{'ToBc'}{'missing'} unless defined $bidi;
    $info{'bidi'} = $bidi;

    # "unicore/Decomposition.pl" holds exactly what should be output for
    # most code points, but it lacks the Hangul syllable decompositions.
    # Those can be computed algorithmically, which NFD() does, so NFD() is
    # used for them.  NFD() cannot be used for everything: it decomposes
    # recursively, whereas this function has always returned the
    # non-recursive form found in UnicodeData.txt.  The NFD() of the
    # Hanguls has no such recursion issue.
    # No decomposition means an empty field.  Otherwise every type except
    # "Canonical" implies a compatible decomposition, with the type
    # prefixed to it as in UnicodeData.txt.
    if ($char =~ /\p{Block=Hangul_Syllables}/) {

        # Standard Unicode hex format, blank separated.
        my @pieces = map { sprintf("%04X", $_) } unpack "U*", NFD($char);
        $info{'decomposition'} = join " ", @pieces;
    }
    else {
        unless (@DECOMPOSITIONS) {
            @DECOMPOSITIONS = _read_table("unicore/Decomposition.pl");
        }
        my $decomposition = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
                                    $code);
        $info{'decomposition'} = defined $decomposition ? $decomposition : "";
    }

    # num() supplies the numeric value, if there is one.
    my $value = num($char);
    if (! defined $value) {
        $info{'decimal'} = $info{'digit'} = $info{'numeric'} = "";
    }
    elsif ($char =~ /\d/) {
        $info{'decimal'} = $info{'digit'} = $info{'numeric'} = $value;
    }
    else {

        # For anything that is not a decimal digit the Numeric type has to
        # be read to tell the cases apart.  Integer versus rational is not
        # enough, because some whole-number values are not considered
        # digits, e.g., TAMIL NUMBER TEN.
        $info{'decimal'} = "";

        unless (@NUMERIC_TYPES) {
            @NUMERIC_TYPES = _read_table("unicore/To/Nt.pl");
        }
        my $numeric_type = _search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code);
        $numeric_type = "" unless defined $numeric_type;

        if ($numeric_type eq 'Digit') {
            $info{'digit'} = $info{'numeric'} = $value;
        }
        else {
            $info{'digit'} = "";
            my $rational = $real_to_rational{$value};
            $info{'numeric'} = defined $rational ? $rational : $value;
        }
    }

    if ($char =~ /\p{Bidi_Mirrored}/) {
        $info{'mirrored'} = 'Y';
    }
    else {
        $info{'mirrored'} = 'N';
    }

    unless (%UNICODE_1_NAMES) {
        %UNICODE_1_NAMES = _read_table("unicore/To/Na1.pl", "use_hash");
    }
    my $old_name = $UNICODE_1_NAMES{$code};
    $info{'unicode10'} = defined $old_name ? $old_name : "";

    # Always empty starting with Unicode 6.0.  num() also requires 6.0, so
    # the version does not need to be tested again here.
    $info{'comment'} = "";

    $info{'upper'} = _charinfo_case($char, uc $char, '_suc.pl',
                                    \%SIMPLE_UPPER);
    $info{'lower'} = _charinfo_case($char, lc $char, '_slc.pl',
                                    \%SIMPLE_LOWER);
    $info{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
                                    \%SIMPLE_TITLE);

    $info{block}  = charblock($code);
    $info{script} = charscript($code);

    return \%info;
}
430
e882dd67
JH
# Binary search in a [[lo,hi,prop],[...],...] table.
#
#   $table     reference to the table, whose ranges are in ascending order
#   $lo, $hi   first and last index of the part of the table to look at
#   $code      the code point being looked for
#
# Returns element [2] of the range containing $code, or nothing when no
# range between $lo and $hi contains it.
sub _search {
    my ($table, $lo, $hi, $code) = @_;

    while ($lo <= $hi) {
        my $mid = int(($lo + $hi) / 2);

        if ($table->[$mid]->[0] > $code) {

            # The wanted range, if any, is below this one.
            $hi = $mid - 1;
        }
        elsif ($table->[$mid]->[0] == $code
               || $table->[$mid]->[1] >= $code)
        {
            return $table->[$mid]->[2];
        }
        else {

            # This range ends before $code; look above it.
            $lo = $mid + 1;
        }
    }

    return;
}
450
sub _read_table ($;$) {

    # Loads the mktables generated table file named by the first parameter
    # and returns its contents, as a hash if the optional second parameter
    # is true, otherwise as an array of arrays.
    #
    # Hash return: every key is a code point and its value is what the
    # table gives for that code point.  This is good for fast lookup in
    # tables made up only of single code point ranges.
    #
    # Array return: each inner array describes one range, with [0] the
    # first code point of the range, [1] the last one, and [2] the value
    # shared by every code point in it.  This takes much less memory when
    # there are large ranges.
    #
    # As a side effect of loading the table,
    #   $utf8::SwashInfo{$property}{'format'} becomes the mktables format
    #       of the table; and
    #   $utf8::SwashInfo{$property}{'missing'} becomes the value of all
    #       entries not listed in the table,
    # where $property is the Unicode property name, preceded by 'To' for
    # map properties, e.g., 'ToSc'.
    #
    # Table entries look like one of:
    # 0000	0040	Common	# [65]
    # 00AA		Latin

    my ($table, $return_hash) = @_;
    $return_hash = 0 if ! defined $return_hash;

    my @ranges;
    my %value_of;
    local $_;

    foreach my $line (split /^/m, do $table) {
        my ($start, $end, $value) = $line =~ / ^ (.+?) \t (.*?) \t (.+?)
                                               \s* ( \# .* )?  # Optional comment
                                               $ /x;

        # An empty end field denotes a range of a single code point.
        my $first = hex $start;
        my $last  = ($end eq "") ? $first : hex $end;

        if ($return_hash) {
            $value_of{$_} = $value for $first .. $last;
            next;
        }

        if (@ranges
            && $ranges[-1][1] == $first - 1
            && $ranges[-1][2] eq $value)
        {

            # Adjacent to the previous range and with the same value, so
            # just lengthen that one.
            $ranges[-1]->[1] = $last;
            next;
        }

        push @ranges, [ $first, $last, $value ];
    }

    return ($return_hash) ? %value_of : @ranges;
}
507
10a6ecd2
JH
# Look up a code point argument in a range set.
#
#   $range  reference to a list of [start, end, ...] ranges
#   $arg    the code point argument to look for
#
# Returns whatever _search() finds for the code point.  Croaks when $arg
# is not a code point argument.
sub charinrange {
    my ($range, $arg) = @_;

    my $code = _getcode($arg);
    if (! defined $code) {
        croak __PACKAGE__, "::charinrange: unknown code '$arg'";
    }

    return _search($range, 0, $#{$range}, $code);
}
515
a452d459 516=head2 B<charblock()>
561c79ed 517
55d7b906 518 use Unicode::UCD 'charblock';
561c79ed
JH
519
520 my $charblock = charblock(0x41);
10a6ecd2 521 my $charblock = charblock(1234);
a452d459 522 my $charblock = charblock(0x263a);
10a6ecd2
JH
523 my $charblock = charblock("U+263a");
524
78bf21c2 525 my $range = charblock('Armenian');
10a6ecd2 526
a452d459
KW
527With a L</code point argument> charblock() returns the I<block> the code point
528belongs to, e.g. C<Basic Latin>.
529If the code point is unassigned, this returns the block it would belong to if
a18e976f 530it were assigned.
10a6ecd2 531
78bf21c2
JH
532See also L</Blocks versus Scripts>.
533
18972f4b 534If supplied with an argument that can't be a code point, charblock() tries to
a18e976f
KW
535do the opposite and interpret the argument as a block name. The return value
536is a I<range set> with one range: an anonymous list with a single element that
537consists of another anonymous list whose first element is the first code point
538in the block, and whose second (and final) element is the final code point in
539the block. (The extra list consisting of just one element is so that the same
540program logic can be used to handle both this return, and the return from
541L</charscript()> which can have multiple ranges.) You can test whether a code
542point is in a range using the L</charinrange()> function. If the argument is
543not a known block, C<undef> is returned.
561c79ed 544
561c79ed
JH
545=cut
546
547my @BLOCKS;
10a6ecd2 548my %BLOCKS;
561c79ed 549
sub _charblocks {

    # Fills in @BLOCKS (the ranges in file order) and %BLOCKS (the same
    # ranges, grouped by block name) from Blocks.txt.  Does nothing if
    # @BLOCKS already has something in it.
    #
    # The mktables table can't be read for this because it loses the
    # hyphens in the original.
    return if @BLOCKS;
    return unless openunicode(\$BLOCKSFH, "Blocks.txt");

    while (my $line = <$BLOCKSFH>) {

        # Wanted lines look like: first..last; name
        next unless $line =~ /^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/;

        my $name     = $3;
        my $subrange = [ hex($1), hex($2), $name ];

        # The very same array is put into both structures.
        push @BLOCKS, $subrange;
        push @{ $BLOCKS{$name} }, $subrange;
    }

    close($BLOCKSFH);
}
569
# With a code point argument, returns the name of the block containing
# it, or 'No_Block' when no range in the block table contains it.
#
# With any other argument, treats it as a block name and returns a deep
# copy of the ranges recorded for that block, so the caller cannot alter
# the cached data.  Returns undef (an empty list in list context) when
# the name is not a known block.
sub charblock {
    my $arg = shift;

    _charblocks() unless @BLOCKS;

    my $code = _getcode($arg);

    if (defined $code) {
        my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
        return $result if defined $result;
        return 'No_Block';
    }
    elsif (exists $BLOCKS{$arg}) {
        return dclone $BLOCKS{$arg};
    }

    # Explicit, so that an unknown block name gives the documented undef
    # rather than whatever the failed 'exists' test above evaluated to.
    return;
}
586
a452d459 587=head2 B<charscript()>
e882dd67 588
55d7b906 589 use Unicode::UCD 'charscript';
e882dd67
JH
590
591 my $charscript = charscript(0x41);
10a6ecd2
JH
592 my $charscript = charscript(1234);
593 my $charscript = charscript("U+263a");
e882dd67 594
78bf21c2 595 my $range = charscript('Thai');
10a6ecd2 596
a452d459
KW
597With a L</code point argument> charscript() returns the I<script> the
598code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
bb2d29dc 599If the code point is unassigned, it returns C<"Unknown">.
78bf21c2 600
eb0cc9e3 601If supplied with an argument that can't be a code point, charscript() tries
a18e976f
KW
602to do the opposite and interpret the argument as a script name. The
603return value is a I<range set>: an anonymous list of lists that contain
eb0cc9e3 604I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
a18e976f
KW
605code point is in a range set using the L</charinrange()> function. If the
606argument is not a known script, C<undef> is returned.
a452d459
KW
607
608See also L</Blocks versus Scripts>.
e882dd67 609
e882dd67
JH
610=cut
611
612my @SCRIPTS;
10a6ecd2 613my %SCRIPTS;
e882dd67 614
sub _charscripts {

    # Loads the script table into @SCRIPTS, unless that is already
    # populated, and then files each of its ranges in %SCRIPTS under the
    # name of its script.
    unless (@SCRIPTS) {
        @SCRIPTS = _read_table("unicore/To/Sc.pl");
    }

    for my $range (@SCRIPTS) {

        # Preserve old-style casing: lowercase whatever follows an
        # underscore.  This changes the name inside @SCRIPTS as well.
        $range->[2] =~ s/(_\w)/\L$1/g;

        my $script = $range->[2];
        push @{ $SCRIPTS{$script} }, $range;
    }
}
622
# With a code point argument, returns the name of the script containing
# it, falling back to the script table's 'missing' value when no range
# contains it.
#
# With any other argument, treats it as a script name and returns a deep
# copy of the ranges recorded for that script, or undef (an empty list in
# list context) when the name is not a known script.
sub charscript {
    my ($arg) = @_;

    unless (@SCRIPTS) {
        _charscripts();
    }

    my $code = _getcode($arg);

    if (! defined $code) {

        # Not a code point, so it can only be a script name.
        return dclone($SCRIPTS{$arg}) if exists $SCRIPTS{$arg};
        return;
    }

    my $script = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
    return defined $script ? $script : $utf8::SwashInfo{'ToSc'}{'missing'};
}
640
a452d459 641=head2 B<charblocks()>
10a6ecd2 642
55d7b906 643 use Unicode::UCD 'charblocks';
10a6ecd2 644
b08cd201 645 my $charblocks = charblocks();
10a6ecd2 646
b08cd201 647charblocks() returns a reference to a hash with the known block names
a452d459 648as the keys, and the code point ranges (see L</charblock()>) as the values.
10a6ecd2 649
78bf21c2
JH
650See also L</Blocks versus Scripts>.
651
10a6ecd2
JH
652=cut
653
# Returns a reference to a deep copy of the hash that maps each block name
# to its ranges, loading the block data first if that hash is still empty.
# Handing out a copy keeps callers from changing the cached data.
sub charblocks {
    if (! %BLOCKS) {
        _charblocks();
    }

    return dclone(\%BLOCKS);
}
658
a452d459 659=head2 B<charscripts()>
10a6ecd2 660
55d7b906 661 use Unicode::UCD 'charscripts';
10a6ecd2 662
ea508aee 663 my $charscripts = charscripts();
10a6ecd2 664
ea508aee 665charscripts() returns a reference to a hash with the known script
a452d459 666names as the keys, and the code point ranges (see L</charscript()>) as
ea508aee 667the values.
10a6ecd2 668
78bf21c2
JH
669See also L</Blocks versus Scripts>.
670
10a6ecd2
JH
671=cut
672
# Returns a reference to a deep copy of the hash that maps each script name
# to its ranges, loading the script data first if that hash is still empty.
# Handing out a copy keeps callers from changing the cached data.
sub charscripts {
    if (! %SCRIPTS) {
        _charscripts();
    }

    return dclone(\%SCRIPTS);
}
677
a452d459 678=head2 B<charinrange()>
10a6ecd2 679
f200dd12 680In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
10a6ecd2 681can also test whether a code point is in the I<range> as returned by
a452d459
KW
682L</charblock()> and L</charscript()> or as the values of the hash returned
683by L</charblocks()> and L</charscripts()> by using charinrange():
10a6ecd2 684
55d7b906 685 use Unicode::UCD qw(charscript charinrange);
10a6ecd2
JH
686
687 $range = charscript('Hiragana');
e145285f 688 print "looks like hiragana\n" if charinrange($range, $codepoint);
10a6ecd2
JH
689
690=cut
691
ea508aee
JH
# Short general category names and the long name of each, grouped by
# major class.
my %GENERAL_CATEGORIES = (

    # Letters
    L  => 'Letter',
    LC => 'CasedLetter',
    Lu => 'UppercaseLetter',
    Ll => 'LowercaseLetter',
    Lt => 'TitlecaseLetter',
    Lm => 'ModifierLetter',
    Lo => 'OtherLetter',

    # Marks
    M  => 'Mark',
    Mn => 'NonspacingMark',
    Mc => 'SpacingMark',
    Me => 'EnclosingMark',

    # Numbers
    N  => 'Number',
    Nd => 'DecimalNumber',
    Nl => 'LetterNumber',
    No => 'OtherNumber',

    # Punctuation
    P  => 'Punctuation',
    Pc => 'ConnectorPunctuation',
    Pd => 'DashPunctuation',
    Ps => 'OpenPunctuation',
    Pe => 'ClosePunctuation',
    Pi => 'InitialPunctuation',
    Pf => 'FinalPunctuation',
    Po => 'OtherPunctuation',

    # Symbols
    S  => 'Symbol',
    Sm => 'MathSymbol',
    Sc => 'CurrencySymbol',
    Sk => 'ModifierSymbol',
    So => 'OtherSymbol',

    # Separators
    Z  => 'Separator',
    Zs => 'SpaceSeparator',
    Zl => 'LineSeparator',
    Zp => 'ParagraphSeparator',

    # Others
    C  => 'Other',
    Cc => 'Control',
    Cf => 'Format',
    Cs => 'Surrogate',
    Co => 'PrivateUse',
    Cn => 'Unassigned',
);

# Returns a reference to a deep copy of the table above, so that callers
# cannot change the original.
sub general_categories {
    return dclone(\%GENERAL_CATEGORIES);
}
737
a452d459 738=head2 B<general_categories()>
ea508aee
JH
739
740 use Unicode::UCD 'general_categories';
741
742 my $categories = general_categories();
743
a452d459 744This returns a reference to a hash which has short
ea508aee
JH
745general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
746names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
747C<Symbol>) as values. The hash is reversible in case you need to go
748from the long names to the short names. The general category is the
a452d459
KW
749one returned from
750L</charinfo()> under the C<category> key.
ea508aee 751
7ef25837
KW
752The L</prop_value_aliases()> function can be used to get all the synonyms of
753the category name.
754
ea508aee
JH
755=cut
756
757my %BIDI_TYPES =
758 (
759 'L' => 'Left-to-Right',
760 'LRE' => 'Left-to-Right Embedding',
761 'LRO' => 'Left-to-Right Override',
762 'R' => 'Right-to-Left',
763 'AL' => 'Right-to-Left Arabic',
764 'RLE' => 'Right-to-Left Embedding',
765 'RLO' => 'Right-to-Left Override',
766 'PDF' => 'Pop Directional Format',
767 'EN' => 'European Number',
768 'ES' => 'European Number Separator',
769 'ET' => 'European Number Terminator',
770 'AN' => 'Arabic Number',
771 'CS' => 'Common Number Separator',
772 'NSM' => 'Non-Spacing Mark',
773 'BN' => 'Boundary Neutral',
774 'B' => 'Paragraph Separator',
775 'S' => 'Segment Separator',
776 'WS' => 'Whitespace',
777 'ON' => 'Other Neutrals',
778 );
779
a452d459 780=head2 B<bidi_types()>
ea508aee
JH
781
782 use Unicode::UCD 'bidi_types';
783
784 my $categories = bidi_types();
785
a452d459 786This returns a reference to a hash which has the short
ea508aee
JH
787bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
788names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
789hash is reversible in case you need to go from the long names to the
a452d459
KW
790short names. The bidi type is the one returned from
791L</charinfo()>
ea508aee
JH
792under the C<bidi> key. For the exact meaning of the various bidi classes
793the Unicode TR9 is recommended reading:
a452d459 794L<http://www.unicode.org/reports/tr9/>
ea508aee
JH
795(as of Unicode 5.0.0)
796
7ef25837
KW
797The L</prop_value_aliases()> function can be used to get all the synonyms of
798the bidi type name.
799
ea508aee
JH
800=cut
801
a452d459
KW
# Returns a reference to a deep copy of %BIDI_TYPES, so that callers
# cannot change the original table.
sub bidi_types {
    my $copy = dclone(\%BIDI_TYPES);
    return $copy;
}
805
806=head2 B<compexcl()>
b08cd201 807
55d7b906 808 use Unicode::UCD 'compexcl';
b08cd201 809
a452d459 810 my $compexcl = compexcl(0x09dc);
b08cd201 811
71a442a8
KW
812This routine is included for backwards compatibility, but as of Perl 5.12, for
813most purposes it is probably more convenient to use one of the following
814instead:
815
    my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex}/;
    my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion}/;
818
819or even
820
    my $compexcl = chr(0x09dc) =~ /\p{CE}/;
    my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion}/;
823
824The first two forms return B<true> if the L</code point argument> should not
76b05678
KW
825be produced by composition normalization. For the final two forms to return
826B<true>, it is additionally required that this fact not otherwise be
827determinable from the Unicode data base.
71a442a8
KW
828
829This routine behaves identically to the final two forms. That is,
830it does not return B<true> if the code point has a decomposition
a452d459
KW
831consisting of another single code point, nor if its decomposition starts
832with a code point whose combining class is non-zero. Code points that meet
833either of these conditions should also not be produced by composition
71a442a8
KW
834normalization, which is probably why you should use the
835C<Full_Composition_Exclusion> property instead, as shown above.
b08cd201 836
71a442a8 837The routine returns B<false> otherwise.
b08cd201
JH
838
839=cut
840
b08cd201
JH
# Returns the result of matching the character for the given code point
# argument against \p{Composition_Exclusion}.  Croaks when the argument
# is not a code point argument.
sub compexcl {
    my ($arg) = @_;

    my $code = _getcode($arg);
    if (! defined $code) {
        croak __PACKAGE__, "::compexcl: unknown code '$arg'";
    }

    no warnings "non_unicode";     # So works on non-Unicode code points
    my $char = chr($code);
    return $char =~ /\p{Composition_Exclusion}/;
}
850
a452d459 851=head2 B<casefold()>
b08cd201 852
55d7b906 853 use Unicode::UCD 'casefold';
b08cd201 854
a452d459
KW
855 my $casefold = casefold(0xDF);
856 if (defined $casefold) {
857 my @full_fold_hex = split / /, $casefold->{'full'};
858 my $full_fold_string =
859 join "", map {chr(hex($_))} @full_fold_hex;
860 my @turkic_fold_hex =
861 split / /, ($casefold->{'turkic'} ne "")
862 ? $casefold->{'turkic'}
863 : $casefold->{'full'};
864 my $turkic_fold_string =
865 join "", map {chr(hex($_))} @turkic_fold_hex;
866 }
867 if (defined $casefold && $casefold->{'simple'} ne "") {
868 my $simple_fold_hex = $casefold->{'simple'};
869 my $simple_fold_string = chr(hex($simple_fold_hex));
870 }
b08cd201 871
a452d459
KW
872This returns the (almost) locale-independent case folding of the
873character specified by the L</code point argument>.
b08cd201 874
a18e976f 875If there is no case folding for that code point, C<undef> is returned.
a452d459
KW
876
877If there is a case folding for that code point, a reference to a hash
b08cd201
JH
878with the following fields is returned:
879
a452d459
KW
880=over
881
882=item B<code>
883
884the input L</code point argument> expressed in hexadecimal, with leading zeros
885added if necessary to make it contain at least four hexdigits
886
887=item B<full>
888
a18e976f 889one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
890code points for the case folding for I<code>.
891Each has at least four hexdigits.
892
893=item B<simple>
894
895is empty, or is exactly one code with at least four hexdigits which can be used
896as an alternative case folding when the calling program cannot cope with the
897fold being a sequence of multiple code points. If I<full> is just one code
898point, then I<simple> equals I<full>. If there is no single code point folding
899defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
900inferior, but still better-than-nothing alternative folding to I<full>.
901
902=item B<mapping>
903
904is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
905otherwise. It can be considered to be the simplest possible folding for
906I<code>. It is defined primarily for backwards compatibility.
907
908=item B<status>
b08cd201 909
a452d459
KW
910is C<C> (for C<common>) if the best possible fold is a single code point
911(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
912folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
a18e976f
KW
913there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
914Note that this
a452d459
KW
915describes the contents of I<mapping>. It is defined primarily for backwards
916compatibility.
b08cd201 917
a452d459
KW
918On versions 3.1 and earlier of Unicode, I<status> can also be
919C<I> which is the same as C<C> but is a special case for dotted uppercase I and
920dotless lowercase i:
b08cd201 921
a452d459 922=over
b08cd201 923
a18e976f 924=item B<*> If you use this C<I> mapping
a452d459 925
a18e976f 926the result is case-insensitive,
a452d459
KW
927but dotless and dotted I's are not distinguished
928
a18e976f 929=item B<*> If you exclude this C<I> mapping
a452d459 930
a18e976f 931the result is not fully case-insensitive, but
a452d459
KW
932dotless and dotted I's are distinguished
933
934=back
935
936=item B<turkic>
937
938contains any special folding for Turkic languages. For versions of Unicode
939starting with 3.2, this field is empty unless I<code> has a different folding
940in Turkic languages, in which case it is one or more codes (separated by
a18e976f 941spaces) that, taken in order, give the code points for the case folding for
a452d459
KW
942I<code> in those languages.
943Each code has at least four hexdigits.
944Note that this folding does not maintain canonical equivalence without
945additional processing.
946
947For versions of Unicode 3.1 and earlier, this field is empty unless there is a
948special folding for Turkic languages, in which case I<status> is C<I>, and
949I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
950
951=back
952
953Programs that want complete generality and the best folding results should use
954the folding contained in the I<full> field. But note that the fold for some
955code points will be a sequence of multiple code points.
956
957Programs that can't cope with the fold mapping being multiple code points can
958use the folding contained in the I<simple> field, with the loss of some
959generality. In Unicode 5.1, about 7% of the defined foldings have no single
960code point folding.
961
962The I<mapping> and I<status> fields are provided for backwards compatibility for
963existing programs. They contain the same values as in previous versions of
964this function.
965
966Locale is not completely independent. The I<turkic> field contains results to
967use when the locale is a Turkic language.
b08cd201
JH
968
969For more information about case mappings see
a452d459 970L<http://www.unicode.org/unicode/reports/tr21>
b08cd201
JH
971
972=cut
973
974my %CASEFOLD;
975
# Populate %CASEFOLD from CaseFolding.txt, keyed by the numeric code point.
# Does nothing if the hash is already populated or the file can't be opened.
# Each entry is a hash with the fields 'code', 'status', 'mapping', 'full',
# 'simple' and 'turkic', as described in the pod for casefold().
sub _casefold {
    return if %CASEFOLD;
    return unless openunicode(\$CASEFOLDFH, "CaseFolding.txt");

    local $_;
    while (<$CASEFOLDFH>) {
        next unless /^([0-9A-F]+); ([CFIST]); ([0-9A-F]+(?: [0-9A-F]+)*);/;
        my ($hex, $status, $fold) = ($1, $2, $3);

        my $entry = $CASEFOLD{hex($hex)} ||= {};
        $entry->{'code'} = $hex;
        $entry->{'turkic'} = "" unless defined $entry->{'turkic'};

        if ($status eq 'C' || $status eq 'I') {

            # 'I' is only on 3.1 and earlier Unicodes.  Both entries there
            # (only 3.1 was checked) are the same as C, and there are no
            # other entries for those code points, so treat as if C, but
            # override the turkic one for 'I'.
            $entry->{'status'} = $status;
            $entry->{$_} = $fold for qw(mapping simple full);
            $entry->{'turkic'} = $fold if $status eq 'I';
        }
        elsif ($status eq 'F') {
            $entry->{'full'} = $fold;

            # Leave alone anything an 'S' line for this code point has
            # already set.
            unless (defined $entry->{'simple'}) {
                $entry->{'simple'} = "";
                $entry->{'mapping'} = $fold;
                $entry->{'status'} = $status;
            }
        }
        elsif ($status eq 'S') {

            # There can't be a simple without a full, and simple
            # overrides all but full
            $entry->{'simple'} = $fold;
            $entry->{'mapping'} = $fold;
            $entry->{'status'} = $status;
        }
        elsif ($status eq 'T') {
            $entry->{'turkic'} = $fold;
        } # else can't happen because only [CIFST] are possible
    }
    close($CASEFOLDFH);
}
1026
# Return the case folding information for the code point argument, or undef
# if the code point has no case folding.  Croaks if the argument can't be
# interpreted as a code point.
#
# The returned hash is a copy of the cached entry, so that a caller that
# modifies it can't corrupt the results of later calls.  (casespec() makes
# the same guarantee with dclone; a shallow copy is enough here because
# every field stored by _casefold() is a plain string.)
sub casefold {
    my $arg  = shift;
    my $code = _getcode($arg);
    croak __PACKAGE__, "::casefold: unknown code '$arg'"
	unless defined $code;

    _casefold() unless %CASEFOLD;

    my $fold = $CASEFOLD{$code};
    return ref $fold ? { %$fold } : $fold;
}
1037
a452d459 1038=head2 B<casespec()>
b08cd201 1039
55d7b906 1040 use Unicode::UCD 'casespec';
b08cd201 1041
a452d459 1042 my $casespec = casespec(0xFB00);
b08cd201 1043
a452d459
KW
1044This returns the potentially locale-dependent case mappings of the L</code point
1045argument>. The mappings may be longer than a single code point (which the basic
1046Unicode case mappings as returned by L</charinfo()> never are).
b08cd201 1047
a452d459
KW
1048If there are no case mappings for the L</code point argument>, or if all three
1049possible mappings (I<lower>, I<title> and I<upper>) result in single code
a18e976f 1050points and are locale independent and unconditional, C<undef> is returned
5d8e6e41
KW
1051(which means that the case mappings, if any, for the code point are those
1052returned by L</charinfo()>).
a452d459
KW
1053
1054Otherwise, a reference to a hash giving the mappings (or a reference to a hash
5d8e6e41
KW
1055of such hashes, explained below) is returned with the following keys and their
1056meanings:
a452d459
KW
1057
1058The keys in the bottom layer hash with the meanings of their values are:
1059
1060=over
1061
1062=item B<code>
1063
1064the input L</code point argument> expressed in hexadecimal, with leading zeros
1065added if necessary to make it contain at least four hexdigits
1066
1067=item B<lower>
1068
a18e976f 1069one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1070code points for the lower case of I<code>.
1071Each has at least four hexdigits.
1072
1073=item B<title>
b08cd201 1074
a18e976f 1075one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1076code points for the title case of I<code>.
1077Each has at least four hexdigits.
b08cd201 1078
d2da20e3 1079=item B<upper>
b08cd201 1080
a18e976f 1081one or more codes (separated by spaces) that, taken in order, give the
a452d459
KW
1082code points for the upper case of I<code>.
1083Each has at least four hexdigits.
1084
1085=item B<condition>
1086
1087the conditions for the mappings to be valid.
a18e976f 1088If C<undef>, the mappings are always valid.
a452d459
KW
1089When defined, this field is a list of conditions,
1090all of which must be true for the mappings to be valid.
1091The list consists of one or more
1092I<locales> (see below)
1093and/or I<contexts> (explained in the next paragraph),
1094separated by spaces.
1095(Other than as used to separate elements, spaces are to be ignored.)
1096Case distinctions in the condition list are not significant.
82c0b05b 1097Conditions preceded by "NON_" represent the negation of the condition.
b08cd201 1098
a452d459
KW
1099A I<context> is one of those defined in the Unicode standard.
1100For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
1101available at
5d8e6e41
KW
1102L<http://www.unicode.org/versions/Unicode5.1.0/>.
1103These are for context-sensitive casing.
f499c386 1104
a452d459
KW
1105=back
1106
5d8e6e41 1107The hash described above is returned for locale-independent casing, where
a18e976f 1108at least one of the mappings has length longer than one. If C<undef> is
5d8e6e41
KW
1109returned, the code point may have mappings, but if so, all are length one,
1110and are returned by L</charinfo()>.
1111Note that when this function does return a value, it will be for the complete
1112set of mappings for a code point, even those whose length is one.
1113
1114If there are additional casing rules that apply only in certain locales,
1115an additional key for each will be defined in the returned hash. Each such key
1116will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
1117followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
1118and a variant code). For the lists of all possible locales, see
1119L<Locale::Country> and L<Locale::Language>.
89e4a205 1120(In Unicode 6.0, the only locales returned by this function
a452d459 1121are C<lt>, C<tr>, and C<az>.)
b08cd201 1122
5d8e6e41
KW
1123Each locale key is a reference to a hash that has the form above, and gives
1124the casing rules for that particular locale, which take precedence over the
1125locale-independent ones when in that locale.
1126
1127If the only casing for a code point is locale-dependent, then the returned
1128hash will not have any of the base keys, like C<code>, C<upper>, etc., but
1129will contain only locale keys.
1130
b08cd201 1131For more information about case mappings see
a452d459 1132L<http://www.unicode.org/unicode/reports/tr21/>
b08cd201
JH
1133
1134=cut
1135
1136my %CASESPEC;
1137
# Populate %CASESPEC from SpecialCasing.txt, keyed by the numeric code point.
# Does nothing if the hash is already populated or the file can't be opened.
#
# The first line seen for a code point is stored directly as a hash with the
# keys 'code', 'lower', 'title', 'upper' and 'condition'.  Any further line
# for the same code point is stored one level down, under a key taken from
# the start of its condition (the locale).
sub _casespec {
    return if %CASESPEC;
    return unless openunicode(\$CASESPECFH, "SpecialCasing.txt");

    local $_;
    while (<$CASESPECFH>) {
	next unless /^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/;

	my %mapping;
	@mapping{qw(code lower title upper condition)} = ($1, $2, $3, $4, $5);
	my $code = hex($mapping{code});

	# First line for this code point: store it as-is.
	unless (exists $CASESPEC{$code}) {
	    $CASESPEC{$code} = \%mapping;
	    next;
	}

	# If what is stored so far is a single conditional mapping, push it
	# down a level under its own locale key before adding the new one.
	my $previous = $CASESPEC{$code};
	if (exists $previous->{code} && defined $previous->{condition}) {
	    my ($previous_locale) =
		($previous->{condition} =~ /^([a-z][a-z](?:_\S+)?)/);
	    my %moved = (code => $mapping{code});
	    @moved{qw(lower title upper condition)} =
		@{$previous}{qw(lower title upper condition)};
	    delete $CASESPEC{$code};
	    $CASESPEC{$code}->{$previous_locale} = \%moved;
	}

	my ($locale) = ($mapping{condition} =~ /^([a-z][a-z](?:_\S+)?)/);
	$CASESPEC{$code}->{$locale} = \%mapping;
    }
    close($CASESPECFH);
}
1191
# Return the special casing information for the code point argument, or
# undef if there is none.  Croaks if the argument can't be interpreted as a
# code point.  A deep copy is returned so that the caller can't alter the
# cached data.
sub casespec {
    my ($arg) = @_;
    my $code = _getcode($arg);
    defined $code
	or croak __PACKAGE__, "::casespec: unknown code '$arg'";

    _casespec() unless %CASESPEC;

    my $spec = $CASESPEC{$code};
    return $spec unless ref $spec;
    return dclone($spec);
}
1202
a452d459 1203=head2 B<namedseq()>
a2bd7410
JH
1204
1205 use Unicode::UCD 'namedseq';
1206
1207 my $namedseq = namedseq("KATAKANA LETTER AINU P");
1208 my @namedseq = namedseq("KATAKANA LETTER AINU P");
1209 my %namedseq = namedseq();
1210
1211If used with a single argument in a scalar context, returns the string
a18e976f 1212consisting of the code points of the named sequence, or C<undef> if no
a2bd7410 1213named sequence by that name exists. If used with a single argument in
956cae9a
KW
1214a list context, it returns the list of the ordinals of the code points. If used
1215with no
a2bd7410
JH
1216arguments in a list context, returns a hash with the names of the
1217named sequences as the keys and the named sequences as strings as
a18e976f 1218the values. Otherwise, it returns C<undef> or an empty list depending
a2bd7410
JH
1219on the context.
1220
a452d459
KW
1221This function only operates on officially approved (not provisional) named
1222sequences.
a2bd7410 1223
27f853a0
KW
1224Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
1225sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
1226LETTER AINU P")> will return the same string this function does, but will also
1227operate on character names that aren't named sequences, without you having to
1228know which are which. See L<charnames>.
1229
a2bd7410
JH
1230=cut
1231
1232my %NAMEDSEQ;
1233
# Populate %NAMEDSEQ from Name.pl, mapping each named sequence's name to the
# string of its characters.  Does nothing if the hash is already populated
# or the file can't be opened.
sub _namedseq {
    return if %NAMEDSEQ;
    return unless openunicode(\$NAMEDSEQFH, "Name.pl");

    local $_;
    while (<$NAMEDSEQFH>) {

	# Only lines that begin with a hex number followed by a space are
	# used; that is, ones whose first field has more than one code point.
	next unless /^ [0-9A-F]+ \ /x;
	chomp;
	my ($code_points, $name) = split /\t/;
	$NAMEDSEQ{$name} =
	    join("", map { chr(hex($_)) } split(' ', $code_points));
    }
    close($NAMEDSEQFH);
}
1250
# With no arguments in list context, return the hash of all named sequences
# (name => string).  With one argument, return that sequence as a string in
# scalar context, or as the list of its ordinals in list context.  Anything
# else, including void context, returns nothing.
sub namedseq {

    # Use charnames::string_vianame() which now returns this information,
    # unless the caller wants the hash returned, in which case we read it in,
    # and thereafter use it instead of calling charnames, as it is faster.

    my $list_context = wantarray();
    return unless defined $list_context;

    if ($list_context && @_ == 0) {
	_namedseq() unless %NAMEDSEQ;
	return %NAMEDSEQ;
    }

    return unless @_ == 1;

    my $string = %NAMEDSEQ
		 ? $NAMEDSEQ{ $_[0] }
		 : charnames::string_vianame($_[0]);
    return $string unless $list_context;

    return () unless defined $string;
    return map { ord($_) } split('', $string);
}
1280
7319f91d
KW
1281my %NUMERIC;
1282
# Populate %NUMERIC, which maps each code point listed in unicore/To/Nv.pl to
# its numeric value.  Rational values are stored as their decimal quotient,
# and %real_to_rational is filled in to allow mapping back to the original
# "numerator/denominator" string.  Croaks on Unicode versions before 6.0.
sub _numeric {

    # Unicode 6.0 instituted the rule that only digits in a consecutive
    # block of 10 would be considered decimal digits.  Before that, the only
    # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
    # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
    # ONE.  The code could be modified to handle that, but not bothering, as
    # in TUS 6.0, U+19DA was changed to Nt=Di.
    my $packed_version = pack "C*", split /\./, UnicodeVersion();
    croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
	if $packed_version lt 6.0.0;

    for my $range (_read_table("unicore/To/Nv.pl")) {
	my ($first, $last, $number) = @$range;

	# If value contains a slash, convert to decimal, add a reverse hash
	# used by charinfo.
	my @fraction = split /\//, $number;
	if (@fraction == 2) {
	    my $quotient = $fraction[0] / $fraction[1];
	    $real_to_rational{$quotient} = $number;
	    $number = $quotient;
	}

	$NUMERIC{$_} = $number for $first .. $last;
    }

    # Decided unsafe to use these that aren't officially part of the Unicode
    # standard.
    #use Math::Trig;
    #my $pi = acos(-1.0);
    #$NUMERIC{0x03C0} = $pi;

    # Euler's constant, not to be confused with Euler's number
    #$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;

    # Euler's number
    #$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;

    return;
}
1325
1326=pod
1327
67592e11 1328=head2 B<num()>
7319f91d 1329
eefd7bc2
KW
1330 use Unicode::UCD 'num';
1331
1332 my $val = num("123");
1333 my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
1334
7319f91d
KW
1335C<num> returns the numeric value of the input Unicode string; or C<undef> if it
1336doesn't think the entire string has a completely valid, safe numeric value.
1337
1338If the string is just one character in length, the Unicode numeric value
1339is returned if it has one, or C<undef> otherwise. Note that this need
1340not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
2dc5eb26
KW
1341example returns -0.5.
1342
1343=cut
7319f91d 1344
2dc5eb26
KW
1345#A few characters to which Unicode doesn't officially
1346#assign a numeric value are considered numeric by C<num>.
1347#These are:
1348
1349# EULER CONSTANT 0.5772... (this is NOT Euler's number)
1350# SCRIPT SMALL E 2.71828... (this IS Euler's number)
1351# GREEK SMALL LETTER PI 3.14159...
1352
1353=pod
7319f91d
KW
1354
1355If the string is more than one character, C<undef> is returned unless
8bb4c8e2 1356all its characters are decimal digits (that is, they would match C<\d+>),
7319f91d
KW
1357from the same script. For example if you have an ASCII '0' and a Bengali
1358'3', mixed together, they aren't considered a valid number, and C<undef>
1359is returned. A further restriction is that the digits all have to be of
1360the same form. A half-width digit mixed with a full-width one will
1361return C<undef>. The Arabic script has two sets of digits; C<num> will
1362return C<undef> unless all the digits in the string come from the same
1363set.
1364
1365C<num> errs on the side of safety, and there may be valid strings of
1366decimal digits that it doesn't recognize. Note that Unicode defines
1367a number of "digit" characters that aren't "decimal digit" characters.
a278d14b 1368"Decimal digits" have the property that they have a positional value, i.e.,
7319f91d
KW
1369there is a units position, a 10's position, a 100's, etc, AND they are
1370arranged in Unicode in blocks of 10 contiguous code points. The Chinese
1371digits, for example, are not in such a contiguous block, and so Unicode
1372doesn't view them as decimal digits, but merely digits, and so C<\d> will not
1373match them. A single-character string containing one of these digits will
1374have its decimal value returned by C<num>, but any longer string containing
1375only these digits will return C<undef>.
1376
a278d14b
KW
1377Strings of multiple sub- and superscripts are not recognized as numbers. You
1378can use either of the compatibility decompositions in Unicode::Normalize to
7319f91d
KW
1379change these into digits, and then call C<num> on the result.
1380
1381=cut
1382
1383# To handle sub, superscripts, this could if called in list context,
1384# consider those, and return the <decomposition> type in the second
1385# array element.
1386
# Return the numeric value of the input string, or undef (an empty list in
# list context for the multi-character failures) if the string as a whole
# doesn't have a safe numeric value.
#
# A single character yields its entry in %NUMERIC, if any.  A longer string
# must consist solely of characters matching \d, all of which must lie in the
# same run of ten code points as the first one; its value is then computed
# positionally.
#
# An undefined or empty input returns nothing, instead of falling through to
# the multi-character code and raising "uninitialized" warnings there.
sub num {
    my $string = $_[0];

    _numeric() unless %NUMERIC;

    return unless defined $string;
    my $length = length($string);
    return unless $length;

    return $NUMERIC{ord($string)} if $length == 1;
    return if $string =~ /\D/;

    my $first_ord = ord(substr($string, 0, 1));
    my $value = $NUMERIC{$first_ord};

    # Every character matched \d, so this should always be defined; the
    # check just avoids arithmetic on undef should the tables disagree.
    return unless defined $value;

    # The zero of the block of ten digits that the first character is in.
    my $zero_ord = $first_ord - $value;

    for my $i (1 .. $length -1) {
        my $ord = ord(substr($string, $i, 1));
        my $digit = $ord - $zero_ord;

        # Fail if this digit isn't in the same block of ten as the first.
        return unless $digit >= 0 && $digit <= 9;
        $value = $value * 10 + $digit;
    }
    return $value;
}
1407
7ef25837
KW
1408=pod
1409
1410=head2 B<prop_aliases()>
1411
1412 use Unicode::UCD 'prop_aliases';
1413
1414 my ($short_name, $full_name, @other_names) = prop_aliases("space");
1415 my $same_full_name = prop_aliases("Space"); # Scalar context
1416 my ($same_short_name) = prop_aliases("Space"); # gets 0th element
1417 print "The full name is $full_name\n";
1418 print "The short name is $short_name\n";
1419 print "The other aliases are: ", join(", ", @other_names), "\n";
1420
1421 prints:
1422 The full name is White_Space
1423 The short name is WSpace
1424 The other aliases are: Space
1425
1426Most Unicode properties have several synonymous names. Typically, there is at
1427least a short name, convenient to type, and a long name that more fully
1428describes the property, and hence is more easily understood.
1429
1430If you know one name for a Unicode property, you can use C<prop_aliases> to find
1431either the long name (when called in scalar context), or a list of all of the
1432names, somewhat ordered so that the short name is in the 0th element, the long
1433name in the next element, and any other synonyms are in the remaining
1434elements, in no particular order.
1435
1436The long name is returned in a form nicely capitalized, suitable for printing.
1437
1438The input parameter name is loosely matched, which means that white space,
1439hyphens, and underscores are ignored (except for the trailing underscore in
1440the old-form grandfathered-in C<"L_">, which is better written as C<"LC">, and
1441both of which mean C<General_Category=Cased Letter>).
1442
1443If the name is unknown, C<undef> is returned (or an empty list in list
1444context). Note that Perl typically recognizes property names in regular
1445expressions with an optional C<"Is_"> (with or without the underscore)
1446prefixed to them, such as C<\p{isgc=punct}>. This function does not recognize
1447those in the input, returning C<undef>. Nor are they included in the output
1448as possible synonyms.
1449
1450C<prop_aliases> does know about the Perl extensions to Unicode properties,
1451such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
1452properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>. The
1453final example demonstrates that the C<"Is_"> prefix is recognized for these
1454extensions; it is needed to resolve ambiguities. For example,
1455C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
1456C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>. This is
1457because C<islc> is a Perl extension which is short for
1458C<General_Category=Cased Letter>. The lists returned for the Perl extensions
1459will not include the C<"Is_"> prefix (whether or not the input had it) unless
1460needed to resolve ambiguities, as shown in the C<"islc"> example, where the
1461returned list had one element containing C<"Is_">, and the other without.
1462
1463It is also possible for the reverse to happen: C<prop_aliases('isc')> returns
1464the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
1465C<(C, Other)> (the latter being a Perl extension meaning
1466C<General_Category=Other>). L<perluniprops> lists the available forms,
1467including which ones are discouraged from use.
1468
1469Those discouraged forms are accepted as input to C<prop_aliases>, but are not
1470returned in the lists. C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
1471which are old synonyms for C<"Is_LC"> and should not be used in new code, are
1472examples of this. These both return C<(Is_LC, Cased_Letter)>. Thus this
1473function allows you to take a discouraged form, and find its acceptable
1474alternatives. The same goes with single-form Block property equivalences.
1475Only the forms that begin with C<"In_"> are not discouraged; if you pass
1476C<prop_aliases> a discouraged form, you will get back the equivalent ones that
1477begin with C<"In_">. It will otherwise look like a new-style block name (see
1478L</Old-style versus new-style block names>).
1479
1480C<prop_aliases> does not know about any user-defined properties, and will
1481return C<undef> if called with one of those. Likewise for Perl internal
1482properties, with the exception of "Perl_Decimal_Digit" which it does know
1483about (and which is documented below in L</prop_invmap()>).
1484
1485=cut
1486
1487# It may be that there are use cases where the discouraged forms should be
1488# returned. If that comes up, an optional boolean second parameter to the
1489# function could be created, for example.
1490
1491# These are created by mktables for this routine and stored in unicore/UCD.pl
1492# where their structures are described.
1493our %string_property_loose_to_name;
1494our %ambiguous_names;
1495our %loose_perlprop_to_name;
1496our %prop_aliases;
1497
# Return the aliases of the Unicode property named by the single argument:
# in list context the list (short name, full name, other synonyms...); in
# scalar context just the full name.  Returns nothing if the argument is
# undefined or isn't a known property name.
#
# Every successful return path honors the calling context.  In particular,
# the single-form gc/script/block extensions return element 1 (the full
# name) in scalar context rather than the array itself, which would evaluate
# to its element count.
sub prop_aliases ($) {
    my $prop = $_[0];
    return unless defined $prop;

    require "unicore/UCD.pl";
    require "unicore/Heavy.pl";
    require "utf8_heavy.pl";

    # The property name may be loosely or strictly matched; we don't know yet.
    # But both types use lower-case.
    $prop = lc $prop;

    # It is loosely matched if its lower case isn't known to be strict.
    my $list_ref;
    if (! exists $utf8::stricter_to_file_of{$prop}) {
        my $loose = utf8::_loose_name($prop);

        # There is a hash that converts from any loose name to its standard
        # form, mapping all synonyms for a name to one name that can be used
        # as a key into another hash.  The whole concept is for memory
        # savings, as the second hash doesn't have to have all the
        # combinations.  Actually, there are two hashes that do the
        # conversion.  One is used in utf8_heavy.pl (stored in Heavy.pl) for
        # looking up properties matchable in regexes.  This function needs to
        # access string properties, which aren't available in regexes, so a
        # second conversion hash is made for them (stored in UCD.pl).  Look in
        # the string one now, as the rest can have an optional 'is' prefix,
        # which these don't.
        if (exists $string_property_loose_to_name{$loose}) {

            # Convert to its standard loose name.
            $prop = $string_property_loose_to_name{$loose};
        }
        else {
            my $retrying = 0;   # bool.  ? Has an initial 'is' been stripped
        RETRY:
            if (exists $utf8::loose_property_name_of{$loose}
                && (! $retrying
                    || ! exists $ambiguous_names{$loose}))
            {
                # Found an entry giving the standard form.  We don't get here
                # (in the test above) when we've stripped off an
                # 'is' and the result is an ambiguous name.  That is because
                # these are official Unicode properties (though Perl can have
                # an optional 'is' prefix meaning the official property), and
                # all ambiguous cases involve a Perl single-form extension
                # for the gc, script, or block properties, and the stripped
                # 'is' means that they mean one of those, and not one of
                # these
                $prop = $utf8::loose_property_name_of{$loose};
            }
            elsif (exists $loose_perlprop_to_name{$loose}) {

                # This hash is specifically for this function to list Perl
                # extensions that aren't in the earlier hashes.  If there is
                # only one element, the short and long names are identical.
                # Otherwise the form is already in the same form as
                # %prop_aliases, which is handled at the end of the function.
                $list_ref = $loose_perlprop_to_name{$loose};
                if (@$list_ref == 1) {
                    my @list = ($list_ref->[0], $list_ref->[0]);
                    $list_ref = \@list;
                }
            }
            elsif (! exists $utf8::loose_to_file_of{$loose}) {

                # loose_to_file_of is a complete list of loose names.  If not
                # there, the input is unknown.
                return;
            }
            else {

                # Here we found the name but not its aliases, so it has to
                # exist.  This means it must be one of the Perl single-form
                # extensions.  First see if it is for a property-value
                # combination in one of the following properties.
                my @list;
                foreach my $property ("gc", "script") {
                    @list = prop_value_aliases($property, $loose);
                    last if @list;
                }
                if (@list) {

                    # Here, it is one of those property-value combination
                    # single-form synonyms.  There are ambiguities with some
                    # of these.  Check against the list for these, and adjust
                    # if necessary.
                    for my $i (0 .. @list -1) {
                        if (exists $ambiguous_names
                                   {utf8::_loose_name(lc $list[$i])})
                        {
                            # The ambiguity is resolved by toggling whether or
                            # not it has an 'is' prefix
                            $list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
                        }
                    }

                    # The full name is in element 1.
                    return wantarray ? @list : $list[1];
                }

                # Here, it wasn't one of the gc or script single-form
                # extensions.  It could be a block property single-form
                # extension.  An 'in' prefix definitely means that, and should
                # be looked up without the prefix.
                my $began_with_in = $loose =~ s/^in//;
                @list = prop_value_aliases("block", $loose);
                if (@list) {
                    s/^/In_/ for @list;

                    # The full name is in element 1.
                    return wantarray ? @list : $list[1];
                }

                # Here still haven't found it.  The last opportunity for it
                # being valid is only if it began with 'is'.  We retry without
                # the 'is', setting a flag to that effect so that we don't
                # accept things that begin with 'isis...'
                if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
                    $retrying = 1;
                    goto RETRY;
                }

                # Here, didn't find it.  Since it was in %loose_to_file_of, we
                # should have been able to find it.
                carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'.  Send bug report to perlbug\@perl.org";
                return;
            }
        }
    }

    if (! $list_ref) {
        # Here, we have set $prop to a standard form name of the input.  Look
        # it up in the structure created by mktables for this purpose, which
        # contains both strict and loosely matched properties.  Avoid
        # autovivifying.
        $list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
        return unless $list_ref;
    }

    # The full name is in element 1.
    return $list_ref->[1] unless wantarray;

    # Copy, so that the caller can't change the cached list.
    return @{dclone $list_ref};
}
1639
1640=pod
1641
1642=head2 B<prop_value_aliases()>
1643
1644 use Unicode::UCD 'prop_value_aliases';
1645
1646 my ($short_name, $full_name, @other_names)
1647 = prop_value_aliases("Gc", "Punct");
1648 my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt
1649 my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
1650 # element
1651 print "The full name is $full_name\n";
1652 print "The short name is $short_name\n";
1653 print "The other aliases are: ", join(", ", @other_names), "\n";
1654
1655 prints:
1656 The full name is Punctuation
1657 The short name is P
1658 The other aliases are: Punct
1659
1660Some Unicode properties have a restricted set of legal values. For example,
1661all binary properties are restricted to just C<true> or C<false>; and there
1662are only a few dozen possible General Categories.
1663
1664For such properties, there are usually several synonyms for each possible
1665value. For example, in binary properties, I<truth> can be represented by any of
1666the strings "Y", "Yes", "T", or "True"; and the General Category
1667"Punctuation" by that string, or "Punct", or simply "P".
1668
1669Like property names, there is typically at least a short name for each such
1670property-value, and a long name. If you know any name of the property-value,
1671you can use C<prop_value_aliases>() to get the long name (when called in
1672scalar context), or a list of all the names, with the short name in the 0th
1673element, the long name in the next element, and any other synonyms in the
1674remaining elements, in no particular order, except that any all-numeric
1675synonyms will be last.
1676
1677The long name is returned in a form nicely capitalized, suitable for printing.
1678
1679Case, white space, hyphens, and underscores are ignored in the input parameters
1680(except for the trailing underscore in the old-form grandfathered-in general
1681category property value C<"L_">, which is better written as C<"LC">).
1682
1683If either name is unknown, C<undef> is returned. Note that Perl typically
1684recognizes property names in regular expressions with an optional C<"Is_">
1685(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
1686This function does not recognize those in the property parameter, returning
1687C<undef>.
1688
1689If called with a property that doesn't have synonyms for its values, it
1690returns the input value, possibly normalized with capitalization and
1691underscores.
1692
1693For the block property, new-style block names are returned (see
1694L</Old-style versus new-style block names>).
1695
1696To find the synonyms for single-forms, such as C<\p{Any}>, use
1697L</prop_aliases()> instead.
1698
1699C<prop_value_aliases> does not know about any user-defined properties, and
1700will return C<undef> if called with one of those.
1701
1702=cut
1703
# mktables generates both of these hashes for the benefit of the routine
# below; they are stored in unicore/UCD.pl, which also describes their layout.
our %loose_to_standard_value;
our %prop_value_aliases;

# prop_value_aliases($prop, $value)
#
# Given any name of a property and any name of one of its values, returns the
# value's full name in scalar context, or in list context the list
# (short name, full name, other synonyms...).  Returns nothing (undef / empty
# list) when either argument is undefined, or when the property or the
# property-value combination is not recognized.  For a property that has no
# table of value synonyms, the input value is handed back unchanged.
sub prop_value_aliases ($$) {
    my ($prop, $value) = @_;
    return if !defined($prop) || !defined($value);

    require "unicore/UCD.pl";
    require "utf8_heavy.pl";

    # The other hashes are keyed by one particular synonym of the property
    # name: the one prop_aliases() returns as element 0 of its list.
    ($prop) = prop_aliases($prop);
    return unless $prop;
    $prop = utf8::_loose_name(lc $prop);

    # The property is legal, but mktables only records value synonyms for
    # properties with a very finite number of potential values.  Properties
    # whose value could be anything (most, if not all, string properties) are
    # absent from the hash and have no synonyms anyway, so the input is simply
    # returned.  For example, there is no synonym for
    # ('Uppercase_Mapping', 'A').
    return $value unless exists $prop_value_aliases{$prop};

    # Whether the value ends up being matched loosely or strictly isn't known
    # yet, but lower case is used in both cases.
    $value = lc $value;

    # A name that can't be found under loose matching certainly can't be found
    # under strict matching either.
    my $loose_value = utf8::_loose_name($value);
    my $loose_key   = "$prop=$loose_value";
    return unless exists $loose_to_standard_value{$loose_key};

    # Likewise, a combination that doesn't exist under loose matching won't
    # exist under strict.
    my $standard_value = $loose_to_standard_value{$loose_key};
    return unless exists $prop_value_aliases{$prop}{$standard_value};

    # A combination was found under loose matching rules, but it might be one
    # that is required to match strictly and so shouldn't have matched.
    # %prop_value_aliases is set up so that strict matches appear as if they
    # were in loose form; hence if the non-loose version of the input is
    # legal, nothing more needs checking.  Nothing more needs checking either
    # when the value is simply a loose match: mktables has verified that no
    # strict name under loose rules maps to an existing loose name.  This
    # relies on the very limited circumstances in which strict names can occur
    # here.  Strict matching happens under two conditions:
    #   1) when the name begins with an underscore.  But this function doesn't
    #      accept those, and %prop_value_aliases doesn't have them.
    #   2) when the values are numeric, in which case a closer look is needed;
    #      their squeezed-out loose values will be in %stricter_to_file_of.
    my $needs_strict_check
        = exists $utf8::stricter_to_file_of{$loose_key}
          && ! exists $utf8::stricter_to_file_of{"$prop=$value"};

    if ($needs_strict_check) {

        # The only looseness permitted under strict matching is an underscore
        # between a pair of digits.  XXX
        1 while $value =~ s/(\d)_(\d)/$1$2/g;
        return unless exists $utf8::stricter_to_file_of{"$prop=$value"};
    }

    # The combination is known to exist at this point; hand it back.
    my $list_ref           = $prop_value_aliases{$prop}{$standard_value};
    my $has_distinct_names = @$list_ref > 1;

    # Outside list context only a single name is wanted: the full name, which
    # is element 1 when the list has more than one entry, and otherwise the
    # sole entry.
    if (! wantarray) {
        return $has_distinct_names ? $list_ref->[1] : $list_ref->[0];
    }

    # List context: return a copy of the stored list.
    return @{dclone $list_ref} if $has_distinct_names;

    # A single stored element means the same name fills both slots.
    return ( $list_ref->[0], $list_ref->[0] );
}
7319f91d
KW
1784
1785
55d7b906 1786=head2 Unicode::UCD::UnicodeVersion
10a6ecd2 1787
a452d459
KW
1788This returns the version of the Unicode Character Database, in other words, the
1789version of the Unicode standard the database implements. The version is a
1790string of numbers delimited by dots (C<'.'>).
10a6ecd2
JH
1791
1792=cut
1793
# Cache for UnicodeVersion().  It is only assigned after the version string
# has passed validation, so a malformed value is never remembered.
my $UNICODEVERSION;

# Returns the version of the Unicode Character Database as a string of numbers
# delimited by dots.  The string is read from the "version" file on the first
# call and cached for later calls.  Croaks if what was read does not have that
# form.
sub UnicodeVersion {
    unless (defined $UNICODEVERSION) {
        openunicode(\$VERSIONFH, "version");

        # The caller may have changed the input record separator.  Pin it to
        # a newline for this block so that the read returns just the first
        # line and chomp() strips exactly that line's terminator.
        local $/ = "\n";

        my $version = <$VERSIONFH>;
        close($VERSIONFH);

        # NOTE(review): openunicode() is defined outside this section.
        # Clearing the closed handle presumably lets a later call reopen the
        # file if this attempt croaks below -- confirm against openunicode().
        undef $VERSIONFH;

        # An empty file yields undef; treat that as an empty (and therefore
        # invalid) version rather than provoking "uninitialized" warnings.
        $version = '' unless defined $version;
        chomp $version;

        # Validate before caching, so that a failure here is reported again
        # on the next call instead of the bad string being returned silently.
        croak __PACKAGE__, "::VERSION: strange version '$version'"
            unless $version =~ /^\d+(?:\.\d+)+$/;

        $UNICODEVERSION = $version;
    }
    return $UNICODEVERSION;
}
3aa957f9 1806
a452d459
KW
1807=head2 B<Blocks versus Scripts>
1808
1809The difference between a block and a script is that scripts are closer
1810to the linguistic notion of a set of code points required to present
1811languages, while block is more of an artifact of the Unicode code point
1812numbering and separation into blocks of (mostly) 256 code points.
1813
1814For example the Latin B<script> is spread over several B<blocks>, such
1815as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
1816C<Latin Extended-B>. On the other hand, the Latin script does not
1817contain all the characters of the C<Basic Latin> block (also known as
1818ASCII): it includes only the letters, and not, for example, the digits
1819or the punctuation.
1820
1821For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
1822
1823For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
1824
1825=head2 B<Matching Scripts and Blocks>
1826
1827Scripts are matched with the regular-expression construct
1828C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
f200dd12 1829while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
a452d459
KW
1830any of the 256 code points in the Tibetan block).
1831
8b731da2
JH
1832=head1 BUGS
1833
1834Does not yet support EBCDIC platforms.
1835
561c79ed
JH
1836=head1 AUTHOR
1837
a18e976f 1838Jarkko Hietaniemi. Now maintained by perl5 porters.
561c79ed
JH
1839
1840=cut
1841
18421;