no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.62';
+our $VERSION = '0.70';
require Exporter;
use Unicode::UCD 'search_invlist';
my $index = search_invlist(\@invlist, $code_point);
+ # The following function should be used only internally in
+ # implementations of the Unicode Normalization Algorithm, and there
+ # are better choices than it.
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
223 # Decimal 223 in native character set
0223 # Hexadecimal 223, native (= 547 decimal)
- 0xDF # Hexadecimal DF, native (= 223 decimal
+ 0xDF # Hexadecimal DF, native (= 223 decimal)
+ '0xDF' # String form of hexadecimal (= 223 decimal)
'U+DF' # Hexadecimal DF, in Unicode's character set
(= LATIN SMALL LETTER SHARP S)
=cut
-my $BLOCKSFH;
-my $VERSIONFH;
-my $CASEFOLDFH;
-my $CASESPECFH;
-my $NAMEDSEQFH;
my $v_unicode_version; # v-string.
sub openunicode {
- my ($rfh, @path) = @_;
- my $f;
- unless (defined $$rfh) {
- for my $d (@INC) {
- use File::Spec;
- $f = File::Spec->catfile($d, "unicore", @path);
- last if open($$rfh, $f);
- undef $f;
- }
- croak __PACKAGE__, ": failed to find ",
- File::Spec->catfile(@path), " in @INC"
- unless defined $f;
+ my (@path) = @_;
+ my $rfh;
+ for my $d (@INC) {
+ use File::Spec;
+ my $f = File::Spec->catfile($d, "unicore", @path);
+ return $rfh if open($rfh, '<', $f);
}
- return $f;
+ croak __PACKAGE__, ": failed to find ",
+ File::Spec->catfile("unicore", @path), " in @INC";
}
sub _dclone ($) { # Use Storable::dclone if available; otherwise emulate it.
the script I<code> belongs to.
The L</prop_value_aliases()> function can be used to get all the synonyms
-of the script name.
+of the script name. Note that this is the older "Script" property value, and
+not the improved "Script_Extensions" value.
See L</Blocks versus Scripts>.
my %UNICODE_1_NAMES;
my %ISO_COMMENT;
+# Eval'd so can run on versions earlier than the property is available in
+my $Hangul_Syllables_re = eval 'qr/\p{Block=Hangul_Syllables}/';
+
sub charinfo {
# This function has traditionally mimicked what is in UnicodeData.txt,
# "Canonical" imply a compatible decomposition, and the type is prefixed
# to that, as it is in UnicodeData.txt
UnicodeVersion() unless defined $v_unicode_version;
- if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) {
+ if ($v_unicode_version ge v2.0.0 && $char =~ $Hangul_Syllables_re) {
# The code points of the decomposition are output in standard Unicode
# hex format, separated by blanks.
$prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
=cut
-sub charprop ($$) {
- my ($input_cp, $prop) = @_;
+sub charprop ($$;$) {
+ my ($input_cp, $prop, $internal_ok) = @_;
my $cp = _getcode($input_cp);
croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
my ($list_ref, $map_ref, $format, $default)
- = prop_invmap($prop);
+ = prop_invmap($prop, $internal_ok);
return undef unless defined $list_ref;
my $i = search_invlist($list_ref, $cp);
push @BLOCKS, $subrange;
push @{$BLOCKS{'No_Block'}}, $subrange;
}
- elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+ else {
+ my $blocksfh = openunicode("Blocks.txt");
local $_;
local $/ = "\n";
- while (<$BLOCKSFH>) {
+ while (<$blocksfh>) {
# Old versions used a different syntax to mark the range.
$_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0;
push @{$BLOCKS{$3}}, $subrange;
}
}
- close($BLOCKSFH);
if (! IS_ASCII_PLATFORM) {
# The first two blocks, through 0xFF, are wrong on EBCDIC
# platforms.
elsif (exists $BLOCKS{$arg}) {
return _dclone $BLOCKS{$arg};
}
+
+ carp __PACKAGE__, "::charblock: unknown code '$arg'";
+ return;
}
=head2 B<charscript()>
The L</prop_value_aliases()> function can be used to get all the synonyms
of the script name.
+Note that the Script_Extensions property is an improved version of the Script
+property, and you should probably be using that instead, with the
+L</charprop()> function.
+
If supplied with an argument that can't be a code point, charscript() tries
to do the opposite and interpret the argument as a script name. The
return value is a I<range set>: an anonymous array of arrays that contain
return _dclone $SCRIPTS{$arg};
}
+ carp __PACKAGE__, "::charscript: unknown code '$arg'";
return;
}
the values.
L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
-different type of data structure.
+different type of data structure. Since the Script_Extensions property is an
+improved version of the Script property, you should instead use
+L<prop_invmap("scx")|/prop_invmap()>.
L<C<prop_values("Script")>|/prop_values()> can be used to get all
the known script names as a list, without the code point ranges.
=head2 B<compexcl()>
+WARNING: Unicode discourages the use of this function or any of the
+alternative mechanisms listed in this section (the documentation of
+C<compexcl()>), except internally in implementations of the Unicode
+Normalization Algorithm. You should be using L<Unicode::Normalize> directly
+instead of these. Using these will likely lead to half-baked results.
+
use Unicode::UCD 'compexcl';
my $compexcl = compexcl(0x09dc);
=cut
+# Eval'd so can run on versions earlier than the property is available in
+my $Composition_Exclusion_re = eval 'qr/\p{Composition_Exclusion}/';
+
sub compexcl {
my $arg = shift;
my $code = _getcode($arg);
return if $v_unicode_version lt v3.0.0;
no warnings "non_unicode"; # So works on non-Unicode code points
- return chr($code) =~ /\p{Composition_Exclusion}/;
+ return chr($code) =~ $Composition_Exclusion_re
}
=head2 B<casefold()>
sub _casespec {
unless (%CASESPEC) {
UnicodeVersion() unless defined $v_unicode_version;
- if ($v_unicode_version lt v2.1.8) {
- %CASESPEC = {};
- }
- elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+ if ($v_unicode_version ge v2.1.8) {
+ my $casespecfh = openunicode("SpecialCasing.txt");
local $_;
local $/ = "\n";
- while (<$CASESPECFH>) {
+ while (<$casespecfh>) {
if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
my ($hexcode, $lower, $title, $upper, $condition) =
}
}
}
- close($CASESPECFH);
}
}
}
sub _namedseq {
unless (%NAMEDSEQ) {
- if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
- local $_;
- local $/ = "\n";
- while (<$NAMEDSEQFH>) {
- if (/^ [0-9A-F]+ \ /x) {
- chomp;
- my ($sequence, $name) = split /\t/;
- my @s = map { chr(hex($_)) } split(' ', $sequence);
- $NAMEDSEQ{$name} = join("", @s);
- }
- }
- close($NAMEDSEQFH);
- }
+ my $namedseqfh = openunicode("Name.pl");
+ local $_;
+ local $/ = "\n";
+ while (<$namedseqfh>) {
+ if (/^ [0-9A-F]+ \ /x) {
+ chomp;
+ my ($sequence, $name) = split /\t/;
+ my @s = map { chr(hex($_)) } split(' ', $sequence);
+ $NAMEDSEQ{$name} = join("", @s);
+ }
+ }
}
}
my $val = num("123");
my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
+ my $val = num("12a", \$valid_length); # $valid_length contains 2
C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
doesn't think the entire string has a completely valid, safe numeric value.
+If called with an optional second parameter, a reference to a scalar, C<num()>
+will set the scalar to the length of any valid initial substring; or to 0 if none.
If the string is just one character in length, the Unicode numeric value
-is returned if it has one, or C<undef> otherwise. Note that this need
-not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
-example returns -0.5.
+is returned if it has one, or C<undef> otherwise. If the optional scalar ref
+is passed, it would be set to 1 if the return is valid; or 0 if the return is
+C<undef>. Note that the numeric value returned need not be a whole number.
+C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
=cut
the same form. A half-width digit mixed with a full-width one will
return C<undef>. The Arabic script has two sets of digits; C<num> will
return C<undef> unless all the digits in the string come from the same
-set.
+set. In all cases, the optional scalar ref parameter is set to how
+long any valid initial substring of digits is; hence it will be set to the
+entire string length if the main return value is not C<undef>.
C<num> errs on the side of safety, and there may be valid strings of
decimal digits that it doesn't recognize. Note that Unicode defines
# consider those, and return the <decomposition> type in the second
# array element.
-sub num {
- my $string = $_[0];
+sub num ($;$) {
+ my ($string, $retlen_ref) = @_;
+
+ use feature 'unicode_strings';
_numeric unless %NUMERIC;
+ $$retlen_ref = 0 if $retlen_ref; # Assume will fail
+
+ my $length = length $string;
+ return if $length == 0;
- my $length = length($string);
- return $NUMERIC{ord($string)} if $length == 1;
- return if $string =~ /\D/;
my $first_ord = ord(substr($string, 0, 1));
+ return if ! exists $NUMERIC{$first_ord}
+ || ! defined $NUMERIC{$first_ord};
+
+ # Here, we know the first character is numeric
my $value = $NUMERIC{$first_ord};
+ $$retlen_ref = 1 if $retlen_ref; # Assume only this one is numeric
+
+ return $value if $length == 1;
+
+ # Here, the input is longer than a single character. To be valid, it must
+ # be entirely decimal digits, which means it must start with one.
+ return if $string =~ / ^ \D /x;
# To be a valid decimal number, it should be in a block of 10 consecutive
# characters, whose values are 0, 1, 2, ... 9. Therefore this digit's
# release, we verify that this first character is a member of such a
# block. That is, that the block of characters surrounding this one
# consists of all \d characters whose numeric values are the expected
- # ones.
+ # ones. If not, then this single character is numeric, but the string as
+ # a whole is not considered to be.
UnicodeVersion() unless defined $v_unicode_version;
if ($v_unicode_version lt v6.0.0) {
for my $i (0 .. 9) {
# function.
my $ord = ord(substr($string, $i, 1));
my $digit = $ord - $zero_ord;
- return unless $digit >= 0 && $digit <= 9;
+ if ($digit < 0 || $digit > 9) {
+ $$retlen_ref = $i if $retlen_ref;
+ return;
+ }
$value = $value * 10 + $digit;
}
+ $$retlen_ref = $length if $retlen_ref;
return $value;
}
return ( $list_ref->[0], $list_ref->[0] );
}
-# All 1 bits is the largest possible UV.
-$Unicode::UCD::MAX_CP = ~0;
+# All 1 bits but the top one is the largest possible IV.
+$Unicode::UCD::MAX_CP = (~0) >> 1;
=pod
are also specified in L<perluniprops|perluniprops/Properties accessible
through \p{} and \P{}>. Examples of using the "property=value" form are:
- say join ", ", prop_invlist("Script=Shavian");
+ say join ", ", prop_invlist("Script_Extensions=Shavian");
prints:
66640, 66688
for (my $i = 0; $i < @invlist; $i += 2) {
my $upper = ($i + 1) < @invlist
? $invlist[$i+1] - 1 # In range
- : $Unicode::UCD::MAX_CP; # To infinity. You may want
- # to stop much much earlier;
- # going this high may expose
- # perl deficiencies with very
- # large numbers.
+ : $Unicode::UCD::MAX_CP; # To infinity.
for my $j ($invlist[$i] .. $upper) {
push @full_list, $j;
}
Note that the mapping is the one that is specified in the Unicode data files,
and to get the final decomposition, it may need to be applied recursively.
+Unicode in fact discourages use of this property except internally in
+implementations of the Unicode Normalization Algorithm.
The fourth (index [3]) element (C<$default>) in the list returned for this
format is 0.
The returned values for the Perl extension properties, such as C<Any> and
C<Greek> are somewhat misleading. The values are either C<"Y"> or C<"N>".
All Unicode properties are bipartite, so you can actually use the C<"Y"> or
-C<"N>" in a Perl regular rexpression for these, like C<qr/\p{ID_Start=Y/}> or
+C<"N>" in a Perl regular expression for these, like C<qr/\p{ID_Start=Y/}> or
C<qr/\p{Upper=N/}>. But the Perl extensions aren't specified this way, only
like C</qr/\p{Any}>, I<etc>. You can't actually use the C<"Y"> and C<"N>" in
them.
+=head3 Getting every available name
+
+Instead of reading the Unicode Database directly from files, as you were able
+to do for a long time, you are encouraged to use the supplied functions. So,
+instead of reading C<Name.pl> - which may disappear without notice in the
+future - directly, as with
+
+ my (%name, %cp);
+ for (split m/\s*\n/ => do "unicore/Name.pl") {
+ my ($cp, $name) = split m/\t/ => $_;
+ $cp{$name} = $cp;
+ $name{$cp} = $name unless $cp =~ m/ /;
+ }
+
+You ought to use L</prop_invmap()> like this:
+
+ my (%name, %cp, %cps, $n);
+ # All codepoints
+ foreach my $cat (qw( Name Name_Alias )) {
+ my ($codepoints, $names, $format, $default) = prop_invmap($cat);
+ # $format => "n", $default => ""
+ foreach my $i (0 .. @$codepoints - 2) {
+ my ($cp, $n) = ($codepoints->[$i], $names->[$i]);
+ # If $n is a ref, the same codepoint has multiple names
+ foreach my $name (ref $n ? @$n : $n) {
+ $name{$cp} //= $name;
+ $cp{$name} //= $cp;
+ }
+ }
+ }
+ # Named sequences
+ { my %ns = namedseq();
+ foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) {
+ $cp{$name} //= [ map { ord } split "" => $ns{$name} ];
+ }
+ }
+
=cut
# User-defined properties could be handled with some changes to utf8_heavy.pl;
sub UnicodeVersion {
unless (defined $UNICODEVERSION) {
- openunicode(\$VERSIONFH, "version");
+ my $versionfh = openunicode("version");
local $/ = "\n";
- chomp($UNICODEVERSION = <$VERSIONFH>);
- close($VERSIONFH);
+ chomp($UNICODEVERSION = <$versionfh>);
croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
}