no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.61';
+our $VERSION = '0.70';
require Exporter;
use Unicode::UCD 'search_invlist';
my $index = search_invlist(\@invlist, $code_point);
+ # The following function should be used only internally in
+ # implementations of the Unicode Normalization Algorithm, and there
+ # are better choices than it.
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
223 # Decimal 223 in native character set
0223 # Hexadecimal 223, native (= 547 decimal)
- 0xDF # Hexadecimal DF, native (= 223 decimal
+ 0xDF # Hexadecimal DF, native (= 223 decimal)
+ '0xDF' # String form of hexadecimal (= 223 decimal)
'U+DF' # Hexadecimal DF, in Unicode's character set
(= LATIN SMALL LETTER SHARP S)
=cut
-my $BLOCKSFH;
-my $VERSIONFH;
-my $CASEFOLDFH;
-my $CASESPECFH;
-my $NAMEDSEQFH;
my $v_unicode_version; # v-string.
sub openunicode {
- my ($rfh, @path) = @_;
- my $f;
- unless (defined $$rfh) {
- for my $d (@INC) {
- use File::Spec;
- $f = File::Spec->catfile($d, "unicore", @path);
- last if open($$rfh, $f);
- undef $f;
- }
- croak __PACKAGE__, ": failed to find ",
- File::Spec->catfile(@path), " in @INC"
- unless defined $f;
+ my (@path) = @_;
+ my $rfh;
+ for my $d (@INC) {
+ use File::Spec;
+ my $f = File::Spec->catfile($d, "unicore", @path);
+ return $rfh if open($rfh, '<', $f);
}
- return $f;
+ croak __PACKAGE__, ": failed to find ",
+ File::Spec->catfile("unicore", @path), " in @INC";
}
sub _dclone ($) { # Use Storable::dclone if available; otherwise emulate it.
the script I<code> belongs to.
The L</prop_value_aliases()> function can be used to get all the synonyms
-of the script name.
+of the script name. Note that this is the older "Script" property value, and
+not the improved "Script_Extensions" value.
See L</Blocks versus Scripts>.
my %UNICODE_1_NAMES;
my %ISO_COMMENT;
+# Eval'd so can run on versions earlier than the property is available in
+my $Hangul_Syllables_re = eval 'qr/\p{Block=Hangul_Syllables}/';
+
sub charinfo {
# This function has traditionally mimicked what is in UnicodeData.txt,
# "Canonical" imply a compatible decomposition, and the type is prefixed
# to that, as it is in UnicodeData.txt
UnicodeVersion() unless defined $v_unicode_version;
- if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) {
+ if ($v_unicode_version ge v2.0.0 && $char =~ $Hangul_Syllables_re) {
# The code points of the decomposition are output in standard Unicode
# hex format, separated by blanks.
$prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
=cut
-sub charprop ($$) {
- my ($input_cp, $prop) = @_;
+sub charprop ($$;$) {
+ my ($input_cp, $prop, $internal_ok) = @_;
my $cp = _getcode($input_cp);
croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
my ($list_ref, $map_ref, $format, $default)
- = prop_invmap($prop);
+ = prop_invmap($prop, $internal_ok);
return undef unless defined $list_ref;
my $i = search_invlist($list_ref, $cp);
}
else {
croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'. Please perlbug this";
- return undef;
}
}
push @BLOCKS, $subrange;
push @{$BLOCKS{'No_Block'}}, $subrange;
}
- elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+ else {
+ my $blocksfh = openunicode("Blocks.txt");
local $_;
local $/ = "\n";
- while (<$BLOCKSFH>) {
+ while (<$blocksfh>) {
+
+ # Old versions used a different syntax to mark the range.
+ $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0;
+
if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
my ($lo, $hi) = (hex($1), hex($2));
my $subrange = [ $lo, $hi, $3 ];
push @{$BLOCKS{$3}}, $subrange;
}
}
- close($BLOCKSFH);
if (! IS_ASCII_PLATFORM) {
# The first two blocks, through 0xFF, are wrong on EBCDIC
# platforms.
elsif (exists $BLOCKS{$arg}) {
return _dclone $BLOCKS{$arg};
}
+
+ carp __PACKAGE__, "::charblock: unknown code '$arg'";
+ return;
}
=head2 B<charscript()>
The L</prop_value_aliases()> function can be used to get all the synonyms
of the script name.
+Note that the Script_Extensions property is an improved version of the Script
+property, and you should probably be using that instead, with the
+L</charprop()> function.
+
If supplied with an argument that can't be a code point, charscript() tries
to do the opposite and interpret the argument as a script name. The
return value is a I<range set>: an anonymous array of arrays that contain
return _dclone $SCRIPTS{$arg};
}
+ carp __PACKAGE__, "::charscript: unknown code '$arg'";
return;
}
the values.
L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
-different type of data structure.
+different type of data structure. Since the Script_Extensions property is an
+improved version of the Script property, you should instead use
+L<prop_invmap("scx")|/prop_invmap()>.
L<C<prop_values("Script")>|/prop_values()> can be used to get all
the known script names as a list, without the code point ranges.
=head2 B<compexcl()>
+WARNING: Unicode discourages the use of this function or any of the
+alternative mechanisms listed in this section (the documentation of
+C<compexcl()>), except internally in implementations of the Unicode
+Normalization Algorithm. You should be using L<Unicode::Normalize> directly
+instead of these. Using these will likely lead to half-baked results.
+
use Unicode::UCD 'compexcl';
my $compexcl = compexcl(0x09dc);
=cut
+# Eval'd so can run on versions earlier than the property is available in
+my $Composition_Exclusion_re = eval 'qr/\p{Composition_Exclusion}/';
+
sub compexcl {
my $arg = shift;
my $code = _getcode($arg);
return if $v_unicode_version lt v3.0.0;
no warnings "non_unicode"; # So works on non-Unicode code points
- return chr($code) =~ /\p{Composition_Exclusion}/;
+ return chr($code) =~ $Composition_Exclusion_re
}
=head2 B<casefold()>
sub _casespec {
unless (%CASESPEC) {
UnicodeVersion() unless defined $v_unicode_version;
- if ($v_unicode_version lt v2.1.8) {
- %CASESPEC = {};
- }
- elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+ if ($v_unicode_version ge v2.1.8) {
+ my $casespecfh = openunicode("SpecialCasing.txt");
local $_;
local $/ = "\n";
- while (<$CASESPECFH>) {
+ while (<$casespecfh>) {
if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
my ($hexcode, $lower, $title, $upper, $condition) =
}
}
}
- close($CASESPECFH);
}
}
}
sub _namedseq {
unless (%NAMEDSEQ) {
- if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
- local $_;
- local $/ = "\n";
- while (<$NAMEDSEQFH>) {
- if (/^ [0-9A-F]+ \ /x) {
- chomp;
- my ($sequence, $name) = split /\t/;
- my @s = map { chr(hex($_)) } split(' ', $sequence);
- $NAMEDSEQ{$name} = join("", @s);
- }
- }
- close($NAMEDSEQFH);
- }
+ my $namedseqfh = openunicode("Name.pl");
+ local $_;
+ local $/ = "\n";
+ while (<$namedseqfh>) {
+ if (/^ [0-9A-F]+ \ /x) {
+ chomp;
+ my ($sequence, $name) = split /\t/;
+ my @s = map { chr(hex($_)) } split(' ', $sequence);
+ $NAMEDSEQ{$name} = join("", @s);
+ }
+ }
}
}
my $val = num("123");
my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
+ my $val = num("12a", \$valid_length); # $valid_length contains 2
C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
doesn't think the entire string has a completely valid, safe numeric value.
+If called with an optional second parameter, a reference to a scalar, C<num()>
+will set the scalar to the length of any valid initial substring; or to 0 if none.
If the string is just one character in length, the Unicode numeric value
-is returned if it has one, or C<undef> otherwise. Note that this need
-not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
-example returns -0.5.
+is returned if it has one, or C<undef> otherwise. If the optional scalar ref
+is passed, it would be set to 1 if the return is valid; or 0 if the return is
+C<undef>. Note that the numeric value returned need not be a whole number.
+C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
=cut
the same form. A half-width digit mixed with a full-width one will
return C<undef>. The Arabic script has two sets of digits; C<num> will
return C<undef> unless all the digits in the string come from the same
-set.
+set. In all cases, the optional scalar ref parameter is set to how
+long any valid initial substring of digits is; hence it will be set to the
+entire string length if the main return value is not C<undef>.
C<num> errs on the side of safety, and there may be valid strings of
decimal digits that it doesn't recognize. Note that Unicode defines
# consider those, and return the <decomposition> type in the second
# array element.
-sub num {
- my $string = $_[0];
+sub num ($;$) {
+ my ($string, $retlen_ref) = @_;
+
+ use feature 'unicode_strings';
_numeric unless %NUMERIC;
+ $$retlen_ref = 0 if $retlen_ref; # Assume will fail
+
+ my $length = length $string;
+ return if $length == 0;
- my $length = length($string);
- return $NUMERIC{ord($string)} if $length == 1;
- return if $string =~ /\D/;
my $first_ord = ord(substr($string, 0, 1));
+ return if ! exists $NUMERIC{$first_ord}
+ || ! defined $NUMERIC{$first_ord};
+
+ # Here, we know the first character is numeric
my $value = $NUMERIC{$first_ord};
+ $$retlen_ref = 1 if $retlen_ref; # Assume only this one is numeric
+
+ return $value if $length == 1;
+
+ # Here, the input is longer than a single character. To be valid, it must
+ # be entirely decimal digits, which means it must start with one.
+ return if $string =~ / ^ \D /x;
# To be a valid decimal number, it should be in a block of 10 consecutive
# characters, whose values are 0, 1, 2, ... 9. Therefore this digit's
# release, we verify that this first character is a member of such a
# block. That is, that the block of characters surrounding this one
# consists of all \d characters whose numeric values are the expected
- # ones.
+ # ones. If not, then this single character is numeric, but the string as
+ # a whole is not considered to be.
UnicodeVersion() unless defined $v_unicode_version;
if ($v_unicode_version lt v6.0.0) {
for my $i (0 .. 9) {
# function.
my $ord = ord(substr($string, $i, 1));
my $digit = $ord - $zero_ord;
- return unless $digit >= 0 && $digit <= 9;
+ if ($digit < 0 || $digit > 9) {
+ $$retlen_ref = $i if $retlen_ref;
+ return;
+ }
$value = $value * 10 + $digit;
}
+ $$retlen_ref = $length if $retlen_ref;
return $value;
}
return ( $list_ref->[0], $list_ref->[0] );
}
-# All 1 bits is the largest possible UV.
-$Unicode::UCD::MAX_CP = ~0;
+# All 1 bits but the top one is the largest possible IV.
+$Unicode::UCD::MAX_CP = (~0) >> 1;
=pod
are also specified in L<perluniprops|perluniprops/Properties accessible
through \p{} and \P{}>. Examples of using the "property=value" form are:
- say join ", ", prop_invlist("Script=Shavian");
+ say join ", ", prop_invlist("Script_Extensions=Shavian");
prints:
66640, 66688
for (my $i = 0; $i < @invlist; $i += 2) {
my $upper = ($i + 1) < @invlist
? $invlist[$i+1] - 1 # In range
- : $Unicode::UCD::MAX_CP; # To infinity. You may want
- # to stop much much earlier;
- # going this high may expose
- # perl deficiencies with very
- # large numbers.
+ : $Unicode::UCD::MAX_CP; # To infinity.
for my $j ($invlist[$i] .. $upper) {
push @full_list, $j;
}
Many Unicode properties have more than one name (or alias). C<prop_invmap>
understands all of these, including Perl extensions to them. Ambiguities are
-resolved as described above for L</prop_aliases()>. The Perl internal
-property "Perl_Decimal_Digit, described below, is also accepted. An empty
-list is returned if the property name is unknown.
+resolved as described above for L</prop_aliases()> (except if a property has
+both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the
+property name prefixed by C<"is"> causes the binary one to be returned). The
+Perl internal property "Perl_Decimal_Digit, described below, is also accepted.
+An empty list is returned if the property name is unknown.
See L<perluniprops/Properties accessible through Unicode::UCD> for the
properties acceptable as inputs to this function.
Note that the mapping is the one that is specified in the Unicode data files,
and to get the final decomposition, it may need to be applied recursively.
+Unicode in fact discourages use of this property except internally in
+implementations of the Unicode Normalization Algorithm.
The fourth (index [3]) element (C<$default>) in the list returned for this
format is 0.
The returned values for the Perl extension properties, such as C<Any> and
C<Greek> are somewhat misleading. The values are either C<"Y"> or C<"N>".
All Unicode properties are bipartite, so you can actually use the C<"Y"> or
-C<"N>" in a Perl regular rexpression for these, like C<qr/\p{ID_Start=Y/}> or
+C<"N>" in a Perl regular expression for these, like C<qr/\p{ID_Start=Y/}> or
C<qr/\p{Upper=N/}>. But the Perl extensions aren't specified this way, only
like C</qr/\p{Any}>, I<etc>. You can't actually use the C<"Y"> and C<"N>" in
them.
+=head3 Getting every available name
+
+Instead of reading the Unicode Database directly from files, as you were able
+to do for a long time, you are encouraged to use the supplied functions. So,
+instead of reading C<Name.pl> - which may disappear without notice in the
+future - directly, as with
+
+ my (%name, %cp);
+ for (split m/\s*\n/ => do "unicore/Name.pl") {
+ my ($cp, $name) = split m/\t/ => $_;
+ $cp{$name} = $cp;
+ $name{$cp} = $name unless $cp =~ m/ /;
+ }
+
+You ought to use L</prop_invmap()> like this:
+
+ my (%name, %cp, %cps, $n);
+ # All codepoints
+ foreach my $cat (qw( Name Name_Alias )) {
+ my ($codepoints, $names, $format, $default) = prop_invmap($cat);
+ # $format => "n", $default => ""
+ foreach my $i (0 .. @$codepoints - 2) {
+ my ($cp, $n) = ($codepoints->[$i], $names->[$i]);
+ # If $n is a ref, the same codepoint has multiple names
+ foreach my $name (ref $n ? @$n : $n) {
+ $name{$cp} //= $name;
+ $cp{$name} //= $cp;
+ }
+ }
+ }
+ # Named sequences
+ { my %ns = namedseq();
+ foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) {
+ $cp{$name} //= [ map { ord } split "" => $ns{$name} ];
+ }
+ }
+
=cut
# User-defined properties could be handled with some changes to utf8_heavy.pl;
# we need to also read in that table. Create a hash with the keys
# being the code points, and the values being a list of the
# aliases for the code point key.
- my ($aliases_code_points, $aliases_maps, undef, undef) =
- &prop_invmap('Name_Alias');
+ my ($aliases_code_points, $aliases_maps, undef, undef)
+ = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok');
my %aliases;
for (my $i = 0; $i < @$aliases_code_points; $i++) {
my $code_point = $aliases_code_points->[$i];
if ($swash->{'LIST'} =~ /^V/) {
@invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
- shift @invlist;
+
+ shift @invlist; # Get rid of 'V';
+
+ # Could need to be inverted: add or subtract a 0 at the beginning of
+ # the list.
+ if ($swash->{'INVERT_IT'}) {
+ if (@invlist && $invlist[0] == 0) {
+ shift @invlist;
+ }
+ else {
+ unshift @invlist, 0;
+ }
+ }
foreach my $i (0 .. @invlist - 1) {
$invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
}
}
}
else {
+ if ($swash->{'INVERT_IT'}) {
+ croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted";
+ }
+
# The LIST input lines look like:
# ...
# 0374\t\tCommon
map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap;
$format = 'sl';
}
- elsif ($returned_prop eq 'ToNameAlias') {
+ elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) {
# This property currently doesn't have any lists, but theoretically
# could
# to indicate that need to add code point to it.
$format = 'ar';
}
- elsif ($format ne 'n' && $format ne 'a') {
+ elsif ($format eq 'ax') {
+
+ # Normally 'ax' properties have overrides, and will have been handled
+ # above, but if not, they still need adjustment, and the hex values
+ # have already been converted to decimal
+ $format = 'a';
+ }
+ elsif ($format ne 'n' && $format !~ / ^ a /x) {
# All others are simple scalars
$format = 's';
sub UnicodeVersion {
unless (defined $UNICODEVERSION) {
- openunicode(\$VERSIONFH, "version");
+ my $versionfh = openunicode("version");
local $/ = "\n";
- chomp($UNICODEVERSION = <$VERSIONFH>);
- close($VERSIONFH);
+ chomp($UNICODEVERSION = <$versionfh>);
croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
}
Note that starting in Unicode 6.1, many of the block names have shorter
synonyms. These are always given in the new style.
+=head2 Use with older Unicode versions
+
+The functions in this module work as well as can be expected when
+used on earlier Unicode versions. But, obviously, they use the available data
+from that Unicode version. For example, if the Unicode version predates the
+definition of the script property (Unicode 3.1), then any function that deals
+with scripts is going to return C<undef> for the script portion of the return
+value.
+
=head1 AUTHOR
Jarkko Hietaniemi. Now maintained by perl5 porters.