no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.61';
+our $VERSION = '0.70';
require Exporter;
charblock charscript
charblocks charscripts
charinrange
+ charprop
+ charprops_all
general_categories bidi_types
compexcl
casefold all_casefolds casespec
use Unicode::UCD 'charinfo';
my $charinfo = charinfo($codepoint);
+ use Unicode::UCD 'charprop';
+ my $value = charprop($codepoint, $property);
+
+ use Unicode::UCD 'charprops_all';
+ my $all_values_hash_ref = charprops_all($codepoint);
+
use Unicode::UCD 'casefold';
my $casefold = casefold($codepoint);
use Unicode::UCD 'search_invlist';
my $index = search_invlist(\@invlist, $code_point);
+ # The following function should be used only internally in
+ # implementations of the Unicode Normalization Algorithm, and there
+ # are better choices than it.
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
223 # Decimal 223 in native character set
0223 # Hexadecimal 223, native (= 547 decimal)
- 0xDF # Hexadecimal DF, native (= 223 decimal
+ 0xDF # Hexadecimal DF, native (= 223 decimal)
+ '0xDF' # String form of hexadecimal (= 223 decimal)
'U+DF' # Hexadecimal DF, in Unicode's character set
(= LATIN SMALL LETTER SHARP S)
=cut
-my $BLOCKSFH;
-my $VERSIONFH;
-my $CASEFOLDFH;
-my $CASESPECFH;
-my $NAMEDSEQFH;
my $v_unicode_version; # v-string.
sub openunicode {
- my ($rfh, @path) = @_;
- my $f;
- unless (defined $$rfh) {
- for my $d (@INC) {
- use File::Spec;
- $f = File::Spec->catfile($d, "unicore", @path);
- last if open($$rfh, $f);
- undef $f;
- }
- croak __PACKAGE__, ": failed to find ",
- File::Spec->catfile(@path), " in @INC"
- unless defined $f;
+ my (@path) = @_;
+ my $rfh;
+ for my $d (@INC) {
+ use File::Spec;
+ my $f = File::Spec->catfile($d, "unicore", @path);
+ return $rfh if open($rfh, '<', $f);
}
- return $f;
+ croak __PACKAGE__, ": failed to find ",
+ File::Spec->catfile("unicore", @path), " in @INC";
}
sub _dclone ($) { # Use Storable::dclone if available; otherwise emulate it.
Fields that aren't applicable to the particular code point argument exist in the
returned hash, and are empty.
+For results that are less "raw" than this function returns, or to get the values for
+any property, not just the few covered by this function, use the
+L</charprop()> function.
+
The keys in the hash with the meanings of their values are:
=over
hexdigits. This indicates that the full uppercase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple uppercase mapping is
-I<code> itself; you'll need some other means, (like
+I<code> itself; you'll need some other means, (like L</charprop()> or
L</casespec()> to get the full mapping.
=item B<lower>
hexdigits. This indicates that the full lowercase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple lowercase mapping is
-I<code> itself; you'll need some other means, (like
+I<code> itself; you'll need some other means, (like L</charprop()> or
L</casespec()> to get the full mapping.
=item B<title>
hexdigits. This indicates that the full titlecase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple titlecase mapping is
-I<code> itself; you'll need some other means, (like
+I<code> itself; you'll need some other means, (like L</charprop()> or
L</casespec()> to get the full mapping.
=item B<block>
the script I<code> belongs to.
The L</prop_value_aliases()> function can be used to get all the synonyms
-of the script name.
+of the script name. Note that this is the older "Script" property value, and
+not the improved "Script_Extensions" value.
See L</Blocks versus Scripts>.
my %UNICODE_1_NAMES;
my %ISO_COMMENT;
+# Eval'd so can run on versions earlier than the property is available in
+my $Hangul_Syllables_re = eval 'qr/\p{Block=Hangul_Syllables}/';
+
sub charinfo {
# This function has traditionally mimicked what is in UnicodeData.txt,
# "Canonical" imply a compatible decomposition, and the type is prefixed
# to that, as it is in UnicodeData.txt
UnicodeVersion() unless defined $v_unicode_version;
- if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) {
+ if ($v_unicode_version ge v2.0.0 && $char =~ $Hangul_Syllables_re) {
# The code points of the decomposition are output in standard Unicode
# hex format, separated by blanks.
$prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
_search($range, 0, $#$range, $code);
}
+=head2 B<charprop()>
+
+ use Unicode::UCD 'charprop';
+
+ print charprop(0x41, "Gc"), "\n";
+ print charprop(0x61, "General_Category"), "\n";
+
+ prints
+ Lu
+ Ll
+
+This returns the value of the Unicode property given by the second parameter
+for the L</code point argument> given by the first.
+
+The passed-in property may be specified as any of the synonyms returned by
+L</prop_aliases()>.
+
+The return value is always a scalar, either a string or a number. For
+properties where there are synonyms for the values, the synonym returned by
+this function is the longest, most descriptive form, the one returned by
+L</prop_value_aliases()> when called in a scalar context. Of course, you can
+call L</prop_value_aliases()> on the result to get other synonyms.
+
+The return values are more "cooked" than the L</charinfo()> ones. For
+example, the C<"uc"> property value is the actual string containing the full
+uppercase mapping of the input code point. You have to go to extra trouble
+with C<charinfo> to get this value from its C<upper> hash element when the
+full mapping differs from the simple one.
+
+Special note should be made of the return values for a few properties:
+
+=over
+
+=item Block
+
+The value returned is the new-style (see L</Old-style versus new-style block
+names>).
+
+=item Decomposition_Mapping
+
+Like L</charinfo()>, the result may be an intermediate decomposition whose
+components are also decomposable. Use L<Unicode::Normalize> to get the final
+decomposition in one step.
+
+Unlike L</charinfo()>, this does not include the decomposition type. Use the
+C<Decomposition_Type> property to get that.
+
+=item Name_Alias
+
+If the input code point's name has more than one synonym, they are returned
+joined into a single comma-separated string.
+
+=item Numeric_Value
+
+If the result is a fraction, it is converted into a floating point number to
+the accuracy of your platform.
+
+=item Script_Extensions
+
+If the result is multiple script names, they are returned joined into a single
+comma-separated string.
+
+=back
+
+When called with a property that is a Perl extension that isn't expressible in
+a compound form, this function currently returns C<undef>, as the only two
+possible values are I<true> or I<false> (1 or 0 I suppose). This behavior may
+change in the future, so don't write code that relies on it. C<Present_In> is
+a Perl extension that is expressible in a bipartite or compound form (for
+example, C<\p{Present_In=4.0}>), so C<charprop> accepts it. But C<Any> is a
+Perl extension that isn't expressible that way, so C<charprop> returns
+C<undef> for it. Also C<charprop> returns C<undef> for all Perl extensions
+that are internal-only.
+
+=cut
+
+sub charprop ($$;$) {
+ my ($input_cp, $prop, $internal_ok) = @_;
+
+ my $cp = _getcode($input_cp);
+ croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
+
+ my ($list_ref, $map_ref, $format, $default)
+ = prop_invmap($prop, $internal_ok);
+ return undef unless defined $list_ref;
+
+ my $i = search_invlist($list_ref, $cp);
+ croak __PACKAGE__, "::charprop: prop_invmap return is invalid for charprop('$input_cp', '$prop)" unless defined $i;
+
+ # $i is the index into both the inversion list and map of $cp.
+ my $map = $map_ref->[$i];
+
+ # Convert enumeration values to their most complete form.
+ if (! ref $map) {
+ my $long_form = prop_value_aliases($prop, $map);
+ $map = $long_form if defined $long_form;
+ }
+
+ if ($format =~ / ^ s /x) { # Scalars
+ return join ",", @$map if ref $map; # Convert to scalar with comma
+ # separated array elements
+
+ # Resolve ambiguity as to whether an all digit value is a code point
+ # that should be converted to a character, or whether it is really
+ # just a number. To do this, look at the default. If it is a
+ # non-empty number, we can safely assume the result is also a number.
+ if ($map =~ / ^ \d+ $ /ax && $default !~ / ^ \d+ $ /ax) {
+ $map = chr $map;
+ }
+ elsif ($map =~ / ^ (?: Y | N ) $ /x) {
+
+ # prop_invmap() returns these values for properties that are Perl
+ # extensions. But this is misleading. For now, return undef for
+ # these, as currently documented.
+ undef $map unless
+ exists $Unicode::UCD::prop_aliases{utf8::_loose_name(lc $prop)};
+ }
+ return $map;
+ }
+ elsif ($format eq 'ar') { # numbers, including rationals
+ my $offset = $cp - $list_ref->[$i];
+ return $map if $map =~ /nan/i;
+ return $map + $offset if $offset != 0; # If needs adjustment
+ return eval $map; # Convert e.g., 1/2 to 0.5
+ }
+ elsif ($format =~ /^a/) { # Some entries need adjusting
+
+ # Linearize sequences into a string.
+ return join "", map { chr $_ } @$map if ref $map; # XXX && $format =~ /^ a [dl] /x;
+
+ return "" if $map eq "" && $format =~ /^a.*e/;
+
+ # These are all character mappings. Return the chr if no adjustment
+ # is needed
+ return chr $cp if $map eq "0";
+
+ # Convert special entry.
+ if ($map eq '<hangul syllable>' && $format eq 'ad') {
+ use Unicode::Normalize qw(NFD);
+ return NFD(chr $cp);
+ }
+
+ # The rest need adjustment from the first entry in the inversion list
+ # corresponding to this map.
+ my $offset = $cp - $list_ref->[$i];
+ return chr($map + $cp - $list_ref->[$i]);
+ }
+ elsif ($format eq 'n') { # The name property
+
+ # There are two special cases, handled here.
+ if ($map =~ / ( .+ ) <code\ point> $ /x) {
+ $map = sprintf("$1%04X", $cp);
+ }
+ elsif ($map eq '<hangul syllable>') {
+ $map = charnames::viacode($cp);
+ }
+ return $map;
+ }
+ else {
+ croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'. Please perlbug this";
+ }
+}
+
+=head2 B<charprops_all()>
+
+ use Unicode::UCD 'charprops_all';
+
+ my $%properties_of_A_hash_ref = charprops_all("U+41");
+
+This returns a reference to a hash whose keys are all the distinct Unicode (no
+Perl extension) properties, and whose values are the respective values for
+those properties for the input L</code point argument>.
+
+Each key is the property name in its longest, most descriptive form. The
+values are what L</charprop()> would return.
+
+This function is expensive in time and memory.
+
+=cut
+
+sub charprops_all($) {
+ my $input_cp = shift;
+
+ my $cp = _getcode($input_cp);
+ croak __PACKAGE__, "::charprops_all: unknown code point '$input_cp'" unless defined $cp;
+
+ my %return;
+
+ require "unicore/UCD.pl";
+
+ foreach my $prop (keys %Unicode::UCD::prop_aliases) {
+
+ # Don't return a Perl extension. (This is the only one that
+ # %prop_aliases has in it.)
+ next if $prop eq 'perldecimaldigit';
+
+ # Use long name for $prop in the hash
+ $return{scalar prop_aliases($prop)} = charprop($cp, $prop);
+ }
+
+ return \%return;
+}
+
=head2 B<charblock()>
use Unicode::UCD 'charblock';
push @BLOCKS, $subrange;
push @{$BLOCKS{'No_Block'}}, $subrange;
}
- elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+ else {
+ my $blocksfh = openunicode("Blocks.txt");
local $_;
local $/ = "\n";
- while (<$BLOCKSFH>) {
+ while (<$blocksfh>) {
+
+ # Old versions used a different syntax to mark the range.
+ $_ =~ s/;\s+/../ if $v_unicode_version lt v3.1.0;
+
if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
my ($lo, $hi) = (hex($1), hex($2));
my $subrange = [ $lo, $hi, $3 ];
push @{$BLOCKS{$3}}, $subrange;
}
}
- close($BLOCKSFH);
if (! IS_ASCII_PLATFORM) {
# The first two blocks, through 0xFF, are wrong on EBCDIC
# platforms.
elsif (exists $BLOCKS{$arg}) {
return _dclone $BLOCKS{$arg};
}
+
+ carp __PACKAGE__, "::charblock: unknown code '$arg'";
+ return;
}
=head2 B<charscript()>
The L</prop_value_aliases()> function can be used to get all the synonyms
of the script name.
+Note that the Script_Extensions property is an improved version of the Script
+property, and you should probably be using that instead, with the
+L</charprop()> function.
+
If supplied with an argument that can't be a code point, charscript() tries
to do the opposite and interpret the argument as a script name. The
return value is a I<range set>: an anonymous array of arrays that contain
return _dclone $SCRIPTS{$arg};
}
+ carp __PACKAGE__, "::charscript: unknown code '$arg'";
return;
}
the values.
L<prop_invmap("script")|/prop_invmap()> can be used to get this same data in a
-different type of data structure.
+different type of data structure. Since the Script_Extensions property is an
+improved version of the Script property, you should instead use
+L<prop_invmap("scx")|/prop_invmap()>.
L<C<prop_values("Script")>|/prop_values()> can be used to get all
the known script names as a list, without the code point ranges.
=head2 B<compexcl()>
+WARNING: Unicode discourages the use of this function or any of the
+alternative mechanisms listed in this section (the documentation of
+C<compexcl()>), except internally in implementations of the Unicode
+Normalization Algorithm. You should be using L<Unicode::Normalize> directly
+instead of these. Using these will likely lead to half-baked results.
+
use Unicode::UCD 'compexcl';
my $compexcl = compexcl(0x09dc);
=cut
+# Eval'd so can run on versions earlier than the property is available in
+my $Composition_Exclusion_re = eval 'qr/\p{Composition_Exclusion}/';
+
sub compexcl {
my $arg = shift;
my $code = _getcode($arg);
return if $v_unicode_version lt v3.0.0;
no warnings "non_unicode"; # So works on non-Unicode code points
- return chr($code) =~ /\p{Composition_Exclusion}/;
+ return chr($code) =~ $Composition_Exclusion_re
}
=head2 B<casefold()>
sub _casespec {
unless (%CASESPEC) {
UnicodeVersion() unless defined $v_unicode_version;
- if ($v_unicode_version lt v2.1.8) {
- %CASESPEC = {};
- }
- elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
+ if ($v_unicode_version ge v2.1.8) {
+ my $casespecfh = openunicode("SpecialCasing.txt");
local $_;
local $/ = "\n";
- while (<$CASESPECFH>) {
+ while (<$casespecfh>) {
if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
my ($hexcode, $lower, $title, $upper, $condition) =
}
}
}
- close($CASESPECFH);
}
}
}
sub _namedseq {
unless (%NAMEDSEQ) {
- if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
- local $_;
- local $/ = "\n";
- while (<$NAMEDSEQFH>) {
- if (/^ [0-9A-F]+ \ /x) {
- chomp;
- my ($sequence, $name) = split /\t/;
- my @s = map { chr(hex($_)) } split(' ', $sequence);
- $NAMEDSEQ{$name} = join("", @s);
- }
- }
- close($NAMEDSEQFH);
- }
+ my $namedseqfh = openunicode("Name.pl");
+ local $_;
+ local $/ = "\n";
+ while (<$namedseqfh>) {
+ if (/^ [0-9A-F]+ \ /x) {
+ chomp;
+ my ($sequence, $name) = split /\t/;
+ my @s = map { chr(hex($_)) } split(' ', $sequence);
+ $NAMEDSEQ{$name} = join("", @s);
+ }
+ }
}
}
my $val = num("123");
my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
+ my $val = num("12a", \$valid_length); # $valid_length contains 2
C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
doesn't think the entire string has a completely valid, safe numeric value.
+If called with an optional second parameter, a reference to a scalar, C<num()>
+will set the scalar to the length of any valid initial substring; or to 0 if none.
If the string is just one character in length, the Unicode numeric value
-is returned if it has one, or C<undef> otherwise. Note that this need
-not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
-example returns -0.5.
+is returned if it has one, or C<undef> otherwise. If the optional scalar ref
+is passed, it would be set to 1 if the return is valid; or 0 if the return is
+C<undef>. Note that the numeric value returned need not be a whole number.
+C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
=cut
the same form. A half-width digit mixed with a full-width one will
return C<undef>. The Arabic script has two sets of digits; C<num> will
return C<undef> unless all the digits in the string come from the same
-set.
+set. In all cases, the optional scalar ref parameter is set to how
+long any valid initial substring of digits is; hence it will be set to the
+entire string length if the main return value is not C<undef>.
C<num> errs on the side of safety, and there may be valid strings of
decimal digits that it doesn't recognize. Note that Unicode defines
# consider those, and return the <decomposition> type in the second
# array element.
-sub num {
- my $string = $_[0];
+sub num ($;$) {
+ my ($string, $retlen_ref) = @_;
+
+ use feature 'unicode_strings';
_numeric unless %NUMERIC;
+ $$retlen_ref = 0 if $retlen_ref; # Assume will fail
+
+ my $length = length $string;
+ return if $length == 0;
- my $length = length($string);
- return $NUMERIC{ord($string)} if $length == 1;
- return if $string =~ /\D/;
my $first_ord = ord(substr($string, 0, 1));
+ return if ! exists $NUMERIC{$first_ord}
+ || ! defined $NUMERIC{$first_ord};
+
+ # Here, we know the first character is numeric
my $value = $NUMERIC{$first_ord};
+ $$retlen_ref = 1 if $retlen_ref; # Assume only this one is numeric
+
+ return $value if $length == 1;
+
+ # Here, the input is longer than a single character. To be valid, it must
+ # be entirely decimal digits, which means it must start with one.
+ return if $string =~ / ^ \D /x;
# To be a valid decimal number, it should be in a block of 10 consecutive
# characters, whose values are 0, 1, 2, ... 9. Therefore this digit's
# release, we verify that this first character is a member of such a
# block. That is, that the block of characters surrounding this one
# consists of all \d characters whose numeric values are the expected
- # ones.
+ # ones. If not, then this single character is numeric, but the string as
+ # a whole is not considered to be.
UnicodeVersion() unless defined $v_unicode_version;
if ($v_unicode_version lt v6.0.0) {
for my $i (0 .. 9) {
# function.
my $ord = ord(substr($string, $i, 1));
my $digit = $ord - $zero_ord;
- return unless $digit >= 0 && $digit <= 9;
+ if ($digit < 0 || $digit > 9) {
+ $$retlen_ref = $i if $retlen_ref;
+ return;
+ }
$value = $value * 10 + $digit;
}
+ $$retlen_ref = $length if $retlen_ref;
return $value;
}
return ( $list_ref->[0], $list_ref->[0] );
}
-# All 1 bits is the largest possible UV.
-$Unicode::UCD::MAX_CP = ~0;
+# All 1 bits but the top one is the largest possible IV.
+$Unicode::UCD::MAX_CP = (~0) >> 1;
=pod
are also specified in L<perluniprops|perluniprops/Properties accessible
through \p{} and \P{}>. Examples of using the "property=value" form are:
- say join ", ", prop_invlist("Script=Shavian");
+ say join ", ", prop_invlist("Script_Extensions=Shavian");
prints:
66640, 66688
for (my $i = 0; $i < @invlist; $i += 2) {
my $upper = ($i + 1) < @invlist
? $invlist[$i+1] - 1 # In range
- : $Unicode::UCD::MAX_CP; # To infinity. You may want
- # to stop much much earlier;
- # going this high may expose
- # perl deficiencies with very
- # large numbers.
+ : $Unicode::UCD::MAX_CP; # To infinity.
for my $j ($invlist[$i] .. $upper) {
push @full_list, $j;
}
Many Unicode properties have more than one name (or alias). C<prop_invmap>
understands all of these, including Perl extensions to them. Ambiguities are
-resolved as described above for L</prop_aliases()>. The Perl internal
-property "Perl_Decimal_Digit, described below, is also accepted. An empty
-list is returned if the property name is unknown.
+resolved as described above for L</prop_aliases()> (except if a property has
+both a complete mapping, and a binary C<Y>/C<N> mapping, then specifying the
+property name prefixed by C<"is"> causes the binary one to be returned). The
+Perl internal property "Perl_Decimal_Digit, described below, is also accepted.
+An empty list is returned if the property name is unknown.
See L<perluniprops/Properties accessible through Unicode::UCD> for the
properties acceptable as inputs to this function.
Note that the mapping is the one that is specified in the Unicode data files,
and to get the final decomposition, it may need to be applied recursively.
+Unicode in fact discourages use of this property except internally in
+implementations of the Unicode Normalization Algorithm.
The fourth (index [3]) element (C<$default>) in the list returned for this
format is 0.
The returned values for the Perl extension properties, such as C<Any> and
C<Greek> are somewhat misleading. The values are either C<"Y"> or C<"N>".
All Unicode properties are bipartite, so you can actually use the C<"Y"> or
-C<"N>" in a Perl regular rexpression for these, like C<qr/\p{ID_Start=Y/}> or
+C<"N>" in a Perl regular expression for these, like C<qr/\p{ID_Start=Y/}> or
C<qr/\p{Upper=N/}>. But the Perl extensions aren't specified this way, only
like C</qr/\p{Any}>, I<etc>. You can't actually use the C<"Y"> and C<"N>" in
them.
+=head3 Getting every available name
+
+Instead of reading the Unicode Database directly from files, as you were able
+to do for a long time, you are encouraged to use the supplied functions. So,
+instead of reading C<Name.pl> - which may disappear without notice in the
+future - directly, as with
+
+ my (%name, %cp);
+ for (split m/\s*\n/ => do "unicore/Name.pl") {
+ my ($cp, $name) = split m/\t/ => $_;
+ $cp{$name} = $cp;
+ $name{$cp} = $name unless $cp =~ m/ /;
+ }
+
+You ought to use L</prop_invmap()> like this:
+
+ my (%name, %cp, %cps, $n);
+ # All codepoints
+ foreach my $cat (qw( Name Name_Alias )) {
+ my ($codepoints, $names, $format, $default) = prop_invmap($cat);
+ # $format => "n", $default => ""
+ foreach my $i (0 .. @$codepoints - 2) {
+ my ($cp, $n) = ($codepoints->[$i], $names->[$i]);
+ # If $n is a ref, the same codepoint has multiple names
+ foreach my $name (ref $n ? @$n : $n) {
+ $name{$cp} //= $name;
+ $cp{$name} //= $cp;
+ }
+ }
+ }
+ # Named sequences
+ { my %ns = namedseq();
+ foreach my $name (sort { $ns{$a} cmp $ns{$b} } keys %ns) {
+ $cp{$name} //= [ map { ord } split "" => $ns{$name} ];
+ }
+ }
+
=cut
# User-defined properties could be handled with some changes to utf8_heavy.pl;
# we need to also read in that table. Create a hash with the keys
# being the code points, and the values being a list of the
# aliases for the code point key.
- my ($aliases_code_points, $aliases_maps, undef, undef) =
- &prop_invmap('Name_Alias');
+ my ($aliases_code_points, $aliases_maps, undef, undef)
+ = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok');
my %aliases;
for (my $i = 0; $i < @$aliases_code_points; $i++) {
my $code_point = $aliases_code_points->[$i];
if ($swash->{'LIST'} =~ /^V/) {
@invlist = split "\n", $swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr;
- shift @invlist;
+
+ shift @invlist; # Get rid of 'V';
+
+ # Could need to be inverted: add or subtract a 0 at the beginning of
+ # the list.
+ if ($swash->{'INVERT_IT'}) {
+ if (@invlist && $invlist[0] == 0) {
+ shift @invlist;
+ }
+ else {
+ unshift @invlist, 0;
+ }
+ }
foreach my $i (0 .. @invlist - 1) {
$invmap[$i] = ($i % 2 == 0) ? 'Y' : 'N'
}
}
}
else {
+ if ($swash->{'INVERT_IT'}) {
+ croak __PACKAGE__, ":prop_invmap: Don't know how to deal with inverted";
+ }
+
# The LIST input lines look like:
# ...
# 0374\t\tCommon
map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap;
$format = 'sl';
}
- elsif ($returned_prop eq 'ToNameAlias') {
+ elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) {
# This property currently doesn't have any lists, but theoretically
# could
# to indicate that need to add code point to it.
$format = 'ar';
}
- elsif ($format ne 'n' && $format ne 'a') {
+ elsif ($format eq 'ax') {
+
+ # Normally 'ax' properties have overrides, and will have been handled
+ # above, but if not, they still need adjustment, and the hex values
+ # have already been converted to decimal
+ $format = 'a';
+ }
+ elsif ($format ne 'n' && $format !~ / ^ a /x) {
# All others are simple scalars
$format = 's';
sub UnicodeVersion {
unless (defined $UNICODEVERSION) {
- openunicode(\$VERSIONFH, "version");
+ my $versionfh = openunicode("version");
local $/ = "\n";
- chomp($UNICODEVERSION = <$VERSIONFH>);
- close($VERSIONFH);
+ chomp($UNICODEVERSION = <$versionfh>);
croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
}
This newer style is consistent with the values of other Unicode properties.
To preserve backward compatibility, all the functions in Unicode::UCD that
-return block names (except one) return the old-style ones. That one function,
-L</prop_value_aliases()> can be used to convert from old-style to new-style:
+return block names (except as noted) return the old-style ones.
+L</prop_value_aliases()> returns the new-style and can be used to convert from
+old-style to new-style:
my $new_style = prop_values_aliases("block", $old_style);
Note that starting in Unicode 6.1, many of the block names have shorter
synonyms. These are always given in the new style.
+=head2 Use with older Unicode versions
+
+The functions in this module work as well as can be expected when
+used on earlier Unicode versions. But, obviously, they use the available data
+from that Unicode version. For example, if the Unicode version predates the
+definition of the script property (Unicode 3.1), then any function that deals
+with scripts is going to return C<undef> for the script portion of the return
+value.
+
=head1 AUTHOR
Jarkko Hietaniemi. Now maintained by perl5 porters.