charblock charscript
charblocks charscripts
charinrange
+ charprop
general_categories bidi_types
compexcl
casefold all_casefolds casespec
use Unicode::UCD 'charinfo';
my $charinfo = charinfo($codepoint);
+ use Unicode::UCD 'charprop';
+ my $value = charprop($codepoint, $property);
+
use Unicode::UCD 'casefold';
my $casefold = casefold($codepoint);
Fields that aren't applicable to the particular code point argument exist in the
returned hash, and are empty.
+For results that are less "raw" than this function returns, or to get the values for
+any property, not just the few covered by this function, use the
+L</charprop()> function.
+
The keys in the hash with the meanings of their values are:
=over
hexdigits. This indicates that the full uppercase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple uppercase mapping is
-I<code> itself; you'll need some other means, (like
+I<code> itself; you'll need some other means, (like L</charprop()> or
L</casespec()> to get the full mapping.
=item B<lower>
hexdigits. This indicates that the full lowercase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple lowercase mapping is
-I<code> itself; you'll need some other means, (like
+I<code> itself; you'll need some other means, (like L</charprop()> or
L</casespec()> to get the full mapping.
=item B<title>
hexdigits. This indicates that the full titlecase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple titlecase mapping is
-I<code> itself; you'll need some other means, (like
+I<code> itself; you'll need some other means, (like L</charprop()> or
L</casespec()> to get the full mapping.
=item B<block>
_search($range, 0, $#$range, $code);
}
+=head2 B<charprop()>
+
+ use Unicode::UCD 'charprop';
+
+ print charprop(0x41, "Gc"), "\n";
+ print charprop(0x61, "General_Category"), "\n";
+
+ prints
+ Lu
+ Ll
+
+This returns the value of the Unicode property given by the second parameter
+for the L</code point argument> given by the first.
+
+The passed-in property may be specified as any of the synonyms returned by
+L</prop_aliases()>.
+
+The return value is always a scalar, either a string or a number. For
+properties where there are synonyms for the values, the synonym returned by
+this function is the longest, most descriptive form, the one returned by
+L</prop_value_aliases()> when called in a scalar context. Of course, you can
+call L</prop_value_aliases()> on the result to get other synonyms.
+
+The return values are more "cooked" than the L</charinfo()> ones. For
+example, the C<"uc"> property value is the actual string containing the full
+uppercase mapping of the input code point. You have to go to extra trouble
+with C<charinfo> to get this value from its C<upper> hash element when the
+full mapping differs from the simple one.
+
+Special note should be made of the return values for a few properties:
+
+=over
+
+=item Block
+
+The value returned is the new-style (see L</Old-style versus new-style block
+names>).
+
+=item Decomposition_Mapping
+
+Like L</charinfo()>, the result may be an intermediate decomposition whose
+components are also decomposable. Use L<Unicode::Normalize> to get the final
+decomposition in one step.
+
+Unlike L</charinfo()>, this does not include the decomposition type. Use the
+C<Decomposition_Type> property to get that.
+
+=item Name_Alias
+
+If the input code point's name has more than one synonym, they are returned
+joined into a single comma-separated string.
+
+=item Numeric_Value
+
+If the result is a fraction, it is converted into a floating point number to
+the accuracy of your platform.
+
+=item Script_Extensions
+
+If the result is multiple script names, they are returned joined into a single
+comma-separated string.
+
+=back
+
+When called with a property that is a Perl extension that isn't expressible in
+a compound form, this function currently returns C<undef>, as the only two
+possible values are I<true> or I<false> (1 or 0 I suppose). This behavior may
+change in the future, so don't write code that relies on it. C<Present_In> is
+a Perl extension that is expressible in a bipartite or compound form (for
+example, C<\p{Present_In=4.0}>), so C<charprop> accepts it. But C<Any> is a
+Perl extension that isn't expressible that way, so C<charprop> returns
+C<undef> for it. Also C<charprop> returns C<undef> for all Perl extensions
+that are internal-only.
+
+=cut
+
+sub charprop ($$) {
+ my ($input_cp, $prop) = @_;
+
+ my $cp = _getcode($input_cp);
+ croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
+
+ my ($list_ref, $map_ref, $format, $default)
+ = prop_invmap($prop);
+ return undef unless defined $list_ref;
+
+ my $i = search_invlist($list_ref, $cp);
+ croak __PACKAGE__, "::charprop: prop_invmap return is invalid for charprop('$input_cp', '$prop)" unless defined $i;
+
+ # $i is the index into both the inversion list and map of $cp.
+ my $map = $map_ref->[$i];
+
+ # Convert enumeration values to their most complete form.
+ if (! ref $map) {
+ my $long_form = prop_value_aliases($prop, $map);
+ $map = $long_form if defined $long_form;
+ }
+
+ if ($format =~ / ^ s /x) { # Scalars
+ return join ",", @$map if ref $map; # Convert to scalar with comma
+ # separated array elements
+
+ # Resolve ambiguity as to whether an all digit value is a code point
+ # that should be converted to a character, or whether it is really
+ # just a number. To do this, look at the default. If it is a
+ # non-empty number, we can safely assume the result is also a number.
+ if ($map =~ / ^ \d+ $ /ax && $default !~ / ^ \d+ $ /ax) {
+ $map = chr $map;
+ }
+ elsif ($map =~ / ^ (?: Y | N ) $ /x) {
+
+ # prop_invmap() returns these values for properties that are Perl
+ # extensions. But this is misleading. For now, return undef for
+ # these, as currently documented.
+ undef $map unless
+ exists $Unicode::UCD::prop_aliases{utf8::_loose_name(lc $prop)};
+ }
+ return $map;
+ }
+ elsif ($format eq 'ar') { # numbers, including rationals
+ my $offset = $cp - $list_ref->[$i];
+ return $map if $map =~ /nan/i;
+ return $map + $offset if $offset != 0; # If needs adjustment
+ return eval $map; # Convert e.g., 1/2 to 0.5
+ }
+ elsif ($format =~ /^a/) { # Some entries need adjusting
+
+ # Linearize sequences into a string.
+ return join "", map { chr $_ } @$map if ref $map; # XXX && $format =~ /^ a [dl] /x;
+
+ return "" if $map eq "" && $format =~ /^a.*e/;
+
+ # These are all character mappings. Return the chr if no adjustment
+ # is needed
+ return chr $cp if $map eq "0";
+
+ # Convert special entry.
+ if ($map eq '<hangul syllable>' && $format eq 'ad') {
+ use Unicode::Normalize qw(NFD);
+ return NFD(chr $cp);
+ }
+
+ # The rest need adjustment from the first entry in the inversion list
+ # corresponding to this map.
+ my $offset = $cp - $list_ref->[$i];
+ return chr($map + $cp - $list_ref->[$i]);
+ }
+ elsif ($format eq 'n') { # The name property
+
+ # There are two special cases, handled here.
+ if ($map =~ / ( .+ ) <code\ point> $ /x) {
+ $map = sprintf("$1%04X", $cp);
+ }
+ elsif ($map eq '<hangul syllable>') {
+ $map = charnames::viacode($cp);
+ }
+ return $map;
+ }
+ else {
+ croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'. Please perlbug this";
+ return undef;
+ }
+}
+
=head2 B<charblock()>
use Unicode::UCD 'charblock';
This newer style is consistent with the values of other Unicode properties.
To preserve backward compatibility, all the functions in Unicode::UCD that
-return block names (except one) return the old-style ones. That one function,
-L</prop_value_aliases()> can be used to convert from old-style to new-style:
+return block names (except as noted) return the old-style ones.
+L</prop_value_aliases()> returns the new-style and can be used to convert from
+old-style to new-style:
my $new_style = prop_values_aliases("block", $old_style);
local $SIG{__WARN__} = sub { push @warnings, @_ };
use strict;
-use Unicode::UCD;
use Test::More;
-use Unicode::UCD 'charinfo';
+use Unicode::UCD qw(charinfo charprop);
my $input_record_separator = 7; # Make sure Unicode::UCD isn't affected by
$/ = $input_record_separator; # setting this.
my $charinfo;
is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef");
+is(charprop(0x110000, 'age'), "Unassigned", "Verify charprop(age) of non-unicode is Unassigned");
+is(charprop(0x110000, 'in'), "Unassigned", "Verify charprop(in), a bipartite Perl extension, works");
+is(charprop(0x110000, 'Any'), undef, "Verify charprop of non-bipartite Perl extension returns undef");
-$charinfo = charinfo(0); # Null is often problematic, so test it.
+my $cp = 0;
+$charinfo = charinfo($cp); # Null is often problematic, so test it.
-is($charinfo->{code}, "0000", "<control>");
+is($charinfo->{code}, "0000",
+ "Next tests are for charinfo and charprop; first NULL");
is($charinfo->{name}, "<control>");
+is(charprop($cp, "name"), "");
+
+# This gets a sl-type property returning a flattened list
+is(charprop($cp, "name_alias"), "NULL: control,NUL: abbreviation");
+
is($charinfo->{category}, "Cc");
+is(charprop($cp, "category"), "Control");
is($charinfo->{combining}, "0");
+is(charprop($cp, "ccc"), "Not_Reordered");
is($charinfo->{bidi}, "BN");
+is(charprop($cp, "bc"), "Boundary_Neutral");
is($charinfo->{decomposition}, "");
+is(charprop($cp, "dm"), "\0");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, "nv"), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, "bidim"), "No");
is($charinfo->{unicode10}, "NULL");
+is(charprop($cp, "na1"), "NULL");
is($charinfo->{comment}, "");
+is(charprop($cp, "isc"), "");
is($charinfo->{upper}, "");
+is(charprop($cp, "uc"), "\0");
is($charinfo->{lower}, "");
+is(charprop($cp, "lc"), "\0");
is($charinfo->{title}, "");
+is(charprop($cp, "tc"), "\0");
is($charinfo->{block}, "Basic Latin");
+is(charprop($cp, "block"), "Basic_Latin");
is($charinfo->{script}, "Common");
+is(charprop($cp, "script"), "Common");
+$cp = utf8::unicode_to_native(0x41);
my $A_code = sprintf("%04X", ord("A"));
my $a_code = sprintf("%04X", ord("a"));
-$charinfo = charinfo(utf8::unicode_to_native(0x41));
+$charinfo = charinfo(utf8::unicode_to_native($cp));
is($charinfo->{code}, $A_code, "LATIN CAPITAL LETTER A");
is($charinfo->{name}, "LATIN CAPITAL LETTER A");
+is(charprop($cp, 'name'), "LATIN CAPITAL LETTER A");
is($charinfo->{category}, "Lu");
+is(charprop($cp, 'gc'), "Uppercase_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "L");
+is(charprop($cp, 'bc'), "Left_To_Right");
is($charinfo->{decomposition}, "");
+is(charprop($cp, 'dm'), "A");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "");
+is(charprop($cp, 'na1'), "");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "A");
is($charinfo->{lower}, $a_code);
+is(charprop($cp, 'lc'), "a");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "A");
is($charinfo->{block}, "Basic Latin");
+is(charprop($cp, 'block'), "Basic_Latin");
is($charinfo->{script}, "Latin");
+is(charprop($cp, 'script'), "Latin");
-$charinfo = charinfo(0x100);
+$cp = 0x100;
+$charinfo = charinfo($cp);
is($charinfo->{code}, "0100", "LATIN CAPITAL LETTER A WITH MACRON");
is($charinfo->{name}, "LATIN CAPITAL LETTER A WITH MACRON");
+is(charprop($cp, 'name'), "LATIN CAPITAL LETTER A WITH MACRON");
is($charinfo->{category}, "Lu");
+is(charprop($cp, 'gc'), "Uppercase_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "L");
+is(charprop($cp, 'bc'), "Left_To_Right");
is($charinfo->{decomposition}, "$A_code 0304");
+is(charprop($cp, 'dm'), "A\x{0304}");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "LATIN CAPITAL LETTER A MACRON");
+is(charprop($cp, 'na1'), "LATIN CAPITAL LETTER A MACRON");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "\x{100}");
is($charinfo->{lower}, "0101");
+is(charprop($cp, 'lc'), "\x{101}");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "\x{100}");
is($charinfo->{block}, "Latin Extended-A");
+is(charprop($cp, 'block'), "Latin_Extended_A");
is($charinfo->{script}, "Latin");
+is(charprop($cp, 'script'), "Latin");
-# 0x0590 is in the Hebrew block but unused.
-
-$charinfo = charinfo(0x590);
+$cp = 0x590; # 0x0590 is in the Hebrew block but unused.
+$charinfo = charinfo($cp);
is($charinfo->{code}, undef, "0x0590 - unused Hebrew");
is($charinfo->{name}, undef);
+is(charprop($cp, 'name'), "");
is($charinfo->{category}, undef);
+is(charprop($cp, 'gc'), "Unassigned");
is($charinfo->{combining}, undef);
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, undef);
+is(charprop($cp, 'bc'), "Right_To_Left");
is($charinfo->{decomposition}, undef);
+is(charprop($cp, 'dm'), "\x{590}");
is($charinfo->{decimal}, undef);
is($charinfo->{digit}, undef);
is($charinfo->{numeric}, undef);
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, undef);
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, undef);
+is(charprop($cp, 'na1'), "");
is($charinfo->{comment}, undef);
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, undef);
+is(charprop($cp, 'uc'), "\x{590}");
is($charinfo->{lower}, undef);
+is(charprop($cp, 'lc'), "\x{590}");
is($charinfo->{title}, undef);
+is(charprop($cp, 'tc'), "\x{590}");
is($charinfo->{block}, undef);
+is(charprop($cp, 'block'), "Hebrew");
is($charinfo->{script}, undef);
+is(charprop($cp, 'script'), "Unknown");
# 0x05d0 is in the Hebrew block and used.
-$charinfo = charinfo(0x5d0);
+$cp = 0x5d0;
+$charinfo = charinfo($cp);
is($charinfo->{code}, "05D0", "05D0 - used Hebrew");
is($charinfo->{name}, "HEBREW LETTER ALEF");
+is(charprop($cp, 'name'), "HEBREW LETTER ALEF");
is($charinfo->{category}, "Lo");
+is(charprop($cp, 'gc'), "Other_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "R");
+is(charprop($cp, 'bc'), "Right_To_Left");
is($charinfo->{decomposition}, "");
+is(charprop($cp, 'dm'), "\x{5d0}");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "");
+is(charprop($cp, 'na1'), "");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "\x{5d0}");
is($charinfo->{lower}, "");
+is(charprop($cp, 'lc'), "\x{5d0}");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "\x{5d0}");
is($charinfo->{block}, "Hebrew");
+is(charprop($cp, 'block'), "Hebrew");
is($charinfo->{script}, "Hebrew");
+is(charprop($cp, 'script'), "Hebrew");
# An open syllable in Hangul.
-$charinfo = charinfo(0xAC00);
+$cp = 0xAC00;
+$charinfo = charinfo($cp);
is($charinfo->{code}, "AC00", "HANGUL SYLLABLE U+AC00");
is($charinfo->{name}, "HANGUL SYLLABLE GA");
+is(charprop($cp, 'name'), "HANGUL SYLLABLE GA");
is($charinfo->{category}, "Lo");
+is(charprop($cp, 'gc'), "Other_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "L");
+is(charprop($cp, 'bc'), "Left_To_Right");
is($charinfo->{decomposition}, "1100 1161");
+is(charprop($cp, 'dm'), "\x{1100}\x{1161}");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "");
+is(charprop($cp, 'na1'), "");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "\x{AC00}");
is($charinfo->{lower}, "");
+is(charprop($cp, 'lc'), "\x{AC00}");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "\x{AC00}");
is($charinfo->{block}, "Hangul Syllables");
+is(charprop($cp, 'block'), "Hangul_Syllables");
is($charinfo->{script}, "Hangul");
+is(charprop($cp, 'script'), "Hangul");
# A closed syllable in Hangul.
-$charinfo = charinfo(0xAE00);
+$cp = 0xAE00;
+$charinfo = charinfo($cp);
is($charinfo->{code}, "AE00", "HANGUL SYLLABLE U+AE00");
is($charinfo->{name}, "HANGUL SYLLABLE GEUL");
+is(charprop($cp, 'name'), "HANGUL SYLLABLE GEUL");
is($charinfo->{category}, "Lo");
+is(charprop($cp, 'gc'), "Other_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "L");
+is(charprop($cp, 'bc'), "Left_To_Right");
is($charinfo->{decomposition}, "1100 1173 11AF");
+is(charprop($cp, 'dm'), "\x{1100}\x{1173}\x{11AF}");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "");
+is(charprop($cp, 'na1'), "");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "\x{AE00}");
is($charinfo->{lower}, "");
+is(charprop($cp, 'lc'), "\x{AE00}");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "\x{AE00}");
is($charinfo->{block}, "Hangul Syllables");
+is(charprop($cp, 'block'), "Hangul_Syllables");
is($charinfo->{script}, "Hangul");
+is(charprop($cp, 'script'), "Hangul");
-$charinfo = charinfo(0x1D400);
+$cp = 0x1D400;
+$charinfo = charinfo($cp);
is($charinfo->{code}, "1D400", "MATHEMATICAL BOLD CAPITAL A");
is($charinfo->{name}, "MATHEMATICAL BOLD CAPITAL A");
+is(charprop($cp, 'name'), "MATHEMATICAL BOLD CAPITAL A");
is($charinfo->{category}, "Lu");
+is(charprop($cp, 'gc'), "Uppercase_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "L");
+is(charprop($cp, 'bc'), "Left_To_Right");
is($charinfo->{decomposition}, "<font> $A_code");
+is(charprop($cp, 'dm'), "A");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "");
+is(charprop($cp, 'na1'), "");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "\x{1D400}");
is($charinfo->{lower}, "");
+is(charprop($cp, 'lc'), "\x{1D400}");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "\x{1D400}");
is($charinfo->{block}, "Mathematical Alphanumeric Symbols");
+is(charprop($cp, 'block'), "Mathematical_Alphanumeric_Symbols");
is($charinfo->{script}, "Common");
+is(charprop($cp, 'script'), "Common");
-$charinfo = charinfo(0x9FBA); #Bug 58428
+$cp = 0x9FBA; #Bug 58428
+$charinfo = charinfo(0x9FBA);
is($charinfo->{code}, "9FBA", "U+9FBA");
is($charinfo->{name}, "CJK UNIFIED IDEOGRAPH-9FBA");
+is(charprop($cp, 'name'), "CJK UNIFIED IDEOGRAPH-9FBA");
is($charinfo->{category}, "Lo");
+is(charprop($cp, 'gc'), "Other_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "L");
+is(charprop($cp, 'bc'), "Left_To_Right");
is($charinfo->{decomposition}, "");
+is(charprop($cp, 'dm'), "\x{9FBA}");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "");
+is(charprop($cp, 'na1'), "");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "\x{9FBA}");
is($charinfo->{lower}, "");
+is(charprop($cp, 'lc'), "\x{9FBA}");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "\x{9FBA}");
is($charinfo->{block}, "CJK Unified Ideographs");
+is(charprop($cp, 'block'), "CJK_Unified_Ideographs");
is($charinfo->{script}, "Han");
+is(charprop($cp, 'script'), "Han");
use Unicode::UCD qw(charblock charscript);
is(charblock(0x1FFFF), "No_Block", "0x1FFFF - unused charblock");
my $fraction_3_4_code = sprintf("%04X", utf8::unicode_to_native(0xbe));
+$cp = $fraction_3_4_code;
$charinfo = charinfo(hex $fraction_3_4_code);
is($charinfo->{code}, $fraction_3_4_code, "VULGAR FRACTION THREE QUARTERS");
is($charinfo->{name}, "VULGAR FRACTION THREE QUARTERS");
+is(charprop($cp, 'name'), "VULGAR FRACTION THREE QUARTERS");
is($charinfo->{category}, "No");
+is(charprop($cp, 'gc'), "Other_Number");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "ON");
+is(charprop($cp, 'bc'), "Other_Neutral");
is($charinfo->{decomposition}, "<fraction> "
. sprintf("%04X", ord "3")
. " 2044 "
. sprintf("%04X", ord "4"));
+is(charprop($cp, 'dm'), "3\x{2044}4");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "3/4");
+is(charprop($cp, 'nv'), "0.75");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "FRACTION THREE QUARTERS");
+is(charprop($cp, 'na1'), "FRACTION THREE QUARTERS");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "\x{be}");
is($charinfo->{lower}, "");
+is(charprop($cp, 'lc'), "\x{be}");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "\x{be}");
is($charinfo->{block}, "Latin-1 Supplement");
+is(charprop($cp, 'block'), "Latin_1_Supplement");
is($charinfo->{script}, "Common");
+is(charprop($cp, 'script'), "Common");
# This is to test a case where both simple and full lowercases exist and
# differ
-$charinfo = charinfo(0x130);
+$cp = 0x130;
+$charinfo = charinfo($cp);
my $I_code = sprintf("%04X", ord("I"));
my $i_code = sprintf("%04X", ord("i"));
is($charinfo->{code}, "0130", "LATIN CAPITAL LETTER I WITH DOT ABOVE");
is($charinfo->{name}, "LATIN CAPITAL LETTER I WITH DOT ABOVE");
+is(charprop($cp, 'name'), "LATIN CAPITAL LETTER I WITH DOT ABOVE");
is($charinfo->{category}, "Lu");
+is(charprop($cp, 'gc'), "Uppercase_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, 'ccc'), "Not_Reordered");
is($charinfo->{bidi}, "L");
+is(charprop($cp, 'bc'), "Left_To_Right");
is($charinfo->{decomposition}, "$I_code 0307");
+is(charprop($cp, 'dm'), "I\x{0307}");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, 'nv'), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, 'bidim'), "No");
is($charinfo->{unicode10}, "LATIN CAPITAL LETTER I DOT");
+is(charprop($cp, 'na1'), "LATIN CAPITAL LETTER I DOT");
is($charinfo->{comment}, "");
+is(charprop($cp, 'isc'), "");
is($charinfo->{upper}, "");
+is(charprop($cp, 'uc'), "\x{130}");
is($charinfo->{lower}, $i_code);
+is(charprop($cp, 'lc'), "i\x{307}");
is($charinfo->{title}, "");
+is(charprop($cp, 'tc'), "\x{130}");
is($charinfo->{block}, "Latin Extended-A");
+is(charprop($cp, 'block'), "Latin_Extended_A");
is($charinfo->{script}, "Latin");
+is(charprop($cp, 'script'), "Latin");
# This is to test a case where both simple and full uppercases exist and
# differ
-$charinfo = charinfo(0x1F80);
+$cp = 0x1F80;
+$charinfo = charinfo($cp);
is($charinfo->{code}, "1F80", "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI");
is($charinfo->{name}, "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI");
+is(charprop($cp, "name"), "GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI");
is($charinfo->{category}, "Ll");
+is(charprop($cp, "gc"), "Lowercase_Letter");
is($charinfo->{combining}, "0");
+is(charprop($cp, "ccc"), "Not_Reordered");
is($charinfo->{bidi}, "L");
+is(charprop($cp, "bc"), "Left_To_Right");
is($charinfo->{decomposition}, "1F00 0345");
+is(charprop($cp, "dm"), "\x{1F00}\x{0345}");
is($charinfo->{decimal}, "");
is($charinfo->{digit}, "");
is($charinfo->{numeric}, "");
+is(charprop($cp, "nv"), "NaN");
is($charinfo->{mirrored}, "N");
+is(charprop($cp, "bidim"), "No");
is($charinfo->{unicode10}, "");
+is(charprop($cp, "na1"), "");
is($charinfo->{comment}, "");
+is(charprop($cp, "isc"), "");
is($charinfo->{upper}, "1F88");
+is(charprop($cp, "uc"), "\x{1F08}\x{0399}");
+is(charprop($cp, "suc"), "\x{1F88}");
is($charinfo->{lower}, "");
+is(charprop($cp, "lc"), "\x{1F80}");
is($charinfo->{title}, "1F88");
+is(charprop($cp, "tc"), "\x{1F88}");
is($charinfo->{block}, "Greek Extended");
+is(charprop($cp, "block"), "Greek_Extended");
is($charinfo->{script}, "Greek");
+is(charprop($cp, "script"), "Greek");
+
+is(charprop(ord("A"), "foo"), undef,
+ "Verify charprop of unknown property returns <undef>");
+
+# These were created from inspection of the code to exercise the branches
+is(charprop(ord("("), "bpb"), ")",
+ "Verify charprop figures out that s-type properties can be char");
+is(charprop(ord("9"), "nv"), 9,
+ "Verify charprop can adjust an ar-type property");
+is(charprop(utf8::unicode_to_native(0xAD), "NFKC_Casefold"), "",
+ "Verify charprop can handle an \"\" in ae-type property");
use Unicode::UCD qw(charblocks charscripts);