# selection of names is tested, a higher percentage of regular names is tested
# than algorithmically-determined names.
+my $run_slow_tests = $ENV{PERL_RUN_SLOW_TESTS} || 0;
+
my $RUN_SLOW_TESTS_EVERY_CODE_POINT = 100;
# If $ENV{PERL_RUN_SLOW_TESTS} is at least 1 and less than the number above,
-# all code points with names are tested. If it is at least that number, all
-# 1,114,112 Unicode code points are tested.
+# all code points with names are tested, including wildcard search names. If
+# it is at least that number, all 1,114,112 Unicode code points are tested.
# Because \N{} is compile time, any warnings will get generated before
# execution, so have to have an array, and arrange things so no warning
1
EOE
- like($@, "above 0xFF", "Verify get warning for \\N{above ff} under 'use bytes' with :full");
+ like($@, qr/above 0xFF/, "Verify get warning for \\N{above ff} under 'use bytes' with :full");
ok(! defined $res, "... and result is undefined");
$res = eval <<'EOE';
"Here: \N{Be}!";
1
EOE
- like($@, "CYRILLIC CAPITAL LETTER BE.*above 0xFF", "Verify get warning under 'use bytes' with explicit script");
+ like($@, qr/CYRILLIC CAPITAL LETTER BE.*above 0xFF/, "Verify get warning under 'use bytes' with explicit script");
ok(! defined $res, "... and result is undefined");
$res = eval <<'EOE';
use charnames ":alias" => { mychar1 => "0xE8000",
mychar2 => 983040, # U+F0000
mychar3 => "U+100000",
- myctrl => 0x80,
+ myctrl => utf8::unicode_to_native(0x80),
mylarge => "U+111000",
};
is ("\N{PILE OF POO}", chr(0x1F4A9), "Verify :alias alone implies :full");
is (charnames::viacode(0x100000), "mychar3", "And that can get the alias back");
is ("\N{mylarge}", chr(0x111000), "Verify that can define alias beyond Unicode");
is (charnames::viacode(0x111000), "mylarge", "And that can get the alias back");
- is (charnames::viacode(0x80), "myctrl", "Verify that can name a nameless control");
+ is (charnames::viacode(utf8::unicode_to_native(0x80)), "myctrl", "Verify that can name a nameless control");
}
-my $encoded_be;
-my $encoded_alpha;
-my $encoded_bet;
-my $encoded_deseng;
-
-# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt
-if (ord('A') == 65) { # as on ASCII or UTF-8 machines
- $encoded_be = "\320\261";
- $encoded_alpha = "\316\261";
- $encoded_bet = "\327\221";
- $encoded_deseng = "\360\220\221\215";
-}
-else { # EBCDIC where UTF-EBCDIC may be used (this may be 1047 specific since
- # UTF-EBCDIC is codepage specific)
- $encoded_be = "\270\102\130";
- $encoded_alpha = "\264\130";
- $encoded_bet = "\270\125\130";
- $encoded_deseng = "\336\102\103\124";
-}
+my $encoded_be = byte_utf8a_to_utf8n("\320\261");
+my $encoded_alpha = byte_utf8a_to_utf8n("\316\261");
+my $encoded_bet = byte_utf8a_to_utf8n("\327\221");
+my $encoded_deseng = byte_utf8a_to_utf8n("\360\220\221\215");
sub to_bytes {
unpack"U0a*", shift;
}
sub test_vianame ($$$) {
+ CORE::state $wildcard_count = 0;
# Run the vianame tests on a code point, both loose and full
# Get a copy of the name modified to stress the loose tests.
my $loose_name = get_loose_name($name);
+ my $right_anchor;
+
# Switch loose and full in vianame vs string_vianame half the time
if (rand() < .5) {
use charnames ":full";
- $all_pass &= is(charnames::vianame($name), $i, "Verify vianame(\"$name\") is 0x$hex");
+ $all_pass &= is(charnames::vianame($name), $i,
+ "Verify vianame(\"$name\") is 0x$hex");
use charnames ":loose";
- $all_pass &= is(charnames::string_vianame($loose_name), chr($i), "Verify string_vianame(\"$loose_name\") is chr(0x$hex)");
+ $all_pass &= is(charnames::string_vianame($loose_name), chr($i),
+ "Verify string_vianame(\"$loose_name\") is chr(0x$hex)");
+ $right_anchor = '\\Z';
}
else {
use charnames ":loose";
- $all_pass &= is(charnames::vianame($loose_name), $i, "Verify vianame(\"$loose_name\") is 0x$hex");
+ $all_pass &= is(charnames::vianame($loose_name), $i,
+ "Verify vianame(\"$loose_name\") is 0x$hex");
use charnames ":full";
- $all_pass &= is(charnames::string_vianame($name), chr($i), "Verify string_vianame(\"$name\") is chr(0x$hex)");
+ $all_pass &= is(charnames::string_vianame($name), chr($i),
+ "Verify string_vianame(\"$name\") is chr(0x$hex)");
+ $right_anchor = '\\z';
}
+
+ my $left_anchor = (rand() < .5) ? '^' : '\\A';
+
+ # \p{name=} is always loose matching
+ $all_pass &= like(chr($i), qr/^\p{name=$loose_name}$/,
+ "Verify /\p{name=$loose_name}/ matches chr(0x$hex)");
+
+ $wildcard_count++;
+
+ # XXX temporary to see if the failure we are occasionally seeing is
+ # confined to this code point. GH #17671
+ next if $i == 0;
+
+ # Because wildcard name matching is so real-time intensive, do it less
+ # frequently than the others
+ if ($wildcard_count >= 10) {
+ $wildcard_count = 0;
+
+ # A few control characters have anomalous names containing
+ # parentheses, which need to be escaped.
+ my $name_ref = \$name;
+ my $mod_name;
+ if ($i <= 0x85) { # NEL in ASCII; affected controls are lower than
+ # this in EBCDIC
+ $mod_name = $name =~ s/([()])/\\$1/gr;
+ $name_ref = \$mod_name;
+ }
+
+ # We anchor the name, randomly with the possible anchors.
+ my $assembled = $left_anchor. $$name_ref . $right_anchor;
+
+ # \p{name=/.../} is always full matching
+ $all_pass &= like(chr($i), qr!^\p{name=/$assembled/}!,
+ "Verify /\p{name=/$assembled/} matches chr(0x$hex)");
+ }
+
return $all_pass;
}
}
{
- # 20001114.001
+ # 20001114.001 (#4690)
no utf8; # naked Latin-1
use charnames ':full';
my $text = "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}";
- is($text, latin1_to_native("\xc4"), 'Verify \N{} returns correct string under "no utf8"');
+ is($text, chr utf8::unicode_to_native(0xc4), 'Verify \N{} returns correct string under "no utf8"');
# I'm not sure that this tests anything different from the above.
- cmp_ok(ord($text), '==', ord(latin1_to_native("\xc4")), '... and ords are ok');
+ cmp_ok(ord($text), '==', utf8::unicode_to_native(0xc4), '... and ords are ok');
}
{
cmp_ok($warning_count, '==', scalar @WARN, "Verify vianame doesn't warn on unknown names");
ok (! defined charnames::string_vianame("MORE NONE SUCH"), "Verify string_vianame returns undef for an undefined name");
cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on unknown names");
+ ok (! defined charnames::vianame(""), "Verify vianame returns undef for an empty value");
+ cmp_ok($warning_count, '==', scalar @WARN, "... and no warning is generated");
+ ok (! defined charnames::string_vianame(""), "Verify string_vianame returns undef for an empty value");
+ cmp_ok($warning_count, '==', scalar @WARN, "... and no warning is generated");
+
+ eval "qr/\\p{name=MORE NONE SUCH}/";
+ like($@, qr/Can't find Unicode property definition "name=MORE NONE SUCH"/,
+ '\p{name=} returns an appropriate error message on an undefined name');
use bytes;
is(charnames::vianame("GOTHIC LETTER AHSA"), 0x10330, "Verify vianame \\N{name} is unaffected by 'use bytes'");
- is(charnames::vianame("U+FF"), chr(0xFF), "Verify vianame \\N{U+FF} is unaffected by 'use bytes'");
+ is(charnames::vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify vianame \\N{U+FF} is unaffected by 'use bytes'");
cmp_ok($warning_count, '==', scalar @WARN, "Verify vianame doesn't warn on legal inputs under 'use bytes'");
ok(! defined charnames::vianame("U+100"), "Verify vianame \\N{U+100} is undef under 'use bytes'");
ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify vianame gives appropriate warning for previous test");
ok(! defined charnames::string_vianame("GOTHIC LETTER AHSA"), "Verify string_vianame(\"GOTHIC LETTER AHSA\") is undefined under 'use bytes'");
ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify string_vianame gives appropriate warning for previous test");
$warning_count = @WARN;
- is(charnames::string_vianame("U+FF"), chr(0xFF), "Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'");
+ eval "qr/\\p{name=GOTHIC LETTER AHSA}/";
+ is($@, "", '\p{name=...} is unaffect by "use bytes"');
+ is(charnames::string_vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'");
cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on legal inputs under 'use bytes'");
- is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), chr(0xFF), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is chr(0xFF) under 'use bytes'");
+ is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is chr(native 0xFF) under 'use bytes'");
cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on legal inputs under 'use bytes'");
ok(! defined charnames::string_vianame("U+100"), "Verify string_vianame \\N{U+100} is undef under 'use bytes'");
ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify string_vianame gives appropriate warning for previous test");
is("\N{LINE FEED}", "\n", 'Verify "\N{LINE FEED}" eq "\n"');
is("\N{LF}", "\n", 'Verify "\N{LF}" eq "\n"');
-my $nel = latin1_to_native("\x85");
+my $nel = chr utf8::unicode_to_native(0x85);
$nel = qr/^$nel$/;
like("\N{NEXT LINE (NEL)}", $nel, 'Verify "\N{NEXT LINE (NEL)}" is correct');
ok((grep { /\Qyou asked for U+110000/ } @WARN), '... and gives warning');
is(charnames::viacode(0), "NULL", 'Verify charnames::viacode(0) eq "NULL"');
-is(charnames::viacode("BE"), "VULGAR FRACTION THREE QUARTERS", 'Verify charnames::viacode("BE") eq "VULGAR FRACTION THREE QUARTERS"');
+my $three_quarters = sprintf("%2X", utf8::unicode_to_native(0xBE));
+is(charnames::viacode("$three_quarters"), "VULGAR FRACTION THREE QUARTERS", 'Verify charnames::viacode(native "BE") eq "VULGAR FRACTION THREE QUARTERS"');
is(charnames::viacode("U+00000000000FEED"), "ARABIC LETTER WAW ISOLATED FORM", 'Verify charnames::viacode("U+00000000000FEED") eq "ARABIC LETTER WAW ISOLATED FORM"');
+test_vianame(0x116C, "116C", "HANGUL JUNGSEONG OE");
+test_vianame(0x1180, "1180", "HANGUL JUNGSEONG O-E");
+like(chr(0x59C3), qr/\p{name=\/\ACJK UNIFIED IDEOGRAPH-59C3\z\/}/,
+ 'Verify name wildcards delimitters can be escaped');
+like(chr(0xD800), qr!\p{name=/\A\z/}!,
+ 'Verify works on matching an empty name');
+
{
no warnings 'deprecated';
is("\N{LINE FEED}", "\N{LINE FEED (LF)}", 'Verify "\N{LINE FEED}" eq "\N{LINE FEED (LF)}"', 'Verify \N{LINE FEED} eq \N{LINE FEED (LF)}');
# SADAHIRO Tomoyuki's suggestion is to ensure that the UTF-8ness of both
# arguments are identical before calling index.
# To do this can take advantage of the fact that unicore/Name.pl is 7 bit
-# (or at least should be). So assert that that it's true here. EBCDIC
+# (or at least should be). So assert that that is true here. EBCDIC
# may be a problem (khw).
my $names = do "unicore/Name.pl";
ok(defined $names, "Verify can read 'unicore/Name.pl'");
-my $non_ascii = native_to_latin1($names) =~ tr/\0-\177//c;
+my $non_ascii = native_to_uni($names) =~ tr/\0-\177//c;
ok(! $non_ascii, "Verify all official names are ASCII-only");
# Verify that charnames propagate to eval("")
is("\N{mychar1}", "e", "Outer block: verify that \\N{mychar1} works");
is(charnames::vianame("mychar1"), ord("e"), "Outer block: verify that vianame(mychar1) works");
is(charnames::string_vianame("mychar1"), "e", "Outer block: verify that string_vianame(mychar1) works");
+ eval "qr/\\p{name=mychar1}/";
+ like($@, qr/Can't find Unicode property definition "name=mychar1"/,
+ '\p{name=} returns an appropriate error message on an alias');
is("\N{mychar2}", "A", "Outer block: verify that \\N{mychar2} works");
is(charnames::vianame("mychar2"), ord("A"), "Outer block: verify that vianame(mychar2) works");
is(charnames::string_vianame("mychar2"), "A", "Outer block: verify that string_vianame(mychar2) works");
+ eval "qr/\\p{name=mychar2}/";
+ like($@, qr/Can't find Unicode property definition "name=mychar2"/,
+ '\p{name=} returns an appropriate error message on an alias');
is("\N{myprivate1}", "\x{E8000}", "Outer block: verify that \\N{myprivate1} works");
cmp_ok(charnames::vianame("myprivate1"), "==", 0xE8000, "Outer block: verify that vianame(myprivate1) works");
is(charnames::string_vianame("myprivate1"), chr(0xE8000), "Outer block: verify that string_vianame(myprivate1) works");
+ eval "qr/\\p{name=myprivate1}/";
+ like($@, qr/Can't find Unicode property definition "name=myprivate1"/,
+ '\p{name=} returns an appropriate error message on an alias');
is(charnames::viacode(0xE8000), "myprivate1", "Outer block: verify that myprivate1 viacode works");
is("\N{myprivate2}", "\x{100000}", "Outer block: verify that \\N{myprivate2} works");
cmp_ok(charnames::vianame("myprivate2"), "==", 0x100000, "Outer block: verify that vianame(myprivate2) works");
is(charnames::string_vianame("myprivate2"), chr(0x100000), "Outer block: verify that string_vianame(myprivate2) works");
+ eval "qr/\\p{name=myprivate2}/";
+ like($@, qr/Can't find Unicode property definition "name=myprivate2"/,
+ '\p{name=} returns an appropriate error message on an alias');
is(charnames::viacode(0x100000), "myprivate2", "Outer block: verify that myprivate2 viacode works");
is("\N{BE}", "\N{KATAKANA LETTER BE}", "Outer block: verify that \\N uses the correct script ");
cmp_ok(charnames::vianame("BE"), "==", ord("\N{KATAKANA LETTER BE}"), "Outer block: verify that vianame uses the correct script");
is("\N{Hiragana: BE}", $hiragana_be, "Outer block: verify that :short works with \\N");
cmp_ok(charnames::vianame("Hiragana: BE"), "==", ord($hiragana_be), "Outer block: verify that :short works with vianame");
cmp_ok(charnames::string_vianame("Hiragana: BE"), "==", $hiragana_be, "Outer block: verify that :short works with string_vianame");
+ eval "qr/\\p{name=Hiragana: BE}/";
+ like($@, qr/Can't find Unicode property definition "name=Hiragana: BE"/,
+ '\p{name=} returns an appropriate error message on :short attempt');
{
use charnames ":full",
$seed = srand;
}
- my $run_slow_tests = $ENV{PERL_RUN_SLOW_TESTS} || 0;
-
# We will look at the data grouped in "blocks" of the following
# size.
- my $block_size_bits = 7; # above 16 is not sensible
+ my $block_size_bits = 8; # above 16 is not sensible
my $block_size = 2**$block_size_bits;
# There are the regular names, like "SPACE", plus the ones
# of the character. The percentage of each type to test is
# fuzzily independently settable. This breaks down when the block size is
# 1 or is large enough that both types of names occur in the same block
- my $percentage_of_regular_names = ($run_slow_tests) ? 100 : 13;
+ my $percentage_of_regular_names = ($run_slow_tests) ? 100 : 10;
my $percentage_of_algorithmic_names = (100 / $block_size); # 1 test/block
# If wants everything tested, do so by changing the block size to 1 so
die "Can't open ../../lib/unicore/UnicodeData.txt: $!";
while (<$fh>) {
chomp;
- my ($code, $name, undef, undef, undef, undef, undef, undef, undef, undef, $u1name) = split ";";
- my $decimal = hex $code;
+ my ($code, $name, $category, undef, undef, undef, undef, undef, undef, undef, $u1name) = split ";";
+ my $decimal = utf8::unicode_to_native(hex $code);
+ $code = sprintf("%04X", $decimal) unless $::IS_ASCII;
+
+ $decimal = hex $code;
# The Unicode version 1 name is used instead of any that are
# marked <control>.
$name = "" if $^V lt v5.17.0 && $decimal == 0x1F514;
# ALERT overrides BELL
- $name = 'ALERT' if $decimal == 7;
+ $name = 'ALERT' if $decimal == utf8::unicode_to_native(7);
# Some don't have names, leave those array elements undefined
next unless $name;
/^(.*?);/;
my $end_decimal = hex $1;
- # Only the CJK (and the Hangul which are instead dealt with below)
- # ones have names, and they all have the code point as part of the
- # name, which we can construct
- if ($name =~ /^<CJK/) {
+ # Only the ones whose category is a letter currently have names,
+ # and of those the Hangul Syllables are dealt with below
+ if ( $category eq 'Lo' && $name !~ /^Hangul/i) {
+
+ # The CJK ones all get translated to a particular form; we
+ # just capitalize any others in the hopes that Unicode will
+ # use the correct term in any future ones it might add.
+ if ($name =~ /^<CJK/) {
+ $name = "CJK UNIFIED IDEOGRAPH";
+ }
+ else {
+ $name =~ s/<//;
+ $name =~ s/,.*//;
+ $name = uc($name);
+ }
+
+ # They all have the code point as part of the name, which we
+ # can construct
for my $i ($decimal .. $end_decimal) {
- $names[$i] = sprintf "CJK UNIFIED IDEOGRAPH-%04X", $i;
+ $names[$i] = sprintf "$name-%04X", $i;
my $block = $i >> $block_size_bits;
$algorithmic_names_count[$block]++;
}
# These four code points now have names, from NameAlias, but
# aren't listed as having names in UnicodeData.txt, so viacode
# returns their alias names, not undef
- next if $i == 0x80 || $i == 0x81 || $i == 0x84 || $i == 0x99;
+ next if $i == utf8::unicode_to_native(0x80)
+ || $i == utf8::unicode_to_native(0x81)
+ || $i == utf8::unicode_to_native(0x84)
+ || $i == utf8::unicode_to_native(0x99);
# If there is no name for this code point, all we can
# test is that.
# These four code points have a different Unicode1 name than
# regular name, and viacode has already specifically tested
# for the regular name
- if ($i != 0x0a && $i != 0x0c && $i != 0x0d && $i != 0x85) {
+ if ($i != utf8::unicode_to_native(0x0a)
+ && $i != utf8::unicode_to_native(0x0c)
+ && $i != utf8::unicode_to_native(0x0d)
+ && $i != utf8::unicode_to_native(0x85))
+ {
$all_pass &= is(charnames::viacode($i), $names[$i], "Verify viacode(0x$hex) is \"$names[$i]\"");
}
$all_pass &= ok(! defined charnames::vianame("CJK UNIFIED IDEOGRAPH-$hex"), "Verify vianame(\"CJK UNIFIED IDEOGRAPH-$hex\") is undefined");
} else {
$all_pass &= ok(! defined charnames::string_vianame("CJK UNIFIED IDEOGRAPH-$hex"), "Verify string_vianame(\"CJK UNIFIED IDEOGRAPH-$hex\") is undefined");
+ eval "qr/\\p{name=CJK UNIFIED IDEOGRAPH-$hex}/";
+ $all_pass &= like($@, qr/Can't find Unicode property definition "name=CJK UNIFIED IDEOGRAPH-$hex\"/,
+ "Verify string_vianame(\"CJK UNIFIED IDEOGRAPH-$hex\") is undefined");
}
}
}
s/^\s*#.*//;
next unless $_;
my ($name, $codes) = split ";";
- my $utf8 = pack("U*", map { hex } split " ", $codes);
+ $codes =~ s{ \b 00 ( [0-9A-F]{2} ) \b }
+ { sprintf("%04X", utf8::unicode_to_native(hex $1)) }gxe
+ if ord "A" != 65;
+ my $utf8 = pack("W*", map { hex } split " ", $codes);
is(charnames::string_vianame($name), $utf8, "Verify string_vianame(\"$name\") is the proper utf8");
my $loose_name = get_loose_name($name);
use charnames ":loose";
is(charnames::string_vianame($loose_name), $utf8, "Verify string_vianame(\"$loose_name\") is the proper utf8");
+
+ like($utf8, qr/^\p{name=$name}$/, "Verify /\p{name=$name}\$/ is the proper utf8");
+ like($utf8, qr/^\p{name=$loose_name}$/, "Verify /\p{name=$loose_name}\$/ is the proper utf8");
+ like($utf8, qr!^\p{name=/\A$name\z/}!, "Verify /\p{name=/$\A$name\z/} is the proper utf8");
#diag("$name, $utf8");
}
close $fh;