X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/6deb7a5e3707524fd23c0080d6a762ff30e50494..e406736c4117f8f403b44413687d4c8df036c44b:/lib/charnames.t diff --git a/lib/charnames.t b/lib/charnames.t index 5629f3a..01e1fd7 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -5,11 +5,13 @@ use strict; # selection of names is tested, a higher percentage of regular names is tested # than algorithmically-determined names. +my $run_slow_tests = $ENV{PERL_RUN_SLOW_TESTS} || 0; + my $RUN_SLOW_TESTS_EVERY_CODE_POINT = 100; # If $ENV{PERL_RUN_SLOW_TESTS} is at least 1 and less than the number above, -# all code points with names are tested. If it is at least that number, all -# 1,114,112 Unicode code points are tested. +# all code points with names are tested, including wildcard search names. If +# it is at least that number, all 1,114,112 Unicode code points are tested. # Because \N{} is compile time, any warnings will get generated before # execution, so have to have an array, and arrange things so no warning @@ -41,7 +43,7 @@ use charnames ":full"; 1 EOE - like($@, "above 0xFF", "Verify get warning for \\N{above ff} under 'use bytes' with :full"); + like($@, qr/above 0xFF/, "Verify get warning for \\N{above ff} under 'use bytes' with :full"); ok(! defined $res, "... and result is undefined"); $res = eval <<'EOE'; @@ -49,7 +51,7 @@ use charnames 'cyrillic'; "Here: \N{Be}!"; 1 EOE - like($@, "CYRILLIC CAPITAL LETTER BE.*above 0xFF", "Verify get warning under 'use bytes' with explicit script"); + like($@, qr/CYRILLIC CAPITAL LETTER BE.*above 0xFF/, "Verify get warning under 'use bytes' with explicit script"); ok(! defined $res, "... and result is undefined"); $res = eval <<'EOE'; @@ -65,7 +67,7 @@ EOE use charnames ":alias" => { mychar1 => "0xE8000", mychar2 => 983040, # U+F0000 mychar3 => "U+100000", - myctrl => 0x80, + myctrl => utf8::unicode_to_native(0x80), mylarge => "U+111000", }; is ("\N{PILE OF POO}", chr(0x1F4A9), "Verify :alias alone implies :full"); @@ -77,29 +79,14 @@ EOE is (charnames::viacode(0x100000), "mychar3", "And that can get the alias back"); is ("\N{mylarge}", chr(0x111000), "Verify that can define alias beyond Unicode"); is (charnames::viacode(0x111000), "mylarge", "And that can get the alias back"); - is (charnames::viacode(0x80), "myctrl", "Verify that can name a nameless control"); + is (charnames::viacode(utf8::unicode_to_native(0x80)), "myctrl", "Verify that can name a nameless control"); } -my $encoded_be; -my $encoded_alpha; -my $encoded_bet; -my $encoded_deseng; - -# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt -if (ord('A') == 65) { # as on ASCII or UTF-8 machines - $encoded_be = "\320\261"; - $encoded_alpha = "\316\261"; - $encoded_bet = "\327\221"; - $encoded_deseng = "\360\220\221\215"; -} -else { # EBCDIC where UTF-EBCDIC may be used (this may be 1047 specific since - # UTF-EBCDIC is codepage specific) - $encoded_be = "\270\102\130"; - $encoded_alpha = "\264\130"; - $encoded_bet = "\270\125\130"; - $encoded_deseng = "\336\102\103\124"; -} +my $encoded_be = byte_utf8a_to_utf8n("\320\261"); +my $encoded_alpha = byte_utf8a_to_utf8n("\316\261"); +my $encoded_bet = byte_utf8a_to_utf8n("\327\221"); +my $encoded_deseng = byte_utf8a_to_utf8n("\360\220\221\215"); sub to_bytes { unpack"U0a*", shift; @@ -127,6 +114,7 @@ sub get_loose_name ($) { # Modify name to stress the loose tests. } sub test_vianame ($$$) { + CORE::state $wildcard_count = 0; # Run the vianame tests on a code point, both loose and full @@ -139,19 +127,63 @@ sub test_vianame ($$$) { # Get a copy of the name modified to stress the loose tests. my $loose_name = get_loose_name($name); + my $right_anchor; + # Switch loose and full in vianame vs string_vianame half the time if (rand() < .5) { use charnames ":full"; - $all_pass &= is(charnames::vianame($name), $i, "Verify vianame(\"$name\") is 0x$hex"); + $all_pass &= is(charnames::vianame($name), $i, + "Verify vianame(\"$name\") is 0x$hex"); use charnames ":loose"; - $all_pass &= is(charnames::string_vianame($loose_name), chr($i), "Verify string_vianame(\"$loose_name\") is chr(0x$hex)"); + $all_pass &= is(charnames::string_vianame($loose_name), chr($i), + "Verify string_vianame(\"$loose_name\") is chr(0x$hex)"); + $right_anchor = '\\Z'; } else { use charnames ":loose"; - $all_pass &= is(charnames::vianame($loose_name), $i, "Verify vianame(\"$loose_name\") is 0x$hex"); + $all_pass &= is(charnames::vianame($loose_name), $i, + "Verify vianame(\"$loose_name\") is 0x$hex"); use charnames ":full"; - $all_pass &= is(charnames::string_vianame($name), chr($i), "Verify string_vianame(\"$name\") is chr(0x$hex)"); + $all_pass &= is(charnames::string_vianame($name), chr($i), + "Verify string_vianame(\"$name\") is chr(0x$hex)"); + $right_anchor = '\\z'; } + + my $left_anchor = (rand() < .5) ? '^' : '\\A'; + + # \p{name=} is always loose matching + $all_pass &= like(chr($i), qr/^\p{name=$loose_name}$/, + "Verify /\p{name=$loose_name}/ matches chr(0x$hex)"); + + $wildcard_count++; + + # XXX temporary to see if the failure we are occasionally seeing is + # confined to this code point. GH #17671 + next if $i == 0; + + # Because wildcard name matching is so real-time intensive, do it less + # frequently than the others + if ($wildcard_count >= 10) { + $wildcard_count = 0; + + # A few control characters have anomalous names containing + # parentheses, which need to be escaped. + my $name_ref = \$name; + my $mod_name; + if ($i <= 0x85) { # NEL in ASCII; affected controls are lower than + # this in EBCDIC + $mod_name = $name =~ s/([()])/\\$1/gr; + $name_ref = \$mod_name; + } + + # We anchor the name, randomly with the possible anchors. + my $assembled = $left_anchor. $$name_ref . $right_anchor; + + # \p{name=/.../} is always full matching + $all_pass &= like(chr($i), qr!^\p{name=/$assembled/}!, + "Verify /\p{name=/$assembled/} matches chr(0x$hex)"); + } + return $all_pass; } @@ -203,7 +235,7 @@ sub test_vianame ($$$) { } { - # 20001114.001 + # 20001114.001 (#4690) no utf8; # naked Latin-1 @@ -232,10 +264,18 @@ sub test_vianame ($$$) { cmp_ok($warning_count, '==', scalar @WARN, "Verify vianame doesn't warn on unknown names"); ok (! defined charnames::string_vianame("MORE NONE SUCH"), "Verify string_vianame returns undef for an undefined name"); cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on unknown names"); + ok (! defined charnames::vianame(""), "Verify vianame returns undef for an empty value"); + cmp_ok($warning_count, '==', scalar @WARN, "... and no warning is generated"); + ok (! defined charnames::string_vianame(""), "Verify string_vianame returns undef for an empty value"); + cmp_ok($warning_count, '==', scalar @WARN, "... and no warning is generated"); + + eval "qr/\\p{name=MORE NONE SUCH}/"; + like($@, qr/Can't find Unicode property definition "name=MORE NONE SUCH"/, + '\p{name=} returns an appropriate error message on an undefined name'); use bytes; is(charnames::vianame("GOTHIC LETTER AHSA"), 0x10330, "Verify vianame \\N{name} is unaffected by 'use bytes'"); - is(charnames::vianame("U+FF"), chr(0xFF), "Verify vianame \\N{U+FF} is unaffected by 'use bytes'"); + is(charnames::vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify vianame \\N{U+FF} is unaffected by 'use bytes'"); cmp_ok($warning_count, '==', scalar @WARN, "Verify vianame doesn't warn on legal inputs under 'use bytes'"); ok(! defined charnames::vianame("U+100"), "Verify vianame \\N{U+100} is undef under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify vianame gives appropriate warning for previous test"); @@ -244,9 +284,11 @@ sub test_vianame ($$$) { ok(! defined charnames::string_vianame("GOTHIC LETTER AHSA"), "Verify string_vianame(\"GOTHIC LETTER AHSA\") is undefined under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify string_vianame gives appropriate warning for previous test"); $warning_count = @WARN; - is(charnames::string_vianame("U+FF"), chr(0xFF), "Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'"); + eval "qr/\\p{name=GOTHIC LETTER AHSA}/"; + is($@, "", '\p{name=...} is unaffect by "use bytes"'); + is(charnames::string_vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'"); cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on legal inputs under 'use bytes'"); - is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), chr(0xFF), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is chr(0xFF) under 'use bytes'"); + is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is chr(native 0xFF) under 'use bytes'"); cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on legal inputs under 'use bytes'"); ok(! defined charnames::string_vianame("U+100"), "Verify string_vianame \\N{U+100} is undef under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify string_vianame gives appropriate warning for previous test"); @@ -350,9 +392,17 @@ ok(! defined charnames::viacode(0x110000), ok((grep { /\Qyou asked for U+110000/ } @WARN), '... and gives warning'); is(charnames::viacode(0), "NULL", 'Verify charnames::viacode(0) eq "NULL"'); -is(charnames::viacode("BE"), "VULGAR FRACTION THREE QUARTERS", 'Verify charnames::viacode("BE") eq "VULGAR FRACTION THREE QUARTERS"'); +my $three_quarters = sprintf("%2X", utf8::unicode_to_native(0xBE)); +is(charnames::viacode("$three_quarters"), "VULGAR FRACTION THREE QUARTERS", 'Verify charnames::viacode(native "BE") eq "VULGAR FRACTION THREE QUARTERS"'); is(charnames::viacode("U+00000000000FEED"), "ARABIC LETTER WAW ISOLATED FORM", 'Verify charnames::viacode("U+00000000000FEED") eq "ARABIC LETTER WAW ISOLATED FORM"'); +test_vianame(0x116C, "116C", "HANGUL JUNGSEONG OE"); +test_vianame(0x1180, "1180", "HANGUL JUNGSEONG O-E"); +like(chr(0x59C3), qr/\p{name=\/\ACJK UNIFIED IDEOGRAPH-59C3\z\/}/, + 'Verify name wildcards delimitters can be escaped'); +like(chr(0xD800), qr!\p{name=/\A\z/}!, + 'Verify works on matching an empty name'); + { no warnings 'deprecated'; is("\N{LINE FEED}", "\N{LINE FEED (LF)}", 'Verify "\N{LINE FEED}" eq "\N{LINE FEED (LF)}"', 'Verify \N{LINE FEED} eq \N{LINE FEED (LF)}'); @@ -742,12 +792,12 @@ is($_, 'foobar', 'Verify charnames.pm doesnt clobbers $_'); # SADAHIRO Tomoyuki's suggestion is to ensure that the UTF-8ness of both # arguments are identical before calling index. # To do this can take advantage of the fact that unicore/Name.pl is 7 bit -# (or at least should be). So assert that that it's true here. EBCDIC +# (or at least should be). So assert that that is true here. EBCDIC # may be a problem (khw). my $names = do "unicore/Name.pl"; ok(defined $names, "Verify can read 'unicore/Name.pl'"); -my $non_ascii = native_to_latin1($names) =~ tr/\0-\177//c; +my $non_ascii = native_to_uni($names) =~ tr/\0-\177//c; ok(! $non_ascii, "Verify all official names are ASCII-only"); # Verify that charnames propagate to eval("") @@ -804,16 +854,28 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V is("\N{mychar1}", "e", "Outer block: verify that \\N{mychar1} works"); is(charnames::vianame("mychar1"), ord("e"), "Outer block: verify that vianame(mychar1) works"); is(charnames::string_vianame("mychar1"), "e", "Outer block: verify that string_vianame(mychar1) works"); + eval "qr/\\p{name=mychar1}/"; + like($@, qr/Can't find Unicode property definition "name=mychar1"/, + '\p{name=} returns an appropriate error message on an alias'); is("\N{mychar2}", "A", "Outer block: verify that \\N{mychar2} works"); is(charnames::vianame("mychar2"), ord("A"), "Outer block: verify that vianame(mychar2) works"); is(charnames::string_vianame("mychar2"), "A", "Outer block: verify that string_vianame(mychar2) works"); + eval "qr/\\p{name=mychar2}/"; + like($@, qr/Can't find Unicode property definition "name=mychar2"/, + '\p{name=} returns an appropriate error message on an alias'); is("\N{myprivate1}", "\x{E8000}", "Outer block: verify that \\N{myprivate1} works"); cmp_ok(charnames::vianame("myprivate1"), "==", 0xE8000, "Outer block: verify that vianame(myprivate1) works"); is(charnames::string_vianame("myprivate1"), chr(0xE8000), "Outer block: verify that string_vianame(myprivate1) works"); + eval "qr/\\p{name=myprivate1}/"; + like($@, qr/Can't find Unicode property definition "name=myprivate1"/, + '\p{name=} returns an appropriate error message on an alias'); is(charnames::viacode(0xE8000), "myprivate1", "Outer block: verify that myprivate1 viacode works"); is("\N{myprivate2}", "\x{100000}", "Outer block: verify that \\N{myprivate2} works"); cmp_ok(charnames::vianame("myprivate2"), "==", 0x100000, "Outer block: verify that vianame(myprivate2) works"); is(charnames::string_vianame("myprivate2"), chr(0x100000), "Outer block: verify that string_vianame(myprivate2) works"); + eval "qr/\\p{name=myprivate2}/"; + like($@, qr/Can't find Unicode property definition "name=myprivate2"/, + '\p{name=} returns an appropriate error message on an alias'); is(charnames::viacode(0x100000), "myprivate2", "Outer block: verify that myprivate2 viacode works"); is("\N{BE}", "\N{KATAKANA LETTER BE}", "Outer block: verify that \\N uses the correct script "); cmp_ok(charnames::vianame("BE"), "==", ord("\N{KATAKANA LETTER BE}"), "Outer block: verify that vianame uses the correct script"); @@ -821,6 +883,9 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V is("\N{Hiragana: BE}", $hiragana_be, "Outer block: verify that :short works with \\N"); cmp_ok(charnames::vianame("Hiragana: BE"), "==", ord($hiragana_be), "Outer block: verify that :short works with vianame"); cmp_ok(charnames::string_vianame("Hiragana: BE"), "==", $hiragana_be, "Outer block: verify that :short works with string_vianame"); + eval "qr/\\p{name=Hiragana: BE}/"; + like($@, qr/Can't find Unicode property definition "name=Hiragana: BE"/, + '\p{name=} returns an appropriate error message on :short attempt'); { use charnames ":full", @@ -974,11 +1039,9 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V $seed = srand; } - my $run_slow_tests = $ENV{PERL_RUN_SLOW_TESTS} || 0; - # We will look at the data grouped in "blocks" of the following # size. - my $block_size_bits = 7; # above 16 is not sensible + my $block_size_bits = 8; # above 16 is not sensible my $block_size = 2**$block_size_bits; # There are the regular names, like "SPACE", plus the ones @@ -987,7 +1050,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V # of the character. The percentage of each type to test is # fuzzily independently settable. This breaks down when the block size is # 1 or is large enough that both types of names occur in the same block - my $percentage_of_regular_names = ($run_slow_tests) ? 100 : 13; + my $percentage_of_regular_names = ($run_slow_tests) ? 100 : 10; my $percentage_of_algorithmic_names = (100 / $block_size); # 1 test/block # If wants everything tested, do so by changing the block size to 1 so @@ -1023,8 +1086,11 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V die "Can't open ../../lib/unicore/UnicodeData.txt: $!"; while (<$fh>) { chomp; - my ($code, $name, undef, undef, undef, undef, undef, undef, undef, undef, $u1name) = split ";"; - my $decimal = hex $code; + my ($code, $name, $category, undef, undef, undef, undef, undef, undef, undef, $u1name) = split ";"; + my $decimal = utf8::unicode_to_native(hex $code); + $code = sprintf("%04X", $decimal) unless $::IS_ASCII; + + $decimal = hex $code; # The Unicode version 1 name is used instead of any that are # marked . @@ -1034,7 +1100,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V $name = "" if $^V lt v5.17.0 && $decimal == 0x1F514; # ALERT overrides BELL - $name = 'ALERT' if $decimal == 7; + $name = 'ALERT' if $decimal == utf8::unicode_to_native(7); # Some don't have names, leave those array elements undefined next unless $name; @@ -1053,12 +1119,26 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V /^(.*?);/; my $end_decimal = hex $1; - # Only the CJK (and the Hangul which are instead dealt with below) - # ones have names, and they all have the code point as part of the - # name, which we can construct - if ($name =~ /^> $block_size_bits; $algorithmic_names_count[$block]++; } @@ -1209,7 +1289,10 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V # These four code points now have names, from NameAlias, but # aren't listed as having names in UnicodeData.txt, so viacode # returns their alias names, not undef - next if $i == 0x80 || $i == 0x81 || $i == 0x84 || $i == 0x99; + next if $i == utf8::unicode_to_native(0x80) + || $i == utf8::unicode_to_native(0x81) + || $i == utf8::unicode_to_native(0x84) + || $i == utf8::unicode_to_native(0x99); # If there is no name for this code point, all we can # test is that. @@ -1223,7 +1306,11 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V # These four code points have a different Unicode1 name than # regular name, and viacode has already specifically tested # for the regular name - if ($i != 0x0a && $i != 0x0c && $i != 0x0d && $i != 0x85) { + if ($i != utf8::unicode_to_native(0x0a) + && $i != utf8::unicode_to_native(0x0c) + && $i != utf8::unicode_to_native(0x0d) + && $i != utf8::unicode_to_native(0x85)) + { $all_pass &= is(charnames::viacode($i), $names[$i], "Verify viacode(0x$hex) is \"$names[$i]\""); } @@ -1234,6 +1321,9 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V $all_pass &= ok(! defined charnames::vianame("CJK UNIFIED IDEOGRAPH-$hex"), "Verify vianame(\"CJK UNIFIED IDEOGRAPH-$hex\") is undefined"); } else { $all_pass &= ok(! defined charnames::string_vianame("CJK UNIFIED IDEOGRAPH-$hex"), "Verify string_vianame(\"CJK UNIFIED IDEOGRAPH-$hex\") is undefined"); + eval "qr/\\p{name=CJK UNIFIED IDEOGRAPH-$hex}/"; + $all_pass &= like($@, qr/Can't find Unicode property definition "name=CJK UNIFIED IDEOGRAPH-$hex\"/, + "Verify string_vianame(\"CJK UNIFIED IDEOGRAPH-$hex\") is undefined"); } } } @@ -1249,11 +1339,18 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V s/^\s*#.*//; next unless $_; my ($name, $codes) = split ";"; - my $utf8 = pack("U*", map { hex } split " ", $codes); + $codes =~ s{ \b 00 ( [0-9A-F]{2} ) \b } + { sprintf("%04X", utf8::unicode_to_native(hex $1)) }gxe + if ord "A" != 65; + my $utf8 = pack("W*", map { hex } split " ", $codes); is(charnames::string_vianame($name), $utf8, "Verify string_vianame(\"$name\") is the proper utf8"); my $loose_name = get_loose_name($name); use charnames ":loose"; is(charnames::string_vianame($loose_name), $utf8, "Verify string_vianame(\"$loose_name\") is the proper utf8"); + + like($utf8, qr/^\p{name=$name}$/, "Verify /\p{name=$name}\$/ is the proper utf8"); + like($utf8, qr/^\p{name=$loose_name}$/, "Verify /\p{name=$loose_name}\$/ is the proper utf8"); + like($utf8, qr!^\p{name=/\A$name\z/}!, "Verify /\p{name=/$\A$name\z/} is the proper utf8"); #diag("$name, $utf8"); } close $fh;