X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b17e42fe49ac2df2159376ef44fd8f417e8db0a3..d89ea36076afa0a17a3faa7cb33a1c9c215a26eb:/lib/charnames.t diff --git a/lib/charnames.t b/lib/charnames.t index 62bb8ca..cd87350 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -1,7 +1,7 @@ #!./perl use strict; -# Test charnames.pm. If $ENV{PERL_RUN_SLOW_TESTS} is unset or 0, a random +# Test charnames.pm. If $ENV{PERL_RUN_SLOW_TESTS} is unset or 0, a random # selection of names is tested, a higher percentage of regular names is tested # than algorithmically-determined names. @@ -29,9 +29,8 @@ our $local_tests = 'no_plan'; # ---- For the alias extensions require "../t/lib/common.pl"; -use charnames ':full'; - -is("Here\N{EXCLAMATION MARK}?", "Here!?"); +is("Here\N{EXCLAMATION MARK}?", "Here!?", "Basic sanity, autoload of :full upon \\N"); +is("\N{latin: Q}", "Q", "autoload of :short upon \\N"); { use bytes; # TEST -utf8 can switch utf8 on @@ -42,32 +41,34 @@ use charnames ":full"; 1 EOE - like($@, "above 0xFF"); - ok(! defined $res); + like($@, qr/above 0xFF/, "Verify get warning for \\N{above ff} under 'use bytes' with :full"); + ok(! defined $res, "... and result is undefined"); $res = eval <<'EOE'; use charnames 'cyrillic'; "Here: \N{Be}!"; 1 EOE - like($@, "CYRILLIC CAPITAL LETTER BE.*above 0xFF"); + like($@, qr/CYRILLIC CAPITAL LETTER BE.*above 0xFF/, "Verify get warning under 'use bytes' with explicit script"); + ok(! defined $res, "... and result is undefined"); $res = eval <<'EOE'; use charnames ':full', ":alias" => { BOM => "LATIN SMALL LETTER B" }; "\N{BOM}"; EOE - is ($@, ""); + is ($@, "", "Verify that there is no warning for \\N{below 256} under 'use bytes'"); is ($res, 'b', "Verify that can redefine a standard alias"); } { - use charnames ':full', ":alias" => { mychar1 => "0xE8000", - mychar2 => 983040, # U+F0000 - mychar3 => "U+100000", - myctrl => 0x80, - mylarge => "U+111000", - }; + use charnames ":alias" => { mychar1 => "0xE8000", + mychar2 => 983040, # U+F0000 + mychar3 => "U+100000", + myctrl => utf8::unicode_to_native(0x80), + mylarge => "U+111000", + }; + is ("\N{PILE OF POO}", chr(0x1F4A9), "Verify :alias alone implies :full"); is ("\N{mychar1}", chr(0xE8000), "Verify that can define hex alias"); is (charnames::viacode(0xE8000), "mychar1", "And that can get the alias back"); is ("\N{mychar2}", chr(0xF0000), "Verify that can define decimal alias"); @@ -76,65 +77,91 @@ EOE is (charnames::viacode(0x100000), "mychar3", "And that can get the alias back"); is ("\N{mylarge}", chr(0x111000), "Verify that can define alias beyond Unicode"); is (charnames::viacode(0x111000), "mylarge", "And that can get the alias back"); - is (charnames::viacode(0x80), "myctrl", "Verify that can name a nameless control"); + is (charnames::viacode(utf8::unicode_to_native(0x80)), "myctrl", "Verify that can name a nameless control"); } -my $encoded_be; -my $encoded_alpha; -my $encoded_bet; -my $encoded_deseng; - -# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt -if (ord('A') == 65) { # as on ASCII or UTF-8 machines - $encoded_be = "\320\261"; - $encoded_alpha = "\316\261"; - $encoded_bet = "\327\221"; - $encoded_deseng = "\360\220\221\215"; -} -else { # EBCDIC where UTF-EBCDIC may be used (this may be 1047 specific since - # UTF-EBCDIC is codepage specific) - $encoded_be = "\270\102\130"; - $encoded_alpha = "\264\130"; - $encoded_bet = "\270\125\130"; - $encoded_deseng = "\336\102\103\124"; -} +my $encoded_be = byte_utf8a_to_utf8n("\320\261"); +my $encoded_alpha = byte_utf8a_to_utf8n("\316\261"); +my $encoded_bet = byte_utf8a_to_utf8n("\327\221"); +my $encoded_deseng = byte_utf8a_to_utf8n("\360\220\221\215"); sub to_bytes { unpack"U0a*", shift; } +sub get_loose_name ($) { # Modify name to stress the loose tests. + + # First, all lower case, + my $loose_name = lc shift; + + # Then squeeze out all the blanks not adjacent to hyphens, but make the + # spaces that are adjacent to hypens into two, to make sure the code isn't + # looking for just one when looking for non-medial hyphens. + $loose_name =~ s/ (?) { chomp; my ($code, $name, undef, undef, undef, undef, undef, undef, undef, undef, $u1name) = split ";"; - my $decimal = hex $code; + my $decimal = utf8::unicode_to_native(hex $code); + $code = sprintf("%04X", $decimal) unless $::IS_ASCII; + + $decimal = hex $code; # The Unicode version 1 name is used instead of any that are - # marked + # marked . $name = $u1name if $name eq ""; - $name = 'ALERT' if $decimal == 7; + # In earlier Perls, we reject this code point's name (BELL) + $name = "" if $^V lt v5.17.0 && $decimal == 0x1F514; - # XXX This test should be changed for 5.16 when we convert to use - # Unicode's BELL - $name = "" if $decimal == 0x1F514; + # ALERT overrides BELL + $name = 'ALERT' if $decimal == utf8::unicode_to_native(7); # Some don't have names, leave those array elements undefined next unless $name; @@ -960,33 +1056,77 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}"); } close $fh; - # The Hangul syllable names aren't in the file above; their names - # are algorithmically determinable, but to avoid perpetuating any - # programming errors, this file contains the complete list, gathered - # from the web. - while () { - chomp; - next unless $_; # Guard against empty lines getting inserted. - my ($code, $name) = split ";"; - my $decimal = hex $code; - $names[$decimal] = $name; - my $block = $decimal >> $block_size_bits; - $algorithmic_names_count[$block] = 1; + use Unicode::UCD; + if (pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) gt v1.1.5) { + # The Hangul syllable names aren't in the file above; their names + # are algorithmically determinable, but to avoid perpetuating any + # programming errors, this file contains the complete list, gathered + # from the web. + while () { + chomp; + next unless $_; # Guard against empty lines getting inserted. + my ($code, $name) = split ";"; + my $decimal = hex $code; + $names[$decimal] = $name; + my $block = $decimal >> $block_size_bits; + $algorithmic_names_count[$block] = 1; + } } - open $fh, "<", "../../lib/unicore/NameAliases.txt" or - die "Can't open ../../lib/unicore/NameAliases.txt: $!"; - while (<$fh>) { + my @name_aliases; + use Unicode::UCD; + if (ord('A') == 65 + && pack( "C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v6.1.0) + { + open my $fh, "<", "../../lib/unicore/NameAliases.txt" + or die "Can't open ../../lib/unicore/NameAliases.txt: $!"; + @name_aliases = <$fh> + } + else { + + # If this Unicode version doesn't have the full .txt file, or are on + # an EBCDIC platform where they need to be translated, get the data + # from prop_invmap() (which should do the translation) and convert it + # to the file's format + use Unicode::UCD 'prop_invmap'; + my ($invlist_ref, $invmap_ref, undef, $default) + = prop_invmap('Name_Alias'); + for my $i (0 .. @$invlist_ref - 1) { + + # Convert the aliases for code points that have just one alias to + # single element arrays for uniform handling below. + if (! ref $invmap_ref->[$i]) { + + # But we test only the real aliases, not the ones which are + # just really placeholders. + next if $invmap_ref->[$i] eq $default; + + $invmap_ref->[$i] = [ $invmap_ref->[$i] ]; + } + + + # Change each alias for the code point to the form that the file + # has + foreach my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] - 1) { + foreach my $value (@{$invmap_ref->[$i]}) { + $value =~ s/: /;/; + push @name_aliases, sprintf("%04X;%s\n", $j, $value); + } + } + } + } + + for (@name_aliases) { chomp; s/^\s*#.*//; next unless $_; - my ($hex, $name) = split ";"; + my ($hex, $name, $type) = split ";"; my $i = CORE::hex $hex; # Make sure that both aliases (the one in UnicodeData, and the one we # just read) return the same code point. test_vianame($i, $hex, $name); - test_vianame($i, $hex, $names[$i]); + test_vianame($i, $hex, $names[$i]) if $names[$i] ne ""; # Set up so that a test below of this code point will use the alias # instead of the less-correct original. We can't test here that @@ -994,7 +1134,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}"); # aliases for the same code point, and viacode should return only the # final one. So don't do it here; instead rely on the loop below to # pick up the test. - $names[$i] = $name; + $names[$i] = $name if $type eq 'correction'; } close $fh; @@ -1055,6 +1195,14 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}"); my $hex = sprintf("%04X", $i); if (! $names[$i]) { + # These four code points now have names, from NameAlias, but + # aren't listed as having names in UnicodeData.txt, so viacode + # returns their alias names, not undef + next if $i == utf8::unicode_to_native(0x80) + || $i == utf8::unicode_to_native(0x81) + || $i == utf8::unicode_to_native(0x84) + || $i == utf8::unicode_to_native(0x99); + # If there is no name for this code point, all we can # test is that. $all_pass &= ok(! defined charnames::viacode($i), "Verify viacode(0x$hex) is undefined"); @@ -1063,7 +1211,17 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}"); # Otherwise, test that the name and code point map # correctly. $all_pass &= test_vianame($i, $hex, $names[$i]); - $all_pass &= is(charnames::viacode($i), $names[$i], "Verify viacode(0x$hex) is \"$names[$i]\""); + + # These four code points have a different Unicode1 name than + # regular name, and viacode has already specifically tested + # for the regular name + if ($i != utf8::unicode_to_native(0x0a) + && $i != utf8::unicode_to_native(0x0c) + && $i != utf8::unicode_to_native(0x0d) + && $i != utf8::unicode_to_native(0x85)) + { + $all_pass &= is(charnames::viacode($i), $names[$i], "Verify viacode(0x$hex) is \"$names[$i]\""); + } # And make sure that a non-algorithmically named code # point doesn't also map to one that is. @@ -1081,18 +1239,26 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}"); $block = $end_block + 1; } - open $fh, "<", "../../lib/unicore/NamedSequences.txt" or - die "Can't open ../../lib/unicore/NamedSequences.txt: $!"; - while (<$fh>) { - chomp; - s/^\s*#.*//; - next unless $_; - my ($name, $codes) = split ";"; - my $utf8 = pack("U*", map { hex } split " ", $codes); - is(charnames::string_vianame($name), $utf8, "Verify string_vianame(\"$name\") is the proper utf8"); - #diag("$name, $utf8"); + if (open my $fh, "<", "../../lib/unicore/NamedSequences.txt") { + while (<$fh>) { + chomp; + s/^\s*#.*//; + next unless $_; + my ($name, $codes) = split ";"; + my $utf8 = pack("W*", map { hex } split " ", $codes); + is(charnames::string_vianame($name), $utf8, "Verify string_vianame(\"$name\") is the proper utf8"); + my $loose_name = get_loose_name($name); + use charnames ":loose"; + is(charnames::string_vianame($loose_name), $utf8, "Verify string_vianame(\"$loose_name\") is the proper utf8"); + #diag("$name, $utf8"); + } + close $fh; + } + else { + use Unicode::UCD; + die "Can't open ../../lib/unicore/NamedSequences.txt: $!" + if pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v4.1.0; } - close $fh; unless ($all_pass) {