X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/fa1e80ba41f52550cc193f33fda5d48962169788..b489e20f5bc292b1e257500b577944b52ec6c7d5:/lib/_charnames.pm diff --git a/lib/_charnames.pm b/lib/_charnames.pm index 26ba0fa..bb7d7c6 100644 --- a/lib/_charnames.pm +++ b/lib/_charnames.pm @@ -65,10 +65,10 @@ $Carp::Internal{ (__PACKAGE__) } = 1; my %system_aliases = ( - 'SINGLE-SHIFT 2' => pack("U", utf8::unicode_to_native(0x8E)), - 'SINGLE-SHIFT 3' => pack("U", utf8::unicode_to_native(0x8F)), - 'PRIVATE USE 1' => pack("U", utf8::unicode_to_native(0x91)), - 'PRIVATE USE 2' => pack("U", utf8::unicode_to_native(0x92)), + 'SINGLE-SHIFT 2' => chr utf8::unicode_to_native(0x8E), + 'SINGLE-SHIFT 3' => chr utf8::unicode_to_native(0x8F), + 'PRIVATE USE 1' => chr utf8::unicode_to_native(0x91), + 'PRIVATE USE 2' => chr utf8::unicode_to_native(0x92), ); # These are the aliases above that differ under :loose and :full matching @@ -77,15 +77,15 @@ my %system_aliases = ( #); #my %deprecated_aliases; -#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V lt v5.17.0; +#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt v5.17.0; #my %loose_deprecated_aliases = ( #); # These are special cased in :loose matching, differing only in a medial # hyphen -my $HANGUL_JUNGSEONG_O_E_utf8 = pack("U", 0x1180); -my $HANGUL_JUNGSEONG_OE_utf8 = pack("U", 0x116C); +my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180; +my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C; my $txt; # The table of official character names @@ -163,7 +163,7 @@ sub alias (@) # Set up a single alias } if ($value =~ $decimal_qr) { no warnings qw(non_unicode surrogate nonchar); # Allow any of these - $^H{charnames_ord_aliases}{$name} = pack("U", $value); + $^H{charnames_ord_aliases}{$name} = chr $value; # Use a canonical form. $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name; @@ -285,7 +285,7 @@ sub lookup_name ($$$) { # It looks first in the aliases, then in the large table of official Unicode # names. - my $utf8; # The string result + my $result; # The string result my $save_input; if ($runtime) { @@ -298,7 +298,7 @@ sub lookup_name ($$$) { || (! defined $hints_ref->{charnames_full} && ! defined $hints_ref->{charnames_loose}); - # At runtime, but currently not at compile time, $^H gets + # At runtime, but currently not at compile time, %^H gets # stringified, so un-stringify back to the original data structures. # These get thrown away by perl before the next invocation # Also fill in the hash with the non-stringified data. @@ -321,7 +321,7 @@ sub lookup_name ($$$) { # User alias should be checked first or else can't override ours, and if we # were to add any, could conflict with theirs. if (exists $^H{charnames_ord_aliases}{$name}) { - $utf8 = $^H{charnames_ord_aliases}{$name}; + $result = $^H{charnames_ord_aliases}{$name}; } elsif (exists $^H{charnames_name_aliases}{$name}) { $name = $^H{charnames_name_aliases}{$name}; @@ -362,13 +362,13 @@ sub lookup_name ($$$) { # interested in convenience over speed, and the time for this second check # is miniscule compared to the rest of the routine. if (exists $system_aliases{$lookup_name}) { - $utf8 = $system_aliases{$lookup_name}; + $result = $system_aliases{$lookup_name}; } # There are currently no entries in this hash, so don't waste time looking # for them. But the code is retained for the unlikely possibility that # some will be added in the future. # elsif ($loose && exists $loose_system_aliases{$lookup_name}) { -# $utf8 = $loose_system_aliases{$lookup_name}; +# $result = $loose_system_aliases{$lookup_name}; # } # if (exists $deprecated_aliases{$lookup_name}) { # require warnings; @@ -376,7 +376,7 @@ sub lookup_name ($$$) { # "Unicode character name \"$name\" is deprecated, use \"" # . viacode(ord $deprecated_aliases{$lookup_name}) # . "\" instead"); -# $utf8 = $deprecated_aliases{$lookup_name}; +# $result = $deprecated_aliases{$lookup_name}; # } # There are currently no entries in this hash, so don't waste time looking # for them. But the code is retained for the unlikely possibility that @@ -387,21 +387,21 @@ sub lookup_name ($$$) { # "Unicode character name \"$name\" is deprecated, use \"" # . viacode(ord $loose_deprecated_aliases{$lookup_name}) # . "\" instead"); -# $utf8 = $loose_deprecated_aliases{$lookup_name}; +# $result = $loose_deprecated_aliases{$lookup_name}; # } } my @off; # Offsets into table of pattern match begin and end # If haven't found it yet... - if (! defined $utf8) { + if (! defined $result) { # See if has looked this input up earlier. if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) { - $utf8 = $full_names_cache{$name}; + $result = $full_names_cache{$name}; } elsif ($loose && exists $loose_names_cache{$name}) { - $utf8 = $loose_names_cache{$name}; + $result = $loose_names_cache{$name}; } else { # Here, must do a look-up @@ -432,7 +432,7 @@ sub lookup_name ($$$) { if (($loose || $^H{charnames_full}) && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose)))) { - $utf8 = pack("U", $ord); + $result = chr $ord; } else { @@ -525,14 +525,14 @@ sub lookup_name ($$$) { # therefore yield the very last character in the table, which should # also be a \n, so the statement works anyway.) if (substr($txt, $off[0] - 7, 1) eq "\n") { - $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5)); + $result = chr CORE::hex substr($txt, $off[0] - 6, 5); # Handle the single loose matching special case, in which two names # differ only by a single medial hyphen. If the original had a # hyphen (or more) in the right place, then it is that one. - $utf8 = $HANGUL_JUNGSEONG_O_E_utf8 + $result = $HANGUL_JUNGSEONG_O_E_utf8 if $loose - && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8 + && $result eq $HANGUL_JUNGSEONG_OE_utf8 && $name =~ m/O \s* - [-\s]* E/ix; # Note that this wouldn't work if there were a 2nd # OE in the name @@ -544,7 +544,7 @@ sub lookup_name ($$$) { # The +1 skips past that newline, or, if the rindex() fails, to put # us to an offset of zero. my $charstart = rindex($txt, "\n", $off[0] - 7) + 1; - $utf8 = pack("U*", map { CORE::hex } + $result = pack("W*", map { CORE::hex } split " ", substr($txt, $charstart, $off[0] - $charstart - 1)); } } @@ -553,15 +553,27 @@ sub lookup_name ($$$) { # again, but only if it came from the one search that we cache. # (Haven't bothered with the pain of sorting out scoping issues for the # scripts searches.) - $cache_ref->{$name} = $utf8 if defined $cache_ref; + $cache_ref->{$name} = $result if defined $cache_ref; } } - - # Here, have the utf8. If the return is to be an ord, must be any single - # character. + # Here, have the result character. If the return is to be an ord, must be + # any single character. if ($wants_ord) { - return ord($utf8) if length $utf8 == 1; + return ord($result) if length $result == 1; + } + elsif (! utf8::is_utf8($result)) { + + # Here isn't UTF-8. That's OK if it is all ASCII, or we are being called + # at compile time where we know we can guarantee that Unicode rules are + # correctly imposed on the result, or under 'bytes' where we don't want + # those rules. But otherwise we have to make it UTF8 to guarantee Unicode + # rules on the returned string. + return $result if ! $runtime + || (caller $runtime)[8] & $bytes::hint_bits + || $result !~ /[[:^ascii:]]/; + utf8::upgrade($result); + return $result; } else { @@ -570,7 +582,7 @@ sub lookup_name ($$$) { my $in_bytes = ($runtime) ? (caller $runtime)[8] & $bytes::hint_bits : $^H & $bytes::hint_bits; - return $utf8 if (! $in_bytes || utf8::downgrade($utf8, 1)) # The 1 arg + return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg # means don't die on failure } @@ -594,10 +606,10 @@ sub lookup_name ($$$) { # Only other possible failure here is from use bytes. if ($runtime) { - carp not_legal_use_bytes_msg($name, $utf8); + carp not_legal_use_bytes_msg($name, $result); return; } else { - croak not_legal_use_bytes_msg($name, $utf8); + croak not_legal_use_bytes_msg($name, $result); } } # lookup_name