X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/558de9fac032ae33c59eaf248f2c2ff4ef66b176..4943a717433fa4a342d031e05d80444da6cb0e11:/lib/_charnames.pm diff --git a/lib/_charnames.pm b/lib/_charnames.pm index 18b71a9..bb7d7c6 100644 --- a/lib/_charnames.pm +++ b/lib/_charnames.pm @@ -6,8 +6,7 @@ package _charnames; use strict; use warnings; -use File::Spec; -our $VERSION = '1.35'; +our $VERSION = '1.43'; use unicore::Name; # mktables-generated algorithmically-defined names use bytes (); # for $bytes::hint_bits @@ -66,10 +65,10 @@ $Carp::Internal{ (__PACKAGE__) } = 1; my %system_aliases = ( - 'SINGLE-SHIFT 2' => pack("U", 0x8E), - 'SINGLE-SHIFT 3' => pack("U", 0x8F), - 'PRIVATE USE 1' => pack("U", 0x91), - 'PRIVATE USE 2' => pack("U", 0x92), + 'SINGLE-SHIFT 2' => chr utf8::unicode_to_native(0x8E), + 'SINGLE-SHIFT 3' => chr utf8::unicode_to_native(0x8F), + 'PRIVATE USE 1' => chr utf8::unicode_to_native(0x91), + 'PRIVATE USE 2' => chr utf8::unicode_to_native(0x92), ); # These are the aliases above that differ under :loose and :full matching @@ -78,15 +77,15 @@ my %system_aliases = ( #); #my %deprecated_aliases; -#$deprecated_aliases{'BELL'} = pack("U", 0x07) if $^V lt v5.17.0; +#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt v5.17.0; #my %loose_deprecated_aliases = ( #); # These are special cased in :loose matching, differing only in a medial # hyphen -my $HANGUL_JUNGSEONG_O_E_utf8 = pack("U", 0x1180); -my $HANGUL_JUNGSEONG_OE_utf8 = pack("U", 0x116C); +my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180; +my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C; my $txt; # The table of official character names @@ -144,6 +143,7 @@ sub carp sub alias (@) # Set up a single alias { my @errors; + my $nbsp = chr utf8::unicode_to_native(0xA0); my $alias = ref $_[0] ? $_[0] : { @_ }; foreach my $name (sort keys %$alias) { # Sort only because it helps having @@ -157,26 +157,58 @@ sub alias (@) # Set up a single alias # hex, but makes the code easier to maintain, and is called # infrequently, only at compile-time if ($value !~ $decimal_qr && $value =~ $hex_qr) { - $value = CORE::hex $1; + my $temp = CORE::hex $1; + $temp = utf8::unicode_to_native($temp) if $value =~ /^[Uu]\+/; + $value = $temp; } if ($value =~ $decimal_qr) { - no warnings qw(non_unicode surrogate nonchar); # Allow any non-malformed - $^H{charnames_ord_aliases}{$name} = pack("U", $value); + no warnings qw(non_unicode surrogate nonchar); # Allow any of these + $^H{charnames_ord_aliases}{$name} = chr $value; # Use a canonical form. $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name; } else { - # This regex needs to be sync'd with the code in toke.c that checks - # for the same thing - if ($name !~ / ^ - \p{_Perl_Charname_Begin} - \p{_Perl_Charname_Continue}* - $ /x) { - push @errors, $name; + my $ok_portion = ""; + $ok_portion = $1 if $name =~ / ^ ( + \p{_Perl_Charname_Begin} + \p{_Perl_Charname_Continue}* + ) /x; + + # If the name was fully correct, the above should have matched all of + # it. + if (length $ok_portion < length $name) { + my $first_bad = substr($name, length($ok_portion), 1); + push @errors, "Invalid character in charnames alias definition; " + . "marked by <-- HERE in '$ok_portion$first_bad<-- HERE " + . substr($name, length($ok_portion) + 1) + . "'"; } else { - $^H{charnames_name_aliases}{$name} = $value; + if ($name =~ / ( .* \s ) ( \s* ) $ /x) { + push @errors, "charnames alias definitions may not contain " + . "trailing white-space; marked by <-- HERE in " + . "'$1 <-- HERE " . $2 . "'"; + next; + } + + # Use '+' instead of '*' in this regex, because any trailing + # blanks have already been found + if ($name =~ / ( .*? \s{2} ) ( .+ ) /x) { + push @errors, "charnames alias definitions may not contain a " + . "sequence of multiple spaces; marked by <-- HERE " + . "in '$1 <-- HERE " . $2 . "'"; + next; + } + + $^H{charnames_name_aliases}{$name} = $value; + if (warnings::enabled('deprecated') + && $name =~ / ( .* $nbsp ) ( .* ) $ /x) + { + carp "NO-BREAK SPACE in a charnames alias definition is " + . "deprecated; marked by <-- HERE in '$1 <-- HERE " + . $2 . "'"; + } } } } @@ -184,12 +216,6 @@ sub alias (@) # Set up a single alias # We find and output all errors from this :alias definition, rather than # failing on the first one, so fewer runs are needed to get it to compile if (@errors) { - foreach my $name (@errors) { - my $ok = ""; - $ok = $1 if $name =~ / ^ ( \p{Alpha} [-\p{XPosixWord} ():\xa0]* ) /x; - my $first_bad = substr($name, length($ok), 1); - $name = "Invalid character in charnames alias definition; marked by <-- HERE in '$ok$first_bad<-- HERE " . substr($name, length($ok) + 1) . "'"; - } croak join "\n", @errors; } @@ -210,15 +236,16 @@ sub not_legal_use_bytes_msg { sub alias_file ($) # Reads a file containing alias definitions { + require File::Spec; my ($arg, $file) = @_; if (-f $arg && File::Spec->file_name_is_absolute ($arg)) { $file = $arg; } - elsif ($arg =~ m/^\w+$/) { + elsif ($arg =~ m/ ^ \p{_Perl_IDStart} \p{_Perl_IDCont}* $/x) { $file = "unicore/${arg}_alias.pl"; } else { - croak "Charnames alias files can only have identifier characters"; + croak "Charnames alias file names can only have identifier characters"; } if (my @alias = do $file) { @alias == 1 && !defined $alias[0] and @@ -258,7 +285,7 @@ sub lookup_name ($$$) { # It looks first in the aliases, then in the large table of official Unicode # names. - my $utf8; # The string result + my $result; # The string result my $save_input; if ($runtime) { @@ -271,7 +298,7 @@ sub lookup_name ($$$) { || (! defined $hints_ref->{charnames_full} && ! defined $hints_ref->{charnames_loose}); - # At runtime, but currently not at compile time, $^H gets + # At runtime, but currently not at compile time, %^H gets # stringified, so un-stringify back to the original data structures. # These get thrown away by perl before the next invocation # Also fill in the hash with the non-stringified data. @@ -294,7 +321,7 @@ sub lookup_name ($$$) { # User alias should be checked first or else can't override ours, and if we # were to add any, could conflict with theirs. if (exists $^H{charnames_ord_aliases}{$name}) { - $utf8 = $^H{charnames_ord_aliases}{$name}; + $result = $^H{charnames_ord_aliases}{$name}; } elsif (exists $^H{charnames_name_aliases}{$name}) { $name = $^H{charnames_name_aliases}{$name}; @@ -335,13 +362,13 @@ sub lookup_name ($$$) { # interested in convenience over speed, and the time for this second check # is miniscule compared to the rest of the routine. if (exists $system_aliases{$lookup_name}) { - $utf8 = $system_aliases{$lookup_name}; + $result = $system_aliases{$lookup_name}; } # There are currently no entries in this hash, so don't waste time looking # for them. But the code is retained for the unlikely possibility that # some will be added in the future. # elsif ($loose && exists $loose_system_aliases{$lookup_name}) { -# $utf8 = $loose_system_aliases{$lookup_name}; +# $result = $loose_system_aliases{$lookup_name}; # } # if (exists $deprecated_aliases{$lookup_name}) { # require warnings; @@ -349,7 +376,7 @@ sub lookup_name ($$$) { # "Unicode character name \"$name\" is deprecated, use \"" # . viacode(ord $deprecated_aliases{$lookup_name}) # . "\" instead"); -# $utf8 = $deprecated_aliases{$lookup_name}; +# $result = $deprecated_aliases{$lookup_name}; # } # There are currently no entries in this hash, so don't waste time looking # for them. But the code is retained for the unlikely possibility that @@ -360,21 +387,21 @@ sub lookup_name ($$$) { # "Unicode character name \"$name\" is deprecated, use \"" # . viacode(ord $loose_deprecated_aliases{$lookup_name}) # . "\" instead"); -# $utf8 = $loose_deprecated_aliases{$lookup_name}; +# $result = $loose_deprecated_aliases{$lookup_name}; # } } my @off; # Offsets into table of pattern match begin and end # If haven't found it yet... - if (! defined $utf8) { + if (! defined $result) { # See if has looked this input up earlier. if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) { - $utf8 = $full_names_cache{$name}; + $result = $full_names_cache{$name}; } elsif ($loose && exists $loose_names_cache{$name}) { - $utf8 = $loose_names_cache{$name}; + $result = $loose_names_cache{$name}; } else { # Here, must do a look-up @@ -405,7 +432,7 @@ sub lookup_name ($$$) { if (($loose || $^H{charnames_full}) && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose)))) { - $utf8 = pack("U", $ord); + $result = chr $ord; } else { @@ -462,7 +489,7 @@ sub lookup_name ($$$) { /xs) { # Even in non-loose matching, the script traditionally has been - # case insensitve + # case insensitive $scripts_trie = "\U$1"; $lookup_name = $2; @@ -498,14 +525,14 @@ sub lookup_name ($$$) { # therefore yield the very last character in the table, which should # also be a \n, so the statement works anyway.) if (substr($txt, $off[0] - 7, 1) eq "\n") { - $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5)); + $result = chr CORE::hex substr($txt, $off[0] - 6, 5); # Handle the single loose matching special case, in which two names # differ only by a single medial hyphen. If the original had a # hyphen (or more) in the right place, then it is that one. - $utf8 = $HANGUL_JUNGSEONG_O_E_utf8 + $result = $HANGUL_JUNGSEONG_O_E_utf8 if $loose - && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8 + && $result eq $HANGUL_JUNGSEONG_OE_utf8 && $name =~ m/O \s* - [-\s]* E/ix; # Note that this wouldn't work if there were a 2nd # OE in the name @@ -517,7 +544,7 @@ sub lookup_name ($$$) { # The +1 skips past that newline, or, if the rindex() fails, to put # us to an offset of zero. my $charstart = rindex($txt, "\n", $off[0] - 7) + 1; - $utf8 = pack("U*", map { CORE::hex } + $result = pack("W*", map { CORE::hex } split " ", substr($txt, $charstart, $off[0] - $charstart - 1)); } } @@ -526,15 +553,27 @@ sub lookup_name ($$$) { # again, but only if it came from the one search that we cache. # (Haven't bothered with the pain of sorting out scoping issues for the # scripts searches.) - $cache_ref->{$name} = $utf8 if defined $cache_ref; + $cache_ref->{$name} = $result if defined $cache_ref; } } - - # Here, have the utf8. If the return is to be an ord, must be any single - # character. + # Here, have the result character. If the return is to be an ord, must be + # any single character. if ($wants_ord) { - return ord($utf8) if length $utf8 == 1; + return ord($result) if length $result == 1; + } + elsif (! utf8::is_utf8($result)) { + + # Here isn't UTF-8. That's OK if it is all ASCII, or we are being called + # at compile time where we know we can guarantee that Unicode rules are + # correctly imposed on the result, or under 'bytes' where we don't want + # those rules. But otherwise we have to make it UTF8 to guarantee Unicode + # rules on the returned string. + return $result if ! $runtime + || (caller $runtime)[8] & $bytes::hint_bits + || $result !~ /[[:^ascii:]]/; + utf8::upgrade($result); + return $result; } else { @@ -543,7 +582,7 @@ sub lookup_name ($$$) { my $in_bytes = ($runtime) ? (caller $runtime)[8] & $bytes::hint_bits : $^H & $bytes::hint_bits; - return $utf8 if (! $in_bytes || utf8::downgrade($utf8, 1)) # The 1 arg + return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg # means don't die on failure } @@ -567,10 +606,10 @@ sub lookup_name ($$$) { # Only other possible failure here is from use bytes. if ($runtime) { - carp not_legal_use_bytes_msg($name, $utf8); + carp not_legal_use_bytes_msg($name, $result); return; } else { - croak not_legal_use_bytes_msg($name, $utf8); + croak not_legal_use_bytes_msg($name, $result); } } # lookup_name @@ -612,6 +651,7 @@ sub import ref $alias eq "HASH" or croak "Only HASH reference supported as argument to :alias"; alias ($alias); + $promote = 1; next; } if ($alias =~ m{:(\w+)$}) { @@ -620,7 +660,7 @@ sub import alias_file ($1) and $promote = 1; next; } - alias_file ($alias); + alias_file ($alias) and $promote = 1; next; } if (substr($arg, 0, 1) eq ':' @@ -683,6 +723,11 @@ sub import # not an issue. my %viacode; +my $no_name_code_points_re = join "|", map { sprintf("%05X", + utf8::unicode_to_native($_)) } + 0x80, 0x81, 0x84, 0x99; +$no_name_code_points_re = qr/$no_name_code_points_re/; + sub viacode { # Returns the name of the code point argument @@ -703,8 +748,10 @@ sub viacode { if ($arg =~ $decimal_qr) { $hex = sprintf "%05X", $arg; } elsif ($arg =~ $hex_qr) { + $hex = CORE::hex $1; + $hex = utf8::unicode_to_native($hex) if $arg =~ /^[Uu]\+/; # Below is the line that differs from the _getcode() source - $hex = sprintf "%05X", hex $1; + $hex = sprintf "%05X", $hex; } else { carp("unexpected arg \"$arg\" to charnames::viacode()"); return; @@ -737,7 +784,7 @@ sub viacode { $return = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]); # If not one of these 4 code points, return what we've found. - if ($hex !~ / ^ 000 (?: 8[014] | 99 ) $ /x) { + if ($hex !~ / ^ $no_name_code_points_re $ /x) { $viacode{$hex} = $return; return $return; } @@ -772,7 +819,7 @@ sub viacode { } return; -} # _viacode +} # viacode 1;