X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/a48a707d737e2308435486144d00a4cc18bdfeac..59a30fd4a3ce39b64d30173fc1a45d8018e2fb1f:/lib/_charnames.pm

diff --git a/lib/_charnames.pm b/lib/_charnames.pm
index fa52a9b..c6169d1 100644
--- a/lib/_charnames.pm
+++ b/lib/_charnames.pm
@@ -6,8 +6,7 @@
 package _charnames;
 use strict;
 use warnings;
-use File::Spec;
-our $VERSION = '1.37';
+our $VERSION = '1.45';
 use unicore::Name;    # mktables-generated algorithmically-defined names
 
 use bytes ();          # for $bytes::hint_bits
@@ -23,8 +22,8 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
 # The official names with their code points are stored in a table in
 # lib/unicore/Name.pl which is read in as a large string (almost 3/4 Mb in
 # Unicode 6.0).  Each code point/name combination is separated by a \n in the
-# string.  (Some of the CJK and the Hangul syllable names are determined
-# instead algorithmically via subroutines stored instead in
+# string.  (Some of the CJK and the Hangul syllable names are instead
+# determined algorithmically via subroutines stored instead in
 # lib/unicore/Name.pm).  Because of the large size of this table, it isn't
 # converted into hashes for faster lookup.
 #
@@ -66,10 +65,10 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
 
 my %system_aliases = (
 
-    'SINGLE-SHIFT 2'                => pack("U", 0x8E),
-    'SINGLE-SHIFT 3'                => pack("U", 0x8F),
-    'PRIVATE USE 1'                 => pack("U", 0x91),
-    'PRIVATE USE 2'                 => pack("U", 0x92),
+    'SINGLE-SHIFT 2'                => chr utf8::unicode_to_native(0x8E),
+    'SINGLE-SHIFT 3'                => chr utf8::unicode_to_native(0x8F),
+    'PRIVATE USE 1'                 => chr utf8::unicode_to_native(0x91),
+    'PRIVATE USE 2'                 => chr utf8::unicode_to_native(0x92),
 );
 
 # These are the aliases above that differ under :loose and :full matching
@@ -78,15 +77,15 @@ my %system_aliases = (
 #);
 
 #my %deprecated_aliases;
-#$deprecated_aliases{'BELL'} = pack("U", 0x07) if $^V lt v5.17.0;
+#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt v5.17.0;
 
 #my %loose_deprecated_aliases = (
 #);
 
 # These are special cased in :loose matching, differing only in a medial
 # hyphen
-my $HANGUL_JUNGSEONG_O_E_utf8 = pack("U", 0x1180);
-my $HANGUL_JUNGSEONG_OE_utf8 = pack("U", 0x116C);
+my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180;
+my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C;
 
 
 my $txt;  # The table of official character names
@@ -144,6 +143,7 @@ sub carp
 sub alias (@) # Set up a single alias
 {
   my @errors;
+  my $nbsp = chr utf8::unicode_to_native(0xA0);
 
   my $alias = ref $_[0] ? $_[0] : { @_ };
   foreach my $name (sort keys %$alias) {  # Sort only because it helps having
@@ -157,39 +157,51 @@ sub alias (@) # Set up a single alias
     # hex, but makes the code easier to maintain, and is called
     # infrequently, only at compile-time
     if ($value !~ $decimal_qr && $value =~ $hex_qr) {
-      $value = CORE::hex $1;
+      my $temp = CORE::hex $1;
+      $temp = utf8::unicode_to_native($temp) if $value =~ /^[Uu]\+/;
+      $value = $temp;
     }
     if ($value =~ $decimal_qr) {
         no warnings qw(non_unicode surrogate nonchar); # Allow any of these
-        $^H{charnames_ord_aliases}{$name} = pack("U", $value);
+        $^H{charnames_ord_aliases}{$name} = chr $value;
 
         # Use a canonical form.
         $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name;
     }
     else {
-        # This regex needs to be sync'd with the code in toke.c that checks
-        # for the same thing
-        if ($name !~ / ^
-                       \p{_Perl_Charname_Begin}
-                       \p{_Perl_Charname_Continue}*
-                       $ /x) {
-
-          push @errors, $name;
+        my $ok_portion = "";
+        $ok_portion = $1 if $name =~ / ^ (
+                                            \p{_Perl_Charname_Begin}
+                                            \p{_Perl_Charname_Continue}*
+                                         ) /x;
+
+        # If the name was fully correct, the above should have matched all of
+        # it.
+        if (length $ok_portion < length $name) {
+          my $first_bad = substr($name, length($ok_portion), 1);
+          push @errors, "Invalid character in charnames alias definition; "
+                        . "marked by <-- HERE in '$ok_portion$first_bad<-- HERE "
+                        . substr($name, length($ok_portion) + 1)
+                        . "'";
         }
         else {
-          $^H{charnames_name_aliases}{$name} = $value;
-
-          if (warnings::enabled('deprecated')) {
             if ($name =~ / ( .* \s ) ( \s* ) $ /x) {
-              carp "Trailing white-space in a charnames alias definition is deprecated; marked by <-- HERE in '$1 <-- HERE " . $2 . "'";
+              push @errors, "charnames alias definitions may not contain "
+                            . "trailing white-space; marked by <-- HERE in "
+                            . "'$1 <-- HERE " . $2 . "'";
+              next;
             }
 
             # Use '+' instead of '*' in this regex, because any trailing
-            # blanks have already been warned about.
+            # blanks have already been found
             if ($name =~ / ( .*? \s{2} ) ( .+ ) /x) {
-              carp "A sequence of multiple spaces in a charnames alias definition is deprecated; marked by <-- HERE in '$1 <-- HERE " . $2 . "'";
+              push @errors, "charnames alias definitions may not contain a "
+                            . "sequence of multiple spaces; marked by <-- HERE "
+                            . "in '$1 <-- HERE " . $2 . "'";
+              next;
             }
-          }
+
+            $^H{charnames_name_aliases}{$name} = $value;
         }
     }
   }
@@ -197,12 +209,6 @@ sub alias (@) # Set up a single alias
   # We find and output all errors from this :alias definition, rather than
   # failing on the first one, so fewer runs are needed to get it to compile
   if (@errors) {
-    foreach my $name (@errors) {
-      my $ok = "";
-      $ok = $1 if $name =~ / ^ ( \p{Alpha} [-\p{XPosixWord} ():\xa0]* ) /x;
-      my $first_bad = substr($name, length($ok), 1);
-      $name = "Invalid character in charnames alias definition; marked by <-- HERE in '$ok$first_bad<-- HERE " . substr($name, length($ok) + 1) . "'";
-    }
     croak join "\n", @errors;
   }
 
@@ -223,6 +229,7 @@ sub not_legal_use_bytes_msg {
 
 sub alias_file ($)  # Reads a file containing alias definitions
 {
+  require File::Spec;
   my ($arg, $file) = @_;
   if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
     $file = $arg;
@@ -271,7 +278,7 @@ sub lookup_name ($$$) {
   # It looks first in the aliases, then in the large table of official Unicode
   # names.
 
-  my $utf8;       # The string result
+  my $result;       # The string result
   my $save_input;
 
   if ($runtime) {
@@ -284,7 +291,7 @@ sub lookup_name ($$$) {
                               || (! defined $hints_ref->{charnames_full}
                                   && ! defined $hints_ref->{charnames_loose});
 
-    # At runtime, but currently not at compile time, $^H gets
+    # At runtime, but currently not at compile time, %^H gets
     # stringified, so un-stringify back to the original data structures.
     # These get thrown away by perl before the next invocation
     # Also fill in the hash with the non-stringified data.
@@ -307,7 +314,7 @@ sub lookup_name ($$$) {
   # User alias should be checked first or else can't override ours, and if we
   # were to add any, could conflict with theirs.
   if (exists $^H{charnames_ord_aliases}{$name}) {
-    $utf8 = $^H{charnames_ord_aliases}{$name};
+    $result = $^H{charnames_ord_aliases}{$name};
   }
   elsif (exists $^H{charnames_name_aliases}{$name}) {
     $name = $^H{charnames_name_aliases}{$name};
@@ -348,13 +355,13 @@ sub lookup_name ($$$) {
     # interested in convenience over speed, and the time for this second check
     # is miniscule compared to the rest of the routine.
     if (exists $system_aliases{$lookup_name}) {
-      $utf8 = $system_aliases{$lookup_name};
+      $result = $system_aliases{$lookup_name};
     }
     # There are currently no entries in this hash, so don't waste time looking
     # for them.  But the code is retained for the unlikely possibility that
     # some will be added in the future.
 #    elsif ($loose && exists $loose_system_aliases{$lookup_name}) {
-#      $utf8 = $loose_system_aliases{$lookup_name};
+#      $result = $loose_system_aliases{$lookup_name};
 #    }
 #    if (exists $deprecated_aliases{$lookup_name}) {
 #      require warnings;
@@ -362,7 +369,7 @@ sub lookup_name ($$$) {
 #                       "Unicode character name \"$name\" is deprecated, use \""
 #                       . viacode(ord $deprecated_aliases{$lookup_name})
 #                       . "\" instead");
-#      $utf8 = $deprecated_aliases{$lookup_name};
+#      $result = $deprecated_aliases{$lookup_name};
 #    }
     # There are currently no entries in this hash, so don't waste time looking
     # for them.  But the code is retained for the unlikely possibility that
@@ -373,21 +380,21 @@ sub lookup_name ($$$) {
 #                       "Unicode character name \"$name\" is deprecated, use \""
 #                       . viacode(ord $loose_deprecated_aliases{$lookup_name})
 #                       . "\" instead");
-#      $utf8 = $loose_deprecated_aliases{$lookup_name};
+#      $result = $loose_deprecated_aliases{$lookup_name};
 #    }
   }
 
   my @off;  # Offsets into table of pattern match begin and end
 
   # If haven't found it yet...
-  if (! defined $utf8) {
+  if (! defined $result) {
 
     # See if has looked this input up earlier.
     if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) {
-      $utf8 = $full_names_cache{$name};
+      $result = $full_names_cache{$name};
     }
     elsif ($loose && exists $loose_names_cache{$name}) {
-      $utf8 = $loose_names_cache{$name};
+      $result = $loose_names_cache{$name};
     }
     else { # Here, must do a look-up
 
@@ -418,7 +425,7 @@ sub lookup_name ($$$) {
       if (($loose || $^H{charnames_full})
           && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose))))
       {
-        $utf8 = pack("U", $ord);
+        $result = chr $ord;
       }
       else {
 
@@ -475,7 +482,7 @@ sub lookup_name ($$$) {
                                   /xs)
           {
               # Even in non-loose matching, the script traditionally has been
-              # case insensitve
+              # case insensitive
               $scripts_trie = "\U$1";
               $lookup_name = $2;
 
@@ -511,14 +518,14 @@ sub lookup_name ($$$) {
         # therefore yield the very last character in the table, which should
         # also be a \n, so the statement works anyway.)
         if (substr($txt, $off[0] - 7, 1) eq "\n") {
-          $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5));
+          $result = chr CORE::hex substr($txt, $off[0] - 6, 5);
 
           # Handle the single loose matching special case, in which two names
           # differ only by a single medial hyphen.  If the original had a
           # hyphen (or more) in the right place, then it is that one.
-          $utf8 = $HANGUL_JUNGSEONG_O_E_utf8
+          $result = $HANGUL_JUNGSEONG_O_E_utf8
                   if $loose
-                     && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8
+                     && $result eq $HANGUL_JUNGSEONG_OE_utf8
                      && $name =~ m/O \s* - [-\s]* E/ix;
                      # Note that this wouldn't work if there were a 2nd
                      # OE in the name
@@ -530,7 +537,7 @@ sub lookup_name ($$$) {
           # The +1 skips past that newline, or, if the rindex() fails, to put
           # us to an offset of zero.
           my $charstart = rindex($txt, "\n", $off[0] - 7) + 1;
-          $utf8 = pack("U*", map { CORE::hex }
+          $result = pack("W*", map { CORE::hex }
               split " ", substr($txt, $charstart, $off[0] - $charstart - 1));
         }
       }
@@ -539,15 +546,27 @@ sub lookup_name ($$$) {
       # again, but only if it came from the one search that we cache.
       # (Haven't bothered with the pain of sorting out scoping issues for the
       # scripts searches.)
-      $cache_ref->{$name} = $utf8 if defined $cache_ref;
+      $cache_ref->{$name} = $result if defined $cache_ref;
     }
   }
 
-
-  # Here, have the utf8.  If the return is to be an ord, must be any single
-  # character.
+  # Here, have the result character.  If the return is to be an ord, must be
+  # any single character.
   if ($wants_ord) {
-    return ord($utf8) if length $utf8 == 1;
+    return ord($result) if length $result == 1;
+  }
+  elsif (! utf8::is_utf8($result)) {
+
+    # Here isn't UTF-8.  That's OK if it is all ASCII, or we are being called
+    # at compile time where we know we can guarantee that Unicode rules are
+    # correctly imposed on the result, or under 'bytes' where we don't want
+    # those rules.  But otherwise we have to make it UTF8 to guarantee Unicode
+    # rules on the returned string.
+    return $result if ! $runtime
+                      || (caller $runtime)[8] & $bytes::hint_bits
+                      || $result !~ /[[:^ascii:]]/;
+    utf8::upgrade($result);
+    return $result;
   }
   else {
 
@@ -556,7 +575,7 @@ sub lookup_name ($$$) {
     my $in_bytes = ($runtime)
                    ? (caller $runtime)[8] & $bytes::hint_bits
                    : $^H & $bytes::hint_bits;
-    return $utf8 if (! $in_bytes || utf8::downgrade($utf8, 1)) # The 1 arg
+    return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg
                                                   # means don't die on failure
   }
 
@@ -580,10 +599,10 @@ sub lookup_name ($$$) {
 
   # Only other possible failure here is from use bytes.
   if ($runtime) {
-    carp not_legal_use_bytes_msg($name, $utf8);
+    carp not_legal_use_bytes_msg($name, $result);
     return;
   } else {
-    croak not_legal_use_bytes_msg($name, $utf8);
+    croak not_legal_use_bytes_msg($name, $result);
   }
 
 } # lookup_name
@@ -697,6 +716,11 @@ sub import
 # not an issue.
 my %viacode;
 
+my $no_name_code_points_re = join "|", map { sprintf("%05X",
+                                             utf8::unicode_to_native($_)) }
+                                            0x80, 0x81, 0x84, 0x99;
+$no_name_code_points_re = qr/$no_name_code_points_re/;
+
 sub viacode {
 
   # Returns the name of the code point argument
@@ -717,8 +741,10 @@ sub viacode {
   if ($arg =~ $decimal_qr) {
     $hex = sprintf "%05X", $arg;
   } elsif ($arg =~ $hex_qr) {
+    $hex = CORE::hex $1;
+    $hex = utf8::unicode_to_native($hex) if $arg =~ /^[Uu]\+/;
     # Below is the line that differs from the _getcode() source
-    $hex = sprintf "%05X", hex $1;
+    $hex = sprintf "%05X", $hex;
   } else {
     carp("unexpected arg \"$arg\" to charnames::viacode()");
     return;
@@ -751,7 +777,7 @@ sub viacode {
         $return = substr($txt, $+[0], index($txt, "\n", $+[0]) - $+[0]);
 
         # If not one of these 4 code points, return what we've found.
-        if ($hex !~ / ^ 000 (?: 8[014] | 99 ) $ /x) {
+        if ($hex !~ / ^ $no_name_code_points_re $ /x) {
           $viacode{$hex} = $return;
           return $return;
         }