Make tables for Perl-tailored Unicode Line_Break property

[perl5.git] / lib / _charnames.pm
diff --git a/lib/_charnames.pm b/lib/_charnames.pm

index 26ba0fa..bb7d7c6 100644 (file)
--- a/lib/_charnames.pm
+++ b/lib/_charnames.pm
@@ -65,10 +65,10 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
  
  my %system_aliases = (
  
-    'SINGLE-SHIFT 2'                => pack("U", utf8::unicode_to_native(0x8E)),
-    'SINGLE-SHIFT 3'                => pack("U", utf8::unicode_to_native(0x8F)),
-    'PRIVATE USE 1'                 => pack("U", utf8::unicode_to_native(0x91)),
-    'PRIVATE USE 2'                 => pack("U", utf8::unicode_to_native(0x92)),
+    'SINGLE-SHIFT 2'                => chr utf8::unicode_to_native(0x8E),
+    'SINGLE-SHIFT 3'                => chr utf8::unicode_to_native(0x8F),
+    'PRIVATE USE 1'                 => chr utf8::unicode_to_native(0x91),
+    'PRIVATE USE 2'                 => chr utf8::unicode_to_native(0x92),
  );
  
  # These are the aliases above that differ under :loose and :full matching
@@ -77,15 +77,15 @@ my %system_aliases = (
  #);
  
  #my %deprecated_aliases;
-#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V lt v5.17.0;
+#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt v5.17.0;
  
  #my %loose_deprecated_aliases = (
  #);
  
  # These are special cased in :loose matching, differing only in a medial
  # hyphen
-my $HANGUL_JUNGSEONG_O_E_utf8 = pack("U", 0x1180);
-my $HANGUL_JUNGSEONG_OE_utf8 = pack("U", 0x116C);
+my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180;
+my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C;
  
  
  my $txt;  # The table of official character names
@@ -163,7 +163,7 @@ sub alias (@) # Set up a single alias
      }
      if ($value =~ $decimal_qr) {
          no warnings qw(non_unicode surrogate nonchar); # Allow any of these
-        $^H{charnames_ord_aliases}{$name} = pack("U", $value);
+        $^H{charnames_ord_aliases}{$name} = chr $value;
  
          # Use a canonical form.
          $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name;
@@ -285,7 +285,7 @@ sub lookup_name ($$$) {
    # It looks first in the aliases, then in the large table of official Unicode
    # names.
  
-  my $utf8;       # The string result
+  my $result;       # The string result
    my $save_input;
  
    if ($runtime) {
@@ -298,7 +298,7 @@ sub lookup_name ($$$) {
                                || (! defined $hints_ref->{charnames_full}
                                    && ! defined $hints_ref->{charnames_loose});
  
-    # At runtime, but currently not at compile time, $^H gets
+    # At runtime, but currently not at compile time, %^H gets
      # stringified, so un-stringify back to the original data structures.
      # These get thrown away by perl before the next invocation
      # Also fill in the hash with the non-stringified data.
@@ -321,7 +321,7 @@ sub lookup_name ($$$) {
    # User alias should be checked first or else can't override ours, and if we
    # were to add any, could conflict with theirs.
    if (exists $^H{charnames_ord_aliases}{$name}) {
-    $utf8 = $^H{charnames_ord_aliases}{$name};
+    $result = $^H{charnames_ord_aliases}{$name};
    }
    elsif (exists $^H{charnames_name_aliases}{$name}) {
      $name = $^H{charnames_name_aliases}{$name};
@@ -362,13 +362,13 @@ sub lookup_name ($$$) {
      # interested in convenience over speed, and the time for this second check
      # is miniscule compared to the rest of the routine.
      if (exists $system_aliases{$lookup_name}) {
-      $utf8 = $system_aliases{$lookup_name};
+      $result = $system_aliases{$lookup_name};
      }
      # There are currently no entries in this hash, so don't waste time looking
      # for them.  But the code is retained for the unlikely possibility that
      # some will be added in the future.
  #    elsif ($loose && exists $loose_system_aliases{$lookup_name}) {
-#      $utf8 = $loose_system_aliases{$lookup_name};
+#      $result = $loose_system_aliases{$lookup_name};
  #    }
  #    if (exists $deprecated_aliases{$lookup_name}) {
  #      require warnings;
@@ -376,7 +376,7 @@ sub lookup_name ($$$) {
  #                       "Unicode character name \"$name\" is deprecated, use \""
  #                       . viacode(ord $deprecated_aliases{$lookup_name})
  #                       . "\" instead");
-#      $utf8 = $deprecated_aliases{$lookup_name};
+#      $result = $deprecated_aliases{$lookup_name};
  #    }
      # There are currently no entries in this hash, so don't waste time looking
      # for them.  But the code is retained for the unlikely possibility that
@@ -387,21 +387,21 @@ sub lookup_name ($$$) {
  #                       "Unicode character name \"$name\" is deprecated, use \""
  #                       . viacode(ord $loose_deprecated_aliases{$lookup_name})
  #                       . "\" instead");
-#      $utf8 = $loose_deprecated_aliases{$lookup_name};
+#      $result = $loose_deprecated_aliases{$lookup_name};
  #    }
    }
  
    my @off;  # Offsets into table of pattern match begin and end
  
    # If haven't found it yet...
-  if (! defined $utf8) {
+  if (! defined $result) {
  
      # See if has looked this input up earlier.
      if (! $loose && $^H{charnames_full} && exists $full_names_cache{$name}) {
-      $utf8 = $full_names_cache{$name};
+      $result = $full_names_cache{$name};
      }
      elsif ($loose && exists $loose_names_cache{$name}) {
-      $utf8 = $loose_names_cache{$name};
+      $result = $loose_names_cache{$name};
      }
      else { # Here, must do a look-up
  
@@ -432,7 +432,7 @@ sub lookup_name ($$$) {
        if (($loose || $^H{charnames_full})
            && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose))))
        {
-        $utf8 = pack("U", $ord);
+        $result = chr $ord;
        }
        else {
  
@@ -525,14 +525,14 @@ sub lookup_name ($$$) {
          # therefore yield the very last character in the table, which should
          # also be a \n, so the statement works anyway.)
          if (substr($txt, $off[0] - 7, 1) eq "\n") {
-          $utf8 = pack("U", CORE::hex substr($txt, $off[0] - 6, 5));
+          $result = chr CORE::hex substr($txt, $off[0] - 6, 5);
  
            # Handle the single loose matching special case, in which two names
            # differ only by a single medial hyphen.  If the original had a
            # hyphen (or more) in the right place, then it is that one.
-          $utf8 = $HANGUL_JUNGSEONG_O_E_utf8
+          $result = $HANGUL_JUNGSEONG_O_E_utf8
                    if $loose
-                     && $utf8 eq $HANGUL_JUNGSEONG_OE_utf8
+                     && $result eq $HANGUL_JUNGSEONG_OE_utf8
                       && $name =~ m/O \s* - [-\s]* E/ix;
                       # Note that this wouldn't work if there were a 2nd
                       # OE in the name
@@ -544,7 +544,7 @@ sub lookup_name ($$$) {
            # The +1 skips past that newline, or, if the rindex() fails, to put
            # us to an offset of zero.
            my $charstart = rindex($txt, "\n", $off[0] - 7) + 1;
-          $utf8 = pack("U*", map { CORE::hex }
+          $result = pack("W*", map { CORE::hex }
                split " ", substr($txt, $charstart, $off[0] - $charstart - 1));
          }
        }
@@ -553,15 +553,27 @@ sub lookup_name ($$$) {
        # again, but only if it came from the one search that we cache.
        # (Haven't bothered with the pain of sorting out scoping issues for the
        # scripts searches.)
-      $cache_ref->{$name} = $utf8 if defined $cache_ref;
+      $cache_ref->{$name} = $result if defined $cache_ref;
      }
    }
  
-
-  # Here, have the utf8.  If the return is to be an ord, must be any single
-  # character.
+  # Here, have the result character.  If the return is to be an ord, must be
+  # any single character.
    if ($wants_ord) {
-    return ord($utf8) if length $utf8 == 1;
+    return ord($result) if length $result == 1;
+  }
+  elsif (! utf8::is_utf8($result)) {
+
+    # Here isn't UTF-8.  That's OK if it is all ASCII, or we are being called
+    # at compile time where we know we can guarantee that Unicode rules are
+    # correctly imposed on the result, or under 'bytes' where we don't want
+    # those rules.  But otherwise we have to make it UTF8 to guarantee Unicode
+    # rules on the returned string.
+    return $result if ! $runtime
+                      || (caller $runtime)[8] & $bytes::hint_bits
+                      || $result !~ /[[:^ascii:]]/;
+    utf8::upgrade($result);
+    return $result;
    }
    else {
  
@@ -570,7 +582,7 @@ sub lookup_name ($$$) {
      my $in_bytes = ($runtime)
                     ? (caller $runtime)[8] & $bytes::hint_bits
                     : $^H & $bytes::hint_bits;
-    return $utf8 if (! $in_bytes || utf8::downgrade($utf8, 1)) # The 1 arg
+    return $result if (! $in_bytes || utf8::downgrade($result, 1)) # The 1 arg
                                                    # means don't die on failure
    }
  
@@ -594,10 +606,10 @@ sub lookup_name ($$$) {
  
    # Only other possible failure here is from use bytes.
    if ($runtime) {
-    carp not_legal_use_bytes_msg($name, $utf8);
+    carp not_legal_use_bytes_msg($name, $result);
      return;
    } else {
-    croak not_legal_use_bytes_msg($name, $utf8);
+    croak not_legal_use_bytes_msg($name, $result);
    }
  
  } # lookup_name