utf8n_to_uvchr() Properly test for extended UTF-8

author Karl Williamson <khw@cpan.org>

Tue, 27 Jun 2017 20:46:26 +0000 (14:46 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 13 Jul 2017 03:14:25 +0000 (21:14 -0600)
author Karl Williamson <khw@cpan.org>
Tue, 27 Jun 2017 20:46:26 +0000 (14:46 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Jul 2017 03:14:25 +0000 (21:14 -0600)
diff --git a/embed.fnc b/embed.fnc

index 1e661e4..20f2987 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -744,7 +744,9 @@ ADMpR       |bool   |isALNUM_lazy   |NN const char* p
  #ifdef PERL_IN_UTF8_C
  snR    |U8     |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp  \
                 |const char dummy
+#  ifndef UV_IS_QUAD
  inR    |bool   |is_utf8_cp_above_31_bits|NN const U8 * const s|NN const U8 * const e
+#  endif
  #endif
  #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
  EXp    |UV        |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const unsigned int flags
diff --git a/embed.h b/embed.h

index a74458d..608d252 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -1472,6 +1472,11 @@
  #define mulexp10               S_mulexp10
  #    endif
  #  endif
+#  if !defined(UV_IS_QUAD)
+#    if defined(PERL_IN_UTF8_C)
+#define is_utf8_cp_above_31_bits       S_is_utf8_cp_above_31_bits
+#    endif
+#  endif
  #  if !defined(WIN32)
  #define do_exec3(a,b,c)                Perl_do_exec3(aTHX_ a,b,c)
  #  endif
@@ -1844,7 +1849,6 @@
  #define isFF_OVERLONG          S_isFF_OVERLONG
  #define is_utf8_common(a,b,c,d)        S_is_utf8_common(aTHX_ a,b,c,d)
  #define is_utf8_common_with_len(a,b,c,d,e)     S_is_utf8_common_with_len(aTHX_ a,b,c,d,e)
-#define is_utf8_cp_above_31_bits       S_is_utf8_cp_above_31_bits
  #define is_utf8_overlong_given_start_byte_ok   S_is_utf8_overlong_given_start_byte_ok
  #define swash_scan_list_line(a,b,c,d,e,f,g)    S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
  #define swatch_get(a,b,c)      S_swatch_get(aTHX_ a,b,c)
diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl

index 94df88e..6c88f5c 100644 (file)
--- a/ext/XS-APItest/t/utf8_warn_base.pl
+++ b/ext/XS-APItest/t/utf8_warn_base.pl
@@ -28,6 +28,7 @@ local $SIG{__WARN__} = sub { my @copy = @_;
                               push @warnings_gotten, map { chomp; $_ } @copy;
                             };
  
+my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
  my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
  
  sub requires_extended_utf8($) {
@@ -36,8 +37,7 @@ sub requires_extended_utf8($) {
      # into 31 bits, subject to the convention that a negative code point
      # stands for one that overflows the word size, so won't fit in 31 bits.
  
-    my $cp = shift;
-    return $cp > 0x7FFFFFFF;
+    return shift > $highest_non_extended_utf8_cp;
  }
  
  my @tests;
@@ -286,7 +286,6 @@ my @tests;
              : I8_to_native(
                  "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
              0x80000000,
-            (isASCII) ? 1 : 8,
          ],
          [ "highest 32 bit code point",
              (isASCII)
@@ -294,7 +293,6 @@ my @tests;
              : I8_to_native(
                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
              0xFFFFFFFF,
-            (isASCII) ? 1 : 8,
          ],
          [ "Lowest 33 bit code point",
              (isASCII)
@@ -340,7 +338,6 @@ my @tests;
                  [ "Lowest code point requiring 13 bytes to represent",
                      "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
                      0x1000000000,
-                    1,
                  ],
                  [ "overflow that old algorithm failed to detect",
                      "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
@@ -355,37 +352,31 @@ my @tests;
                      I8_to_native(
                      "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                      0x800000000,
-                    7,
                  ],
                  [ "requires at least 32 bits",
                      I8_to_native(
                      "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                      0x10000000000,
-                    6,
                  ],
                  [ "requires at least 32 bits",
                      I8_to_native(
                      "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                      0x200000000000,
-                    5,
                  ],
                  [ "requires at least 32 bits",
                      I8_to_native(
                      "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                      0x4000000000000,
-                    4,
                  ],
                  [ "requires at least 32 bits",
                      I8_to_native(
                      "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                      0x80000000000000,
-                    3,
                  ],
                  [ "requires at least 32 bits",
                      I8_to_native(
                      "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                      0x1000000000000000,
-                    2,
                  ];
          }
      }
@@ -587,6 +578,11 @@ foreach my $test (@tests) {
                                  # contain a code point.  (This is a result of
                                  # some sort of malformation that means we
                                  # can't get an exact code poin
+    my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
+                        \Q requires a Perl extension, and so is not\E
+                        \Q portable\E/x;
+    my $extended_non_cp_trailing_text
+                        = "is a Perl extension, and so is not portable";
  
      # Is this test malformed from the beginning?  If so, we know to generally
      # expect that the tests will show it isn't valid.
@@ -619,9 +615,9 @@ foreach my $test (@tests) {
              $initially_malformed = 1;
          }
          elsif (requires_extended_utf8($allowed_uv)) {
-            $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
-                                \Q and not portable\E/x;
-            $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable";
+            $cp_message_qr = $extended_cp_message_qr;
+            $non_cp_trailing_text = $extended_non_cp_trailing_text;
+            $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
          }
          else {
              $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
@@ -870,6 +866,9 @@ foreach my $test (@tests) {
            # maximum length, so skip if we're already at that length.
            next if $overlong && $length >= $::max_bytes;
  
+          my $this_cp_message_qr = $cp_message_qr;
+          my $this_non_cp_trailing_text = $non_cp_trailing_text;
+
            foreach my $malformed_allow_type (0..2) {
              # 0 don't allow this malformation; ignored if no malformation
              # 1 allow, with REPLACEMENT CHARACTER returned
@@ -899,6 +898,8 @@ foreach my $test (@tests) {
              # combinations of on/off are tested for.  It's either all are
              # allowed, or none are.
              my $allow_flags = 0;
+            my $overlong_is_in_perl_extended_utf8 = 0;
+            my $dont_use_overlong_cp = 0;
  
              if ($overlong) {
                  my $new_expected_len;
@@ -929,8 +930,20 @@ foreach my $test (@tests) {
                  else {  # Must use extended UTF-8.  On ASCII platforms, we
                          # could express some overlongs here starting with
                          # \xFE, but there's no real reason to do so.
+                    $overlong_is_in_perl_extended_utf8 = 1;
                      $start_byte = I8_to_native("\xFF");
                      $new_expected_len = $::max_bytes;
+                    $this_cp_message_qr = $extended_cp_message_qr;
+
+                    # The warning that gets raised doesn't include the code
+                    # point in the message if the code point can be expressed
+                    # without using extended UTF-8, but the particular
+                    # overlong sequence used is in extended UTF-8.  To do
+                    # otherwise would be confusing to the user, as it would
+                    # claim the code point requires extended, when it doesn't.
+                    $dont_use_overlong_cp = 1
+                                    unless requires_extended_utf8($allowed_uv);
+                    $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
                  }
  
                  # Splice in the revise continuation byte, preceded by the
@@ -1152,12 +1165,20 @@ foreach my $test (@tests) {
                      # on all the other flags.  That makes sure that they all
                      # are independent of this flag, and so we don't need to
                      # test them individually.
-                    my $this_warning_flags  = ($use_warn_flag)
-                                              ? $this_utf8n_flag_to_warn
-                                              : $utf8n_flag_to_warn_complement;
-                    my $this_disallow_flags = ($do_disallow)
-                                          ? $this_utf8n_flag_to_disallow
-                                          : $utf8n_flag_to_disallow_complement;
+                    my $this_warning_flags
+                            = ($use_warn_flag)
+                              ? $this_utf8n_flag_to_warn
+                              : ($overlong_is_in_perl_extended_utf8
+                                ? ($utf8n_flag_to_warn_complement
+                                    & ~$::UTF8_WARN_PERL_EXTENDED)
+                                :  $utf8n_flag_to_warn_complement);
+                    my $this_disallow_flags
+                            = ($do_disallow)
+                              ? $this_utf8n_flag_to_disallow
+                              : ($overlong_is_in_perl_extended_utf8
+                                 ? ($utf8n_flag_to_disallow_complement
+                                    & ~$::UTF8_DISALLOW_PERL_EXTENDED)
+                                 :  $utf8n_flag_to_disallow_complement);
                      my $expected_uv = $allowed_uv;
                      my $this_uv_string = $uv_string;
  
@@ -1216,19 +1237,21 @@ foreach my $test (@tests) {
  
                          # So far the array contains warnings generated by
                          # malformations.  Add the expected regular one.
-                        unshift @expected_warnings, $cp_message_qr;
+                        unshift @expected_warnings, $this_cp_message_qr;
  
                          # But it may need to be modified, because either of
                          # these malformations means we can't determine the
                          # expected code point.
-                        if ($short || $unexpected_noncont) {
+                        if (   $short || $unexpected_noncont
+                            || $dont_use_overlong_cp)
+                        {
                              my $first_byte = substr($this_bytes, 0, 1);
                              $expected_warnings[0] = display_bytes(
                                      substr($this_bytes, 0, $this_expected_len));
                              $expected_warnings[0]
                                  = qr/[Aa]\Qny UTF-8 sequence that starts with\E
                                       \Q $expected_warnings[0]\E
-                                     \Q $non_cp_trailing_text\E/x;
+                                     \Q $this_non_cp_trailing_text\E/x;
                          }
                      }
  
diff --git a/proto.h b/proto.h

index d9ef91c..a8f6de8 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -4078,6 +4078,17 @@ PERL_CALLCONV int        Perl_my_sprintf(char *buffer, const char *pat, ...);
  STATIC NV      S_mulexp10(NV value, I32 exponent);
  #  endif
  #endif
+#if !defined(UV_IS_QUAD)
+#  if defined(PERL_IN_UTF8_C)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE bool        S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS      \
+       assert(s); assert(e)
+#endif
+
+#  endif
+#endif
  #if !defined(WIN32)
  PERL_CALLCONV bool     Perl_do_exec3(pTHX_ const char *incmd, int fd, int do_report);
  #define PERL_ARGS_ASSERT_DO_EXEC3      \
@@ -5835,13 +5846,6 @@ PERL_STATIC_INLINE bool  S_is_utf8_common_with_len(pTHX_ const U8 *const p, const
  #endif
  
  #ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool        S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
-                       __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS      \
-       assert(s); assert(e)
-#endif
-
-#ifndef PERL_NO_INLINE_FUNCTIONS
  PERL_STATIC_INLINE bool        S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK  \
diff --git a/utf8.c b/utf8.c

index 88c2b32..4be3bb7 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -122,8 +122,9 @@ const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
                                     " is not recommended for open interchange";
  const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
                                     " may not be portable";
-const char above_31_bit_cp_format[] = "Code point 0x%" UVXf " is not"
-                                      " Unicode, and not portable";
+const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not"        \
+                                       " Unicode, requires a Perl extension," \
+                                       " and so is not portable";
  
  #define HANDLE_UNICODE_SURROGATE(uv, flags)                         \
      STMT_START {                                                    \
@@ -220,7 +221,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
  
                /* Choose the more dire applicable warning */
                (UNICODE_IS_PERL_EXTENDED(uv))
-              ? above_31_bit_cp_format
+              ? perl_extended_cp_format
                : super_cp_format,
               uv);
          }
@@ -362,30 +363,27 @@ defined in
  L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
  See L<perlunicode/Noncharacter code points>.
  
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points.  Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these that written by the perl interpreter; nor would Perl understand files
-written by something that uses a different extension.  For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted.  These are the
-C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.  These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent.  When that happens,
-effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines.  (Of course C<UNICODE_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UNICODE_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII.  Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1.  The flags C<UNICODE_WARN_ABOVE_31_BIT>
-and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does.  It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension.  For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
+and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
+treat all above-Unicode code points, including these, as malformations.  (Note
+that the Unicode standard considers anything above 0x10FFFF to be illegal, but
+there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
+
+A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
+C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because these
+flags can apply to code points that actually do fit in 31 bits.  This happens
+on EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
+describe the situation in all cases.
  
  =cut
  */
@@ -399,6 +397,8 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
      return uvchr_to_utf8_flags(d, uv, flags);
  }
  
+#ifndef UV_IS_QUAD
+
  PERL_STATIC_INLINE bool
  S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
  {
@@ -480,6 +480,8 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
  
  }
  
+#endif
+
  /* Anything larger than this will overflow the word if it were converted into a UV */
  #if defined(UV_IS_QUAD)
  #  ifdef EBCDIC     /* Actually is I8 */
@@ -730,10 +732,12 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xF1              \
                                                         /* B6 and B7 */      \
                                                && ((s1) & 0xFE ) == 0xB6)
+#  define isUTF8_PERL_EXTENDED(s)   (*s == I8_TO_NATIVE_UTF8(0xFF))
  #else
  #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
  #  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF4 && (s1) >= 0x90)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xED && (s1) >= 0xA0)
+#  define isUTF8_PERL_EXTENDED(s)   (*s >= 0xFE)
  #endif
  
          if (  (flags & UTF8_DISALLOW_SUPER)
@@ -743,9 +747,9 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
          }
  
          if (   (flags & UTF8_DISALLOW_PERL_EXTENDED)
-            &&  UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+            &&  UNLIKELY(isUTF8_PERL_EXTENDED(s)))
          {
-            return 0;           /* Above 31 bits */
+            return 0;
          }
  
          if (len > 1) {
@@ -954,36 +958,34 @@ a malformation and raise a warning, specify both the WARN and DISALLOW flags.
  (But note that warnings are not raised if lexically disabled nor if
  C<UTF8_CHECK_ONLY> is also specified.)
  
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does.  It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension.  For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
+C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
+above-Unicode code points, including these, as malformations.
+(Note that the Unicode standard considers anything above 0x10FFFF to be
+illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
+(2**31 -1))
+
+A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
+C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
+can apply to code points that actually do fit in 31 bits.  This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
+describe the situation in all cases.
+
  It is now deprecated to have very high code points (above C<IV_MAX> on the
  platforms) and this function will raise a deprecation warning for these (unless
  such warnings are turned off).  This value is typically 0x7FFF_FFFF (2**31 -1)
  in a 32-bit word.
  
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points.  Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these; nor would Perl understand files
-written by something that uses a different extension.  For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted.  These are the
-C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.  These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent.  When that happens,
-effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines.  (Of course C<UTF8_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UTF8_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII.  Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1.  The flags C<UTF8_WARN_ABOVE_31_BIT>
-and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
-
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  warn.
@@ -1026,12 +1028,36 @@ exceptions are noted:
  
  =over 4
  
-=item C<UTF8_GOT_ABOVE_31_BIT>
+=item C<UTF8_GOT_PERL_EXTENDED>
  
-The code point represented by the input UTF-8 sequence occupies more than 31
-bits.
-This bit is set only if the input C<flags> parameter contains either the
-C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
+The input sequence is not standard UTF-8, but a Perl extension.  This bit is
+set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
+
+Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
+and so some extension must be used to express them.  Perl uses a natural
+extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
+extension to represent even higher ones, so that any code point that fits in a
+64-bit word can be represented.  Text using these extensions is not likely to
+be portable to non-Perl code.  We lump both of these extensions together and
+refer to them as Perl extended UTF-8.  There exist other extensions that people
+have invented, incompatible with Perl's.
+
+On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
+extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
+than on ASCII.  Prior to that, code points 2**31 and higher were simply
+unrepresentable, and a different, incompatible method was used to represent
+code points between 2**30 and 2**31 - 1.
+
+On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
+Perl extended UTF-8 is used.
+
+In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
+may use for backward compatibility.  That name is misleading, as this flag may
+be set when the code point actually does fit in 31 bits.  This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
+describes the situation in all cases.
  
  =item C<UTF8_GOT_CONTINUATION>
  
@@ -1119,8 +1145,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
       * too short one.  Otherwise the first two are set to 's0' and 'send', and
       * the third not used at all */
      U8 * adjusted_s0 = (U8 *) s0;
-    U8 * adjusted_send = NULL;  /* (Initialized to silence compilers' wrong
-                                   warning) */
      U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
                                              routine; see [perl #130921] */
      UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
@@ -1212,7 +1236,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
      else {
          send += expectlen;
      }
-    adjusted_send = send;
  
      /* Now, loop through the remaining bytes in the character's sequence,
       * accumulating each into the working value as we go. */
@@ -1297,7 +1320,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
              }
  
              adjusted_s0 = temp_char_buf;
-            adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+            (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
          }
      }
  
@@ -1316,7 +1339,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                             * and we deal with those in the overflow handling
                             * code */
                  && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
-                && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+                && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
+                    || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
         && ((flags & ( UTF8_DISALLOW_NONCHAR
                        |UTF8_DISALLOW_SURROGATE
                        |UTF8_DISALLOW_SUPER
@@ -1396,13 +1420,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
       *                      end, based on how many bytes the start byte tells
       *                      us should be in it, but no further than s0 +
       *                      avail_len
-     * adjusted_s0          normally is the same as s0, but in case of an
-     *                      overlong for which the UTF-8 matters below, it is
-     *                      the first byte of the shortest form representation
-     *                      of the input.
-     * adjusted_send        normally is the same as 'send', but if adjusted_s0
-     *                      is set to something other than s0, this points one
-     *                      beyond its end
       */
  
      if (UNLIKELY(possible_problems)) {
@@ -1603,39 +1620,35 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      }
                  }
  
-                /* The maximum code point ever specified by a standard was
-                 * 2**31 - 1.  Anything larger than that is a Perl extension
-                 * that very well may not be understood by other applications
-                 * (including earlier perl versions on EBCDIC platforms).  We
-                 * test for these after the regular SUPER ones, and before
-                 * possibly bailing out, so that the slightly more dire warning
-                 * will override the regular one. */
-                if (   (flags & (UTF8_WARN_PERL_EXTENDED
-                                |UTF8_WARN_SUPER
-                                |UTF8_DISALLOW_PERL_EXTENDED))
-                    && (   (   UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
-                            && UNLIKELY(is_utf8_cp_above_31_bits(
-                                                adjusted_s0,
-                                                adjusted_send)))
-                        || (   LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
-                            && UNLIKELY(UNICODE_IS_PERL_EXTENDED(uv)))))
-                {
+                /* Test for Perl's extended UTF-8 after the regular SUPER ones,
+                 * and before possibly bailing out, so that the more dire
+                 * warning will override the regular one. */
+                if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
                      if (  ! (flags & UTF8_CHECK_ONLY)
                          &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
                          &&  ckWARN_d(WARN_NON_UNICODE))
                      {
                          pack_warn = packWARN(WARN_NON_UNICODE);
  
-                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                        /* If it is an overlong that evaluates to a code point
+                         * that doesn't have to use the Perl extended UTF-8, it
+                         * still used it, and so we output a message that
+                         * doesn't refer to the code point.  The same is true
+                         * if there was a SHORT malformation where the code
+                         * point is not valid.  In that case, 'uv' will have
+                         * been set to the REPLACEMENT CHAR, and the message
+                         * below without the code point in it will be selected
+                         * */
+                        if (UNICODE_IS_PERL_EXTENDED(uv)) {
                              message = Perl_form(aTHX_
-                                        "Any UTF-8 sequence that starts with"
-                                        " \"%s\" is for a non-Unicode code"
-                                        " point, and is not portable",
-                                        _byte_dump_string(s0, curlen, 0));
+                                            perl_extended_cp_format, uv);
                          }
                          else {
                              message = Perl_form(aTHX_
-                                            above_31_bit_cp_format, uv);
+                                        "Any UTF-8 sequence that starts with"
+                                        " \"%s\" is a Perl extension, and"
+                                        " so is not portable",
+                                        _byte_dump_string(s0, curlen, 0));
                          }
                      }
  
diff --git a/utf8.h b/utf8.h

index c880375..0f29817 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -407,6 +407,8 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
      ( LIKELY( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
  : LIKELY( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( ( ((const U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
  
+#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x7FFFFFFF)
+
  #endif /* EBCDIC vs ASCII */
  
  /* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -781,7 +783,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
  #define UTF8_GOT_PERL_EXTENDED          UTF8_DISALLOW_PERL_EXTENDED
  #define UTF8_WARN_PERL_EXTENDED         0x8000
  
-/* For back compat, these old names are misleading for UTF_EBCDIC */
+/* For back compat, these old names are misleading for overlongs and
+ * UTF_EBCDIC. */
  #define UTF8_DISALLOW_ABOVE_31_BIT      UTF8_DISALLOW_PERL_EXTENDED
  #define UTF8_GOT_ABOVE_31_BIT           UTF8_GOT_PERL_EXTENDED
  #define UTF8_WARN_ABOVE_31_BIT          UTF8_WARN_PERL_EXTENDED
@@ -958,7 +961,6 @@ point's representation.
           && UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
  
  #define UNICODE_IS_SUPER(uv)    ((UV) (uv) > PERL_UNICODE_MAX)
-#define UNICODE_IS_PERL_EXTENDED(uv)    ((UV) (uv) > 0x7FFFFFFF)
  
  #define LATIN_SMALL_LETTER_SHARP_S      LATIN_SMALL_LETTER_SHARP_S_NATIVE
  #define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS                                  \
diff --git a/utfebcdic.h b/utfebcdic.h

index 0f81d1f..c2f0788 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -511,6 +511,8 @@ explicitly forbidden, and the shortest possible encoding should always be used
   * has this start byte (expressed in I8) as the maximum */
  #define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF9
  
+#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x3FFFFFFF)
+
  /*
   * ex: set ts=8 sts=4 sw=4 et:
   */
author	Karl Williamson <khw@cpan.org>
	Tue, 27 Jun 2017 20:46:26 +0000 (14:46 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 13 Jul 2017 03:14:25 +0000 (21:14 -0600)
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
ext/XS-APItest/t/utf8_warn_base.pl		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history
utfebcdic.h		patch \| blob \| blame \| history