Add details to UTF-8 malformation error messages

author Karl Williamson <khw@cpan.org>

Wed, 28 Sep 2016 21:05:17 +0000 (15:05 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 13 Oct 2016 17:18:11 +0000 (11:18 -0600)
author Karl Williamson <khw@cpan.org>
Wed, 28 Sep 2016 21:05:17 +0000 (15:05 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Oct 2016 17:18:11 +0000 (11:18 -0600)
diff --git a/embed.fnc b/embed.fnc

index 43a17de..4e60cd2 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1674,7 +1674,11 @@ ApdD     |UV     |to_utf8_case   |NN const U8 *p                                 \
                                 NULLOK const char *special
  #if defined(PERL_IN_UTF8_C)
  sMR    |char * |unexpected_non_continuation_text                       \
-               |NN const U8 * const s|const STRLEN len
+               |NN const U8 * const s                                  \
+               |const STRLEN print_len                                 \
+               |const STRLEN non_cont_byte_pos                         \
+               |const STRLEN expect_len
+sM     |char * |_byte_dump_string|NN const U8 * s|const STRLEN len
  s      |UV     |_to_utf8_case  |const UV uv1                                   \
                                 |NN const U8 *p                                 \
                                 |NN U8* ustrp                                   \
diff --git a/embed.h b/embed.h

index 6045ec7..d9c2709 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -1821,6 +1821,7 @@
  #define isa_lookup(a,b,c,d)    S_isa_lookup(aTHX_ a,b,c,d)
  #  endif
  #  if defined(PERL_IN_UTF8_C)
+#define _byte_dump_string(a,b) S__byte_dump_string(aTHX_ a,b)
  #define _to_utf8_case(a,b,c,d,e,f,g)   S__to_utf8_case(aTHX_ a,b,c,d,e,f,g)
  #define check_locale_boundary_crossing(a,b,c,d)        S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
  #define is_utf8_common(a,b,c,d)        S_is_utf8_common(aTHX_ a,b,c,d)
@@ -1828,7 +1829,7 @@
  #define swash_scan_list_line(a,b,c,d,e,f,g)    S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
  #define swatch_get(a,b,c)      S_swatch_get(aTHX_ a,b,c)
  #define to_lower_latin1                S_to_lower_latin1
-#define unexpected_non_continuation_text(a,b)  S_unexpected_non_continuation_text(aTHX_ a,b)
+#define unexpected_non_continuation_text(a,b,c,d)      S_unexpected_non_continuation_text(aTHX_ a,b,c,d)
  #  endif
  #  if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
  #define _to_upper_title_latin1(a,b,c,d)        Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t

index 32f60e0..7a951a0 100644 (file)
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -1007,7 +1007,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          0,   # NUL
          2,
-        qr/2 bytes, need 1/
+        qr/overlong/
      ],
      [ "overlong malformation, highest 2-byte",
          (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
@@ -1015,7 +1015,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          (isASCII) ? 0x7F : utf8::unicode_to_native(0xBF),
          2,
-        qr/2 bytes, need 1/
+        qr/overlong/
      ],
      [ "overlong malformation, lowest 3-byte",
          (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
@@ -1023,7 +1023,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          0,   # NUL
          3,
-        qr/3 bytes, need 1/
+        qr/overlong/
      ],
      [ "overlong malformation, highest 3-byte",
          (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
@@ -1031,7 +1031,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          (isASCII) ? 0x7FF : 0x3FF,
          3,
-        qr/3 bytes, need 2/
+        qr/overlong/
      ],
      [ "overlong malformation, lowest 4-byte",
          (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
@@ -1039,7 +1039,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          0,   # NUL
          4,
-        qr/4 bytes, need 1/
+        qr/overlong/
      ],
      [ "overlong malformation, highest 4-byte",
          (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
@@ -1047,7 +1047,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          (isASCII) ? 0xFFFF : 0x3FFF,
          4,
-        qr/4 bytes, need 3/
+        qr/overlong/
      ],
      [ "overlong malformation, lowest 5-byte",
          (isASCII)
@@ -1057,7 +1057,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          0,   # NUL
          5,
-        qr/5 bytes, need 1/
+        qr/overlong/
      ],
      [ "overlong malformation, highest 5-byte",
          (isASCII)
@@ -1067,7 +1067,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          (isASCII) ? 0x1FFFFF : 0x3FFFF,
          5,
-        qr/5 bytes, need 4/
+        qr/overlong/
      ],
      [ "overlong malformation, lowest 6-byte",
          (isASCII)
@@ -1077,7 +1077,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          0,   # NUL
          6,
-        qr/6 bytes, need 1/
+        qr/overlong/
      ],
      [ "overlong malformation, highest 6-byte",
          (isASCII)
@@ -1087,7 +1087,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
          6,
-        qr/6 bytes, need 5/
+        qr/overlong/
      ],
      [ "overlong malformation, lowest 7-byte",
          (isASCII)
@@ -1097,7 +1097,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          0,   # NUL
          7,
-        qr/7 bytes, need 1/
+        qr/overlong/
      ],
      [ "overlong malformation, highest 7-byte",
          (isASCII)
@@ -1107,7 +1107,7 @@ my @malformations = (
          $UTF8_ALLOW_LONG,
          (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
          7,
-        qr/7 bytes, need 6/
+        qr/overlong/
      ],
  );
  
@@ -1120,7 +1120,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
              0,  # There is no way to allow this malformation
              $REPLACEMENT,
              7,
-            qr/overflow/
+            qr/overflows/
          ],
          [ "overflow malformation, can tell on first byte",
              "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
@@ -1128,7 +1128,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
              0,  # There is no way to allow this malformation
              $REPLACEMENT,
              13,
-            qr/overflow/
+            qr/overflows/
          ];
  }
  else {
@@ -1146,7 +1146,7 @@ else {
              $UTF8_ALLOW_LONG,
              0,   # NUL
              (isASCII) ? 13 : 14,
-            qr/1[34] bytes, need 1/,    # 1[34] to work on either ASCII or EBCDIC
+            qr/overlong/,
          ],
          [ "overlong malformation, highest max-byte",
              (isASCII)    # 2**36-1 on ASCII; 2**30-1 on EBCDIC
@@ -1156,7 +1156,7 @@ else {
              $UTF8_ALLOW_LONG,
              (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
              (isASCII) ? 13 : 14,
-            qr/1[34] bytes, need 7/,
+            qr/overlong/,
          ];
  
      if (! $is64bit) {   # 32-bit EBCDIC
@@ -1167,7 +1167,7 @@ else {
              0,  # There is no way to allow this malformation
              $REPLACEMENT,
              14,
-            qr/overflow/
+            qr/overflows/
          ];
      }
      else {  # 64-bit
@@ -1180,7 +1180,7 @@ else {
                  0,  # There is no way to allow this malformation
                  $REPLACEMENT,
                  (isASCII) ? 13 : 14,
-                qr/overflow/
+                qr/overflows/
              ];
      }
  }
@@ -1656,7 +1656,7 @@ my @tests = (
          "$UTF8_DISALLOW_ABOVE_31_BIT",
          'utf8', 0,
          (! isASCII) ? 14 : ($is64bit) ? 13 : 7,
-        qr/overflow at byte .*, after start byte 0xf/
+        qr/overflows/
      ],
  );
  
diff --git a/lib/utf8.t b/lib/utf8.t

index c9dbb6e..0530faf 100644 (file)
--- a/lib/utf8.t
+++ b/lib/utf8.t
@@ -165,7 +165,7 @@ no utf8; # Ironic, no?
      use utf8; %a = ("\xE1\xA0"=>"sterling");
      print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
  BANG
-             qr/^Malformed UTF-8 character \(\d bytes?, need \d, .+\).*start\d+,end$/sm
+             qr/^Malformed UTF-8 character: .*? \(too short; got \d bytes?, need \d\).*start\d+,end$/sm
              ],
              );
      foreach (@tests) {
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index 2c3986b..dbba9ef 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -204,7 +204,9 @@ XXX Changes (i.e. rewording) of diagnostic messages go here
  
  =item *
  
-XXX Describe change here
+Details as to the exact problem have been added to the diagnostics that
+occur when malformed UTF-8 is encountered when trying to convert to a
+code point.
  
  =back
  
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 8348663..d9f807c 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3346,8 +3346,11 @@ so the signature was instead interpreted as a bad prototype.
  
  =item Malformed UTF-8 character (%s)
  
-(S utf8)(F) Perl detected a string that didn't comply with UTF-8
-encoding rules, even though it had the UTF8 flag on.
+(S utf8)(F) Perl detected a string that should be UTF-8, but didn't
+comply with UTF-8 encoding rules, or represents a code point whose
+ordinal integer value doesn't fit into the word size of the current
+platform (overflows).  Details as to the exact malformation are given in
+the variable, C<%s>, part of the message.
  
  One possible cause is that you set the UTF8 flag yourself for data that
  you thought to be in UTF-8 but it wasn't (it was for example legacy
diff --git a/proto.h b/proto.h

index 6491d1f..78519f6 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -5572,6 +5572,9 @@ STATIC bool       S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
         assert(stash); assert(name)
  #endif
  #if defined(PERL_IN_UTF8_C)
+STATIC char *  S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len);
+#define PERL_ARGS_ASSERT__BYTE_DUMP_STRING     \
+       assert(s)
  STATIC UV      S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special);
  #define PERL_ARGS_ASSERT__TO_UTF8_CASE \
         assert(p); assert(ustrp); assert(swashp); assert(normal)
@@ -5604,7 +5607,7 @@ STATIC SV*        S_swatch_get(pTHX_ SV* swash, UV start, UV span)
  STATIC U8      S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp)
                         __attribute__warn_unused_result__;
  
-STATIC char *  S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN len)
+STATIC char *  S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN print_len, const STRLEN non_cont_byte_pos, const STRLEN expect_len)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT      \
         assert(s)
diff --git a/t/io/utf8.t b/t/io/utf8.t

index 282b8e7..3909dc8 100644 (file)
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -174,7 +174,7 @@ SKIP: {
         local $SIG{__WARN__} = sub { push @warnings, $_[0]; };
         eval { sprintf "%vd\n", $x };
         is (scalar @warnings, 1);
-       like ($warnings[0], qr/Malformed UTF-8 character \(unexpected continuation byte 0x82, with no preceding start byte/);
+       like ($warnings[0], qr/Malformed UTF-8 character: \\x82 \(unexpected continuation byte 0x82, with no preceding start byte/);
      }
  }
  
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8

index 4263c04..1c782e7 100644 (file)
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -30,8 +30,8 @@ my $a = "sn
      my $a = "snøstorm";
  }
  EXPECT
-Malformed UTF-8 character (unexpected non-continuation byte 0x73, immediately after start byte 0xf8) at - line 9.
-Malformed UTF-8 character (unexpected non-continuation byte 0x73, immediately after start byte 0xf8) at - line 14.
+Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 9.
+Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 14.
  ########
  use warnings 'utf8';
  my $d7ff  = uc(chr(0xD7FF));
@@ -766,4 +766,4 @@ BEGIN{
      }
  {};$^H=2**400}Â
  EXPECT
-Malformed UTF-8 character (unexpected non-continuation byte 0x0a, immediately after start byte 0xc2) at - line 6.
+Malformed UTF-8 character: \xc2\x0a (unexpected non-continuation byte 0x0a, immediately after start byte 0xc2; need 2 bytes, got 1) at - line 6.
diff --git a/t/op/pack.t b/t/op/pack.t

index df16464..014fbc5 100644 (file)
--- a/t/op/pack.t
+++ b/t/op/pack.t
@@ -964,7 +964,7 @@ is("@{[unpack('U*', pack('U*', 100, 200))]}", "100 200");
          my $bad = pack("U0C", 202);
          local $SIG{__WARN__} = sub { $@ = "@_" };
          my @null = unpack('U0U', $bad);
-        like($@, qr/^Malformed UTF-8 character /);
+        like($@, qr/^Malformed UTF-8 character: /);
      }
  }
  
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t

index c8c7dc7..40ec540 100644 (file)
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -141,29 +141,29 @@ __DATA__
  3.3.10 n -     5       fd:bf:bf:bf:bf  -       5 bytes, need 6
  3.4    Concatenation of incomplete sequences
  3.4.1 N10 -    30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf       -       unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
-3.5    Impossible bytes
-3.5.1 n -      1       fe      -       byte 0xfe
-3.5.2 n -      1       ff      -       byte 0xff
+3.5    Impossible bytes (but not with Perl's extended UTF-8)
+3.5.1 n -      1       fe      -       1 byte, need 7
+3.5.2 n -      1       ff      -       1 byte, need 13
  3.5.3 N4 -     4       fe:fe:ff:ff     -       byte 0xfe
  4      Overlong sequences
  4.1    Examples of an overlong ASCII character
-4.1.1 n -      2       c0:af   -       2 bytes, need 1
-4.1.2 n -      3       e0:80:af        -       3 bytes, need 1
-4.1.3 n -      4       f0:80:80:af     -       4 bytes, need 1
-4.1.4 n -      5       f8:80:80:80:af  -       5 bytes, need 1
-4.1.5 n -      6       fc:80:80:80:80:af       -       6 bytes, need 1
+4.1.1 n -      2       c0:af   -       overlong
+4.1.2 n -      3       e0:80:af        -       overlong
+4.1.3 n -      4       f0:80:80:af     -       overlong
+4.1.4 n -      5       f8:80:80:80:af  -       overlong
+4.1.5 n -      6       fc:80:80:80:80:af       -       overlong
  4.2    Maximum overlong sequences
-4.2.1 n -      2       c1:bf   -       2 bytes, need 1
-4.2.2 n -      3       e0:9f:bf        -       3 bytes, need 2
-4.2.3 n -      4       f0:8f:bf:bf     -       4 bytes, need 3
-4.2.4 n -      5       f8:87:bf:bf:bf  -       5 bytes, need 4
-4.2.5 n -      6       fc:83:bf:bf:bf:bf       -       6 bytes, need 5
+4.2.1 n -      2       c1:bf   -       overlong
+4.2.2 n -      3       e0:9f:bf        -       overlong
+4.2.3 n -      4       f0:8f:bf:bf     -       overlong
+4.2.4 n -      5       f8:87:bf:bf:bf  -       overlong
+4.2.5 n -      6       fc:83:bf:bf:bf:bf       -       overlong
  4.3    Overlong representation of the NUL character
-4.3.1 n -      2       c0:80   -       2 bytes, need 1
-4.3.2 n -      3       e0:80:80        -       3 bytes, need 1
-4.3.3 n -      4       f0:80:80:80     -       4 bytes, need 1
-4.3.4 n -      5       f8:80:80:80:80  -       5 bytes, need 1
-4.3.5 n -      6       fc:80:80:80:80:80       -       6 bytes, need 1
+4.3.1 n -      2       c0:80   -       overlong
+4.3.2 n -      3       e0:80:80        -       overlong
+4.3.3 n -      4       f0:80:80:80     -       overlong
+4.3.4 n -      5       f8:80:80:80:80  -       overlong
+4.3.5 n -      6       fc:80:80:80:80:80       -       overlong
  5      Illegal code positions
  5.1    Single UTF-16 surrogates
  5.1.1 y d800   3       ed:a0:80        1       UTF-16 surrogate 0xd800
diff --git a/utf8.c b/utf8.c

index 2c3ecf7..1803bd8 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -656,21 +656,85 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  #undef FE_ABOVE_OVERLONG
  #undef FF_OVERLONG_PREFIX
  
+STATIC char *
+S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
+{
+    /* Returns a mortalized C string that is a displayable copy of the 'len'
+     * bytes starting at 's', each in a \xXY format. */
+
+    const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
+                                               trailing NUL */
+    const U8 * const e = s + len;
+    char * output;
+    char * d;
+
+    PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
+
+    Newx(output, output_len, char);
+    SAVEFREEPV(output);
+
+    d = output;
+    for (; s < e; s++) {
+        const unsigned high_nibble = (*s & 0xF0) >> 4;
+        const unsigned low_nibble =  (*s & 0x0F);
+
+        *d++ = '\\';
+        *d++ = 'x';
+
+        if (high_nibble < 10) {
+            *d++ = high_nibble + '0';
+        }
+        else {
+            *d++ = high_nibble - 10 + 'a';
+        }
+
+        if (low_nibble < 10) {
+            *d++ = low_nibble + '0';
+        }
+        else {
+            *d++ = low_nibble - 10 + 'a';
+        }
+    }
+
+    *d = '\0';
+    return output;
+}
+
  PERL_STATIC_INLINE char *
-S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN len)
+S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
+
+                                         /* How many bytes to print */
+                                         const STRLEN print_len,
+
+                                         /* Which one is the non-continuation */
+                                         const STRLEN non_cont_byte_pos,
+
+                                         /* How many bytes should there be? */
+                                         const STRLEN expect_len)
  {
      /* Return the malformation warning text for an unexpected continuation
       * byte. */
  
-    const char * const where = (len == 1)
+    const char * const where = (non_cont_byte_pos == 1)
                                 ? "immediately"
-                               : Perl_form(aTHX_ "%d bytes", (int) len);
+                               : Perl_form(aTHX_ "%d bytes",
+                                                 (int) non_cont_byte_pos);
  
      PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
  
-    return Perl_form(aTHX_ "%s (unexpected non-continuation byte 0x%02x,"
-                           " %s after start byte 0x%02x)",
-                           malformed_text, *(s + len), where, *s);
+    /* We don't need to pass this parameter, but since it has already been
+     * calculated, it's likely faster to pass it; verify under DEBUGGING */
+    assert(expect_len == UTF8SKIP(s));
+
+    return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+                           " %s after start byte 0x%02x; need %d bytes, got %d)",
+                           malformed_text,
+                           _byte_dump_string(s, print_len),
+                           *(s + non_cont_byte_pos),
+                           where,
+                           *s,
+                           (int) expect_len,
+                           (int) non_cont_byte_pos);
  }
  
  /*
@@ -782,7 +846,6 @@ UV
  Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  {
      const U8 * const s0 = s;
-    U8 overflow_byte = '\0';   /* Save byte in case of overflow */
      U8 * send;
      UV uv = *s;
      STRLEN expectlen;
@@ -827,7 +890,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
             return 0;
         }
         if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
+                                                malformed_text));
         }
         goto malformed;
      }
@@ -857,7 +921,11 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
         }
  
         if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_
+                            "%s: %s (unexpected continuation byte 0x%02x,"
+                            " with no preceding start byte)",
+                            malformed_text,
+                            _byte_dump_string(s0, 1), *s0));
         }
         curlen = 1;
         goto malformed;
@@ -887,7 +955,6 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
                  * Set a flag, but keep going in the loop, so that we absorb
                  * the rest of the bytes that comprise the character. */
                 overflowed = TRUE;
-               overflow_byte = *s; /* Save for warning message's use */
             }
             uv = UTF8_ACCUMULATE(uv, *s);
         }
@@ -924,7 +991,10 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
         if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
             if (! (flags & UTF8_CHECK_ONLY)) {
                  sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
-                                unexpected_non_continuation_text(s0, curlen)));
+                                unexpected_non_continuation_text(s0,
+                                                             send - s0,
+                                                             s - s0,
+                                                             (int) expectlen)));
             }
             goto malformed;
         }
@@ -940,7 +1010,13 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
      else if (UNLIKELY(curlen < expectlen)) {
         if (! (flags & UTF8_ALLOW_SHORT)) {
             if (! (flags & UTF8_CHECK_ONLY)) {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_
+                                "%s: %s (too short; got %d byte%s, need %d)",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                (int)curlen,
+                                curlen == 1 ? "" : "s",
+                                (int)expectlen));
             }
             goto malformed;
         }
@@ -952,7 +1028,9 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
      }
  
      if (UNLIKELY(overflowed)) {
-       sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
+        sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
+                                            malformed_text,
+                                            _byte_dump_string(s0, send - s0)));
         goto malformed;
      }
  
@@ -965,7 +1043,16 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
          * value, instead of the replacement character.  This is because this
          * value is actually well-defined. */
         if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
+            U8 tmpbuf[UTF8_MAXBYTES+1];
+            const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
+            sv = sv_2mortal(Perl_newSVpvf(aTHX_
+                  "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
+                  malformed_text,
+                  _byte_dump_string(s0, send - s0),
+                  _byte_dump_string(tmpbuf, e - tmpbuf),
+                  ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
+                                           points */
+                  uv));
         }
         goto malformed;
      }
@@ -1279,7 +1366,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
                          /* diag_listed_as: Malformed UTF-8 character (%s) */
                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                                      "%s %s%s",
-                                    unexpected_non_continuation_text(u - 1, 1),
+                                    unexpected_non_continuation_text(u - 1, 2, 1, 2),
                                      PL_op ? " in " : "",
                                      PL_op ? OP_DESC(PL_op) : "");
                         return -2;
author	Karl Williamson <khw@cpan.org>
	Wed, 28 Sep 2016 21:05:17 +0000 (15:05 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 13 Oct 2016 17:18:11 +0000 (11:18 -0600)
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
ext/XS-APItest/t/utf8.t		patch \| blob \| blame \| history
lib/utf8.t		patch \| blob \| blame \| history
pod/perldelta.pod		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
t/io/utf8.t		patch \| blob \| blame \| history
t/lib/warnings/utf8		patch \| blob \| blame \| history
t/op/pack.t		patch \| blob \| blame \| history
t/op/utf8decode.t		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history