NULLOK const char *special
#if defined(PERL_IN_UTF8_C)
sMR |char * |unexpected_non_continuation_text \
- |NN const U8 * const s|const STRLEN len
+ |NN const U8 * const s \
+ |const STRLEN print_len \
+ |const STRLEN non_cont_byte_pos \
+ |const STRLEN expect_len
+sM |char * |_byte_dump_string|NN const U8 * s|const STRLEN len
s |UV |_to_utf8_case |const UV uv1 \
|NN const U8 *p \
|NN U8* ustrp \
#define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d)
# endif
# if defined(PERL_IN_UTF8_C)
+#define _byte_dump_string(a,b) S__byte_dump_string(aTHX_ a,b)
#define _to_utf8_case(a,b,c,d,e,f,g) S__to_utf8_case(aTHX_ a,b,c,d,e,f,g)
#define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
#define is_utf8_common(a,b,c,d) S_is_utf8_common(aTHX_ a,b,c,d)
#define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
#define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c)
#define to_lower_latin1 S_to_lower_latin1
-#define unexpected_non_continuation_text(a,b) S_unexpected_non_continuation_text(aTHX_ a,b)
+#define unexpected_non_continuation_text(a,b,c,d) S_unexpected_non_continuation_text(aTHX_ a,b,c,d)
# endif
# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
#define _to_upper_title_latin1(a,b,c,d) Perl__to_upper_title_latin1(aTHX_ a,b,c,d)
$UTF8_ALLOW_LONG,
0, # NUL
2,
- qr/2 bytes, need 1/
+ qr/overlong/
],
[ "overlong malformation, highest 2-byte",
(isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
$UTF8_ALLOW_LONG,
(isASCII) ? 0x7F : utf8::unicode_to_native(0xBF),
2,
- qr/2 bytes, need 1/
+ qr/overlong/
],
[ "overlong malformation, lowest 3-byte",
(isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
$UTF8_ALLOW_LONG,
0, # NUL
3,
- qr/3 bytes, need 1/
+ qr/overlong/
],
[ "overlong malformation, highest 3-byte",
(isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
$UTF8_ALLOW_LONG,
(isASCII) ? 0x7FF : 0x3FF,
3,
- qr/3 bytes, need 2/
+ qr/overlong/
],
[ "overlong malformation, lowest 4-byte",
(isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
$UTF8_ALLOW_LONG,
0, # NUL
4,
- qr/4 bytes, need 1/
+ qr/overlong/
],
[ "overlong malformation, highest 4-byte",
(isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
$UTF8_ALLOW_LONG,
(isASCII) ? 0xFFFF : 0x3FFF,
4,
- qr/4 bytes, need 3/
+ qr/overlong/
],
[ "overlong malformation, lowest 5-byte",
(isASCII)
$UTF8_ALLOW_LONG,
0, # NUL
5,
- qr/5 bytes, need 1/
+ qr/overlong/
],
[ "overlong malformation, highest 5-byte",
(isASCII)
$UTF8_ALLOW_LONG,
(isASCII) ? 0x1FFFFF : 0x3FFFF,
5,
- qr/5 bytes, need 4/
+ qr/overlong/
],
[ "overlong malformation, lowest 6-byte",
(isASCII)
$UTF8_ALLOW_LONG,
0, # NUL
6,
- qr/6 bytes, need 1/
+ qr/overlong/
],
[ "overlong malformation, highest 6-byte",
(isASCII)
$UTF8_ALLOW_LONG,
(isASCII) ? 0x3FFFFFF : 0x3FFFFF,
6,
- qr/6 bytes, need 5/
+ qr/overlong/
],
[ "overlong malformation, lowest 7-byte",
(isASCII)
$UTF8_ALLOW_LONG,
0, # NUL
7,
- qr/7 bytes, need 1/
+ qr/overlong/
],
[ "overlong malformation, highest 7-byte",
(isASCII)
$UTF8_ALLOW_LONG,
(isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
7,
- qr/7 bytes, need 6/
+ qr/overlong/
],
);
0, # There is no way to allow this malformation
$REPLACEMENT,
7,
- qr/overflow/
+ qr/overflows/
],
[ "overflow malformation, can tell on first byte",
"\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
0, # There is no way to allow this malformation
$REPLACEMENT,
13,
- qr/overflow/
+ qr/overflows/
];
}
else {
$UTF8_ALLOW_LONG,
0, # NUL
(isASCII) ? 13 : 14,
- qr/1[34] bytes, need 1/, # 1[34] to work on either ASCII or EBCDIC
+ qr/overlong/,
],
[ "overlong malformation, highest max-byte",
(isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC
$UTF8_ALLOW_LONG,
(isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
(isASCII) ? 13 : 14,
- qr/1[34] bytes, need 7/,
+ qr/overlong/,
];
if (! $is64bit) { # 32-bit EBCDIC
0, # There is no way to allow this malformation
$REPLACEMENT,
14,
- qr/overflow/
+ qr/overflows/
];
}
else { # 64-bit
0, # There is no way to allow this malformation
$REPLACEMENT,
(isASCII) ? 13 : 14,
- qr/overflow/
+ qr/overflows/
];
}
}
"$UTF8_DISALLOW_ABOVE_31_BIT",
'utf8', 0,
(! isASCII) ? 14 : ($is64bit) ? 13 : 7,
- qr/overflow at byte .*, after start byte 0xf/
+ qr/overflows/
],
);
use utf8; %a = ("\xE1\xA0"=>"sterling");
print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
BANG
- qr/^Malformed UTF-8 character \(\d bytes?, need \d, .+\).*start\d+,end$/sm
+ qr/^Malformed UTF-8 character: .*? \(too short; got \d bytes?, need \d\).*start\d+,end$/sm
],
);
foreach (@tests) {
=item *
-XXX Describe change here
+Details as to the exact problem have been added to the diagnostics that
+occur when malformed UTF-8 is encountered when trying to convert to a
+code point.
=back
=item Malformed UTF-8 character (%s)
-(S utf8)(F) Perl detected a string that didn't comply with UTF-8
-encoding rules, even though it had the UTF8 flag on.
+(S utf8)(F) Perl detected a string that should be UTF-8, but didn't
+comply with UTF-8 encoding rules, or represents a code point whose
+ordinal integer value doesn't fit into the word size of the current
+platform (overflows). Details as to the exact malformation are given in
+the variable, C<%s>, part of the message.
One possible cause is that you set the UTF8 flag yourself for data that
you thought to be in UTF-8 but it wasn't (it was for example legacy
assert(stash); assert(name)
#endif
#if defined(PERL_IN_UTF8_C)
+STATIC char * S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len);
+#define PERL_ARGS_ASSERT__BYTE_DUMP_STRING \
+ assert(s)
STATIC UV S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special);
#define PERL_ARGS_ASSERT__TO_UTF8_CASE \
assert(p); assert(ustrp); assert(swashp); assert(normal)
STATIC U8 S_to_lower_latin1(const U8 c, U8 *p, STRLEN *lenp)
__attribute__warn_unused_result__;
-STATIC char * S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN len)
+STATIC char * S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN print_len, const STRLEN non_cont_byte_pos, const STRLEN expect_len)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT \
assert(s)
local $SIG{__WARN__} = sub { push @warnings, $_[0]; };
eval { sprintf "%vd\n", $x };
is (scalar @warnings, 1);
- like ($warnings[0], qr/Malformed UTF-8 character \(unexpected continuation byte 0x82, with no preceding start byte/);
+ like ($warnings[0], qr/Malformed UTF-8 character: \\x82 \(unexpected continuation byte 0x82, with no preceding start byte/);
}
}
my $a = "snøstorm";
}
EXPECT
-Malformed UTF-8 character (unexpected non-continuation byte 0x73, immediately after start byte 0xf8) at - line 9.
-Malformed UTF-8 character (unexpected non-continuation byte 0x73, immediately after start byte 0xf8) at - line 14.
+Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 9.
+Malformed UTF-8 character: \xf8\x73\x74\x6f\x72 (unexpected non-continuation byte 0x73, immediately after start byte 0xf8; need 5 bytes, got 1) at - line 14.
########
use warnings 'utf8';
my $d7ff = uc(chr(0xD7FF));
}
{};$^H=2**400}Â
EXPECT
-Malformed UTF-8 character (unexpected non-continuation byte 0x0a, immediately after start byte 0xc2) at - line 6.
+Malformed UTF-8 character: \xc2\x0a (unexpected non-continuation byte 0x0a, immediately after start byte 0xc2; need 2 bytes, got 1) at - line 6.
my $bad = pack("U0C", 202);
local $SIG{__WARN__} = sub { $@ = "@_" };
my @null = unpack('U0U', $bad);
- like($@, qr/^Malformed UTF-8 character /);
+ like($@, qr/^Malformed UTF-8 character: /);
}
}
3.3.10 n - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
3.4 Concatenation of incomplete sequences
3.4.1 N10 - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
-3.5 Impossible bytes
-3.5.1 n - 1 fe - byte 0xfe
-3.5.2 n - 1 ff - byte 0xff
+3.5 Impossible bytes (but not with Perl's extended UTF-8)
+3.5.1 n - 1 fe - 1 byte, need 7
+3.5.2 n - 1 ff - 1 byte, need 13
3.5.3 N4 - 4 fe:fe:ff:ff - byte 0xfe
4 Overlong sequences
4.1 Examples of an overlong ASCII character
-4.1.1 n - 2 c0:af - 2 bytes, need 1
-4.1.2 n - 3 e0:80:af - 3 bytes, need 1
-4.1.3 n - 4 f0:80:80:af - 4 bytes, need 1
-4.1.4 n - 5 f8:80:80:80:af - 5 bytes, need 1
-4.1.5 n - 6 fc:80:80:80:80:af - 6 bytes, need 1
+4.1.1 n - 2 c0:af - overlong
+4.1.2 n - 3 e0:80:af - overlong
+4.1.3 n - 4 f0:80:80:af - overlong
+4.1.4 n - 5 f8:80:80:80:af - overlong
+4.1.5 n - 6 fc:80:80:80:80:af - overlong
4.2 Maximum overlong sequences
-4.2.1 n - 2 c1:bf - 2 bytes, need 1
-4.2.2 n - 3 e0:9f:bf - 3 bytes, need 2
-4.2.3 n - 4 f0:8f:bf:bf - 4 bytes, need 3
-4.2.4 n - 5 f8:87:bf:bf:bf - 5 bytes, need 4
-4.2.5 n - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
+4.2.1 n - 2 c1:bf - overlong
+4.2.2 n - 3 e0:9f:bf - overlong
+4.2.3 n - 4 f0:8f:bf:bf - overlong
+4.2.4 n - 5 f8:87:bf:bf:bf - overlong
+4.2.5 n - 6 fc:83:bf:bf:bf:bf - overlong
4.3 Overlong representation of the NUL character
-4.3.1 n - 2 c0:80 - 2 bytes, need 1
-4.3.2 n - 3 e0:80:80 - 3 bytes, need 1
-4.3.3 n - 4 f0:80:80:80 - 4 bytes, need 1
-4.3.4 n - 5 f8:80:80:80:80 - 5 bytes, need 1
-4.3.5 n - 6 fc:80:80:80:80:80 - 6 bytes, need 1
+4.3.1 n - 2 c0:80 - overlong
+4.3.2 n - 3 e0:80:80 - overlong
+4.3.3 n - 4 f0:80:80:80 - overlong
+4.3.4 n - 5 f8:80:80:80:80 - overlong
+4.3.5 n - 6 fc:80:80:80:80:80 - overlong
5 Illegal code positions
5.1 Single UTF-16 surrogates
5.1.1 y d800 3 ed:a0:80 1 UTF-16 surrogate 0xd800
#undef FE_ABOVE_OVERLONG
#undef FF_OVERLONG_PREFIX
+STATIC char *
+S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
+{
+ /* Returns a mortalized C string that is a displayable copy of the 'len'
+ * bytes starting at 's', each in a \xXY format. */
+
+ const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
+ trailing NUL */
+ const U8 * const e = s + len;
+ char * output;
+ char * d;
+
+ PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
+
+ Newx(output, output_len, char);
+ SAVEFREEPV(output);
+
+ d = output;
+ for (; s < e; s++) {
+ const unsigned high_nibble = (*s & 0xF0) >> 4;
+ const unsigned low_nibble = (*s & 0x0F);
+
+ *d++ = '\\';
+ *d++ = 'x';
+
+ if (high_nibble < 10) {
+ *d++ = high_nibble + '0';
+ }
+ else {
+ *d++ = high_nibble - 10 + 'a';
+ }
+
+ if (low_nibble < 10) {
+ *d++ = low_nibble + '0';
+ }
+ else {
+ *d++ = low_nibble - 10 + 'a';
+ }
+ }
+
+ *d = '\0';
+ return output;
+}
+
PERL_STATIC_INLINE char *
-S_unexpected_non_continuation_text(pTHX_ const U8 * const s, const STRLEN len)
+S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
+
+ /* How many bytes to print */
+ const STRLEN print_len,
+
+ /* Which one is the non-continuation */
+ const STRLEN non_cont_byte_pos,
+
+ /* How many bytes should there be? */
+ const STRLEN expect_len)
{
/* Return the malformation warning text for an unexpected continuation
* byte. */
- const char * const where = (len == 1)
+ const char * const where = (non_cont_byte_pos == 1)
? "immediately"
- : Perl_form(aTHX_ "%d bytes", (int) len);
+ : Perl_form(aTHX_ "%d bytes",
+ (int) non_cont_byte_pos);
PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
- return Perl_form(aTHX_ "%s (unexpected non-continuation byte 0x%02x,"
- " %s after start byte 0x%02x)",
- malformed_text, *(s + len), where, *s);
+ /* We don't need to pass this parameter, but since it has already been
+ * calculated, it's likely faster to pass it; verify under DEBUGGING */
+ assert(expect_len == UTF8SKIP(s));
+
+ return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+ " %s after start byte 0x%02x; need %d bytes, got %d)",
+ malformed_text,
+ _byte_dump_string(s, print_len),
+ *(s + non_cont_byte_pos),
+ where,
+ *s,
+ (int) expect_len,
+ (int) non_cont_byte_pos);
}
/*
Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
const U8 * const s0 = s;
- U8 overflow_byte = '\0'; /* Save byte in case of overflow */
U8 * send;
UV uv = *s;
STRLEN expectlen;
return 0;
}
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
+ malformed_text));
}
goto malformed;
}
}
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (unexpected continuation byte 0x%02x,"
+ " with no preceding start byte)",
+ malformed_text,
+ _byte_dump_string(s0, 1), *s0));
}
curlen = 1;
goto malformed;
* Set a flag, but keep going in the loop, so that we absorb
* the rest of the bytes that comprise the character. */
overflowed = TRUE;
- overflow_byte = *s; /* Save for warning message's use */
}
uv = UTF8_ACCUMULATE(uv, *s);
}
if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
if (! (flags & UTF8_CHECK_ONLY)) {
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
- unexpected_non_continuation_text(s0, curlen)));
+ unexpected_non_continuation_text(s0,
+ send - s0,
+ s - s0,
+ (int) expectlen)));
}
goto malformed;
}
else if (UNLIKELY(curlen < expectlen)) {
if (! (flags & UTF8_ALLOW_SHORT)) {
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (too short; got %d byte%s, need %d)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ (int)curlen,
+ curlen == 1 ? "" : "s",
+ (int)expectlen));
}
goto malformed;
}
}
if (UNLIKELY(overflowed)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0)));
goto malformed;
}
* value, instead of the replacement character. This is because this
* value is actually well-defined. */
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+ const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ _byte_dump_string(tmpbuf, e - tmpbuf),
+ ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
+ points */
+ uv));
}
goto malformed;
}
/* diag_listed_as: Malformed UTF-8 character (%s) */
Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
"%s %s%s",
- unexpected_non_continuation_text(u - 1, 1),
+ unexpected_non_continuation_text(u - 1, 2, 1, 2),
PL_op ? " in " : "",
PL_op ? OP_DESC(PL_op) : "");
return -2;