#include "perl.h"
#include "invlist_inline.h"
+static const char malformed_text[] = "Malformed UTF-8 character";
static const char unees[] =
- "Malformed UTF-8 character (unexpected end of string)";
+ "Malformed UTF-8 character (unexpected end of string)";
static const char cp_above_legal_max[] =
"Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";
#ifdef EBCDIC
- /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
- const U8 * const prefix = "\x41\x41\x41\x41\x41\x41\x42";
+ /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
+ const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
const STRLEN prefix_len = sizeof(prefix) - 1;
const STRLEN len = e - s;
- const cmp_len = MIN(prefix_len, len - 1);
+ const STRLEN cmp_len = MIN(prefix_len, len - 1);
#else
}
+PERL_STATIC_INLINE bool
+S_does_utf8_overflow(const U8 * const s, const U8 * e)
+{
+ const U8 *x;
+ const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+ /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
+ * platform, that is if it represents a code point larger than the highest
+ * representable code point. (For ASCII platforms, we could use memcmp()
+ * because we don't have to convert each byte to I8, but it's very rare
+ * input indeed that would approach overflow, so the loop below will likely
+ * only get executed once.
+ *
+ * 'e' must not be beyond a full character. If it is less than a full
+ * character, the function returns FALSE if there is any input beyond 'e'
+ * that could result in a non-overflowing code point */
+
+ PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
+ assert(s + UTF8SKIP(s) >= e);
+
+ for (x = s; x < e; x++, y++) {
+
+ /* If this byte is larger than the corresponding highest UTF-8 byte, it
+ * overflows */
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
+ return TRUE;
+ }
+
+ /* If not the same as this byte, it must be smaller, doesn't overflow */
+ if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
+ return FALSE;
+ }
+ }
+
+ /* Got to the end and all bytes are the same. If the input is a whole
+ * character, it doesn't overflow. And if it is a partial character,
+ * there's not enough information to tell, so assume doesn't overflow */
+ return FALSE;
+}
+
+PERL_STATIC_INLINE bool
+S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+{
+ /* Overlongs can occur whenever the number of continuation bytes
+ * changes. That means whenever the number of leading 1 bits in a start
+ * byte increases from the next lower start byte. That happens for start
+ * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
+ * illegal start bytes have already been excluded, so don't need to be
+ * tested here;
+ * ASCII platforms: C0, C1
+ * EBCDIC platforms C0, C1, C2, C3, C4, E0
+ *
+ * At least a second byte is required to determine if other sequences will
+ * be an overlong. */
+
+ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+ const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+ PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
+ assert(len > 1 && UTF8_IS_START(*s));
+
+ /* Each platform has overlongs after the start bytes given above (expressed
+ * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
+ * the logic is the same, except the E0 overlong has already been excluded
+ * on EBCDIC platforms. The values below were found by manually
+ * inspecting the UTF-8 patterns. See the tables in utf8.h and
+ * utfebcdic.h. */
+
+# ifdef EBCDIC
+# define F0_ABOVE_OVERLONG 0xB0
+# define F8_ABOVE_OVERLONG 0xA8
+# define FC_ABOVE_OVERLONG 0xA4
+# define FE_ABOVE_OVERLONG 0xA2
+# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
+ /* I8(0xfe) is FF */
+# else
+
+ if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
+ return TRUE;
+ }
+
+# define F0_ABOVE_OVERLONG 0x90
+# define F8_ABOVE_OVERLONG 0x88
+# define FC_ABOVE_OVERLONG 0x84
+# define FE_ABOVE_OVERLONG 0x82
+# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
+# endif
+
+
+ if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
+ || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
+ || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
+ || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
+ {
+ return TRUE;
+ }
+
+# if defined(UV_IS_QUAD) || defined(EBCDIC)
+
+ /* Check for the FF overlong. This happens only if all these bytes match;
+ * what comes after them doesn't matter. See tables in utf8.h,
+ * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
+ * instead.) */
+
+ if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
+ && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
+ sizeof(FF_OVERLONG_PREFIX) - 1)))
+ {
+ return TRUE;
+ }
+
+#endif
+
+ return FALSE;
+}
+
+#undef F0_ABOVE_OVERLONG
+#undef F8_ABOVE_OVERLONG
+#undef FC_ABOVE_OVERLONG
+#undef FE_ABOVE_OVERLONG
+#undef FF_OVERLONG_PREFIX
+
STRLEN
-Perl__is_utf8_char_helper(const U8 * const s, const U8 * const e, const U32 flags)
+Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
{
STRLEN len;
- const U8 *x, *y;
+ const U8 *x;
/* A helper function that should not be called directly.
*
return 0;
}
+ /* Examine a maximum of a single whole code point */
+ if (e - s > UTF8SKIP(s)) {
+ e = s + UTF8SKIP(s);
+ }
+
len = e - s;
if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
-# define IS_SUPER_2_BYTE(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
+# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
- /* B6 and B7 */
-# define IS_SURROGATE(s0, s1) ((s0) == 0xF1 && ((s1) & 0xFE ) == 0xB6)
+# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
+ /* B6 and B7 */ \
+ && ((s1) & 0xFE ) == 0xB6)
#else
# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
-# define IS_SUPER_2_BYTE(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
-# define IS_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
+# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
+# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
#endif
if ( (flags & UTF8_DISALLOW_SUPER)
const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
if ( (flags & UTF8_DISALLOW_SUPER)
- && UNLIKELY(IS_SUPER_2_BYTE(s0, s1)))
+ && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
{
return 0; /* Above Unicode */
}
if ( (flags & UTF8_DISALLOW_SURROGATE)
- && UNLIKELY(IS_SURROGATE(s0, s1)))
+ && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
{
return 0; /* Surrogate */
}
}
/* Here is syntactically valid. Next, make sure this isn't the start of an
- * overlong. Overlongs can occur whenever the number of continuation bytes
- * changes. That means whenever the number of leading 1 bits in a start
- * byte increases from the next lower start byte. That happens for start
- * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
- * illegal start bytes have already been excluded, so don't need to be
- * tested here;
- * ASCII platforms: C0, C1
- * EBCDIC platforms C0, C1, C2, C3, C4, E0
- *
- * At least a second byte is required to determine if other sequences will
- * be an overlong. */
+ * overlong. */
+ if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
+ return 0;
+ }
- if (len > 1) {
- const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
- const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+ /* And finally, that the code point represented fits in a word on this
+ * platform */
+ if (does_utf8_overflow(s, e)) {
+ return 0;
+ }
- /* Each platform has overlongs after the start bytes given above
- * (expressed in I8 for EBCDIC). What constitutes an overlong varies
- * by platform, but the logic is the same, except the E0 overlong has
- * already been excluded on EBCDIC platforms. The values below were
- * found by manually inspecting the UTF-8 patterns. See the tables in
- * utf8.h and utfebcdic.h */
+ return UTF8SKIP(s);
+}
-# ifdef EBCDIC
-# define F0_ABOVE_OVERLONG 0xB0
-# define F8_ABOVE_OVERLONG 0xA8
-# define FC_ABOVE_OVERLONG 0xA4
-# define FE_ABOVE_OVERLONG 0xA2
-# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
- /* I8(0xfe) is FF */
-# else
+STATIC char *
+S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
+{
+ /* Returns a mortalized C string that is a displayable copy of the 'len'
+ * bytes starting at 's', each in a \xXY format. */
- if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
- return 0; /* Overlong */
- }
+ const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
+ trailing NUL */
+ const U8 * const e = s + len;
+ char * output;
+ char * d;
-# define F0_ABOVE_OVERLONG 0x90
-# define F8_ABOVE_OVERLONG 0x88
-# define FC_ABOVE_OVERLONG 0x84
-# define FE_ABOVE_OVERLONG 0x82
-# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
-# endif
+ PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
+ Newx(output, output_len, char);
+ SAVEFREEPV(output);
- if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
- || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
- || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
- || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
- {
- return 0; /* Overlong */
- }
-
-# if defined(UV_IS_QUAD) || defined(EBCDIC)
+ d = output;
+ for (; s < e; s++) {
+ const unsigned high_nibble = (*s & 0xF0) >> 4;
+ const unsigned low_nibble = (*s & 0x0F);
- /* Check for the FF overlong. This happens only if all these bytes
- * match; what comes after them doesn't matter. See tables in utf8.h,
- * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
- * instead.) */
+ *d++ = '\\';
+ *d++ = 'x';
- if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
- && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
- sizeof(FF_OVERLONG_PREFIX) - 1)))
- {
- return 0; /* Overlong */
+ if (high_nibble < 10) {
+ *d++ = high_nibble + '0';
+ }
+ else {
+ *d++ = high_nibble - 10 + 'a';
}
-#endif
-
+ if (low_nibble < 10) {
+ *d++ = low_nibble + '0';
+ }
+ else {
+ *d++ = low_nibble - 10 + 'a';
+ }
}
- /* Finally, see if this would overflow a UV on this platform. See if the
- * UTF8 for this code point is larger than that for the highest
- * representable code point. (For ASCII platforms, we could use memcmp()
- * because we don't have to convert each byte to I8, but it's very rare
- * input indeed that would approach overflow, so the loop below will likely
- * only get executed once */
- y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+ *d = '\0';
+ return output;
+}
- for (x = s; x < e; x++, y++) {
+PERL_STATIC_INLINE char *
+S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
- /* If the same as this byte, go on to the next */
- if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
- continue;
- }
+ /* How many bytes to print */
+ const STRLEN print_len,
- /* If this is larger, it overflows */
- if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
- return 0;
- }
-
- /* But if smaller, it won't */
- break;
- }
+ /* Which one is the non-continuation */
+ const STRLEN non_cont_byte_pos,
- return UTF8SKIP(s);
+ /* How many bytes should there be? */
+ const STRLEN expect_len)
+{
+ /* Return the malformation warning text for an unexpected continuation
+ * byte. */
+
+ const char * const where = (non_cont_byte_pos == 1)
+ ? "immediately"
+ : Perl_form(aTHX_ "%d bytes",
+ (int) non_cont_byte_pos);
+
+ PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
+
+ /* We don't need to pass this parameter, but since it has already been
+ * calculated, it's likely faster to pass it; verify under DEBUGGING */
+ assert(expect_len == UTF8SKIP(s));
+
+ return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+ " %s after start byte 0x%02x; need %d bytes, got %d)",
+ malformed_text,
+ _byte_dump_string(s, print_len),
+ *(s + non_cont_byte_pos),
+ where,
+ *s,
+ (int) expect_len,
+ (int) non_cont_byte_pos);
}
-#undef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER
-#undef IS_SUPER_2_BYTE
-#undef IS_SURROGATE
-#undef F0_ABOVE_OVERLONG
-#undef F8_ABOVE_OVERLONG
-#undef FC_ABOVE_OVERLONG
-#undef FE_ABOVE_OVERLONG
-#undef FF_OVERLONG_PREFIX
-
/*
=for apidoc utf8n_to_uvchr
Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
const U8 * const s0 = s;
- U8 overflow_byte = '\0'; /* Save byte in case of overflow */
U8 * send;
UV uv = *s;
STRLEN expectlen;
bool overflowed = FALSE;
bool do_overlong_test = TRUE; /* May have to skip this test */
- const char* const malformed_text = "Malformed UTF-8 character";
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
return 0;
}
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
+ malformed_text));
}
goto malformed;
}
}
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (unexpected continuation byte 0x%02x,"
+ " with no preceding start byte)",
+ malformed_text,
+ _byte_dump_string(s0, 1), *s0));
}
curlen = 1;
goto malformed;
* Set a flag, but keep going in the loop, so that we absorb
* the rest of the bytes that comprise the character. */
overflowed = TRUE;
- overflow_byte = *s; /* Save for warning message's use */
}
uv = UTF8_ACCUMULATE(uv, *s);
}
if (UNLIKELY(unexpected_non_continuation)) {
if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
if (! (flags & UTF8_CHECK_ONLY)) {
- if (curlen == 1) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
- }
- else {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
- }
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
+ unexpected_non_continuation_text(s0,
+ send - s0,
+ s - s0,
+ (int) expectlen)));
}
goto malformed;
}
else if (UNLIKELY(curlen < expectlen)) {
if (! (flags & UTF8_ALLOW_SHORT)) {
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (too short; got %d byte%s, need %d)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ (int)curlen,
+ curlen == 1 ? "" : "s",
+ (int)expectlen));
}
goto malformed;
}
}
if (UNLIKELY(overflowed)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0)));
goto malformed;
}
* value, instead of the replacement character. This is because this
* value is actually well-defined. */
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+ const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ _byte_dump_string(tmpbuf, e - tmpbuf),
+ ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
+ points */
+ uv));
}
goto malformed;
}
|| ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
&& ckWARN_d(WARN_DEPRECATED))))
{
- if (UNICODE_IS_SURROGATE(uv)) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
* generation of the sv, since no warnings are raised under CHECK */
goto disallowed;
}
}
- else if ((uv > PERL_UNICODE_MAX)) {
+ else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
&& ckWARN_d(WARN_NON_UNICODE))
{
pack_warn = packWARN(WARN_DEPRECATED);
}
}
- else if (UNICODE_IS_NONCHAR(uv)) {
+ else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
&& ckWARN_d(WARN_NONCHAR))
{
}
if (sv) {
- outlier_ret = uv; /* Note we don't bother to convert to native,
- as all the outlier code points are the same
- in both ASCII and EBCDIC */
+ outlier_ret = UNI_TO_NATIVE(uv);
goto do_warn;
}
if (UTF8_IS_CONTINUATION(c1)) {
c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
} else {
+ /* diag_listed_as: Malformed UTF-8 character (%s) */
Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
- "Malformed UTF-8 character "
- "(unexpected non-continuation byte 0x%02x"
- ", immediately after start byte 0x%02x)"
- /* Dear diag.t, it's in the pod. */
- "%s%s", c1, c,
- PL_op ? " in " : "",
- PL_op ? OP_DESC(PL_op) : "");
+ "%s %s%s",
+ unexpected_non_continuation_text(u - 1, 2, 1, 2),
+ PL_op ? " in " : "",
+ PL_op ? OP_DESC(PL_op) : "");
return -2;
}
} else {
}
if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
STRLEN char_len;
- if (UTF8_IS_SUPER(s, e)) {
+ if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
if ( ckWARN_d(WARN_NON_UNICODE)
|| ( ckWARN_d(WARN_DEPRECATED)
#ifndef UV_IS_QUAD
ok = FALSE;
}
}
- else if (UTF8_IS_SURROGATE(s, e)) {
+ else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
if (ckWARN_d(WARN_SURROGATE)) {
/* This has a different warning than the one the called
* function would output, so can't just call it, unlike we
ok = FALSE;
}
}
- else if ((UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
+ else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
/* A side effect of this function will be to warn */
(void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_NONCHAR);
ok = FALSE;