#include "perl.h"
#include "invlist_inline.h"
+static const char malformed_text[] = "Malformed UTF-8 character";
static const char unees[] =
- "Malformed UTF-8 character (unexpected end of string)";
+ "Malformed UTF-8 character (unexpected end of string)";
static const char cp_above_legal_max[] =
"Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";
#ifdef EBCDIC
- /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
- const U8 * const prefix = "\x41\x41\x41\x41\x41\x41\x42";
+ /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
+ const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
const STRLEN prefix_len = sizeof(prefix) - 1;
const STRLEN len = e - s;
- const cmp_len = MIN(prefix_len, len - 1);
+ const STRLEN cmp_len = MIN(prefix_len, len - 1);
#else
}
-/*
+PERL_STATIC_INLINE bool
+S_does_utf8_overflow(const U8 * const s, const U8 * e)
+{
+ const U8 *x;
+ const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+ /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
+ * platform, that is if it represents a code point larger than the highest
+ * representable code point. (For ASCII platforms, we could use memcmp()
+ * because we don't have to convert each byte to I8, but it's very rare
+ * input indeed that would approach overflow, so the loop below will likely
+ * only get executed once.
+ *
+ * 'e' must not be beyond a full character. If it is less than a full
+ * character, the function returns FALSE if there is any input beyond 'e'
+ * that could result in a non-overflowing code point */
+
+ PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
+ assert(s + UTF8SKIP(s) >= e);
+
+ for (x = s; x < e; x++, y++) {
-A helper function for the macro isUTF8_CHAR(), which should be used instead of
-this function. The macro will handle smaller code points directly saving time,
-using this function as a fall-back for higher code points. This function
-assumes that it is not called with an invariant character, and that
-'s + len - 1' is within bounds of the string 's'.
+ /* If this byte is larger than the corresponding highest UTF-8 byte, it
+ * overflows */
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
+ return TRUE;
+ }
-Tests if the string C<s> of at least length 'len' is a valid variant UTF-8
-character. 0 is returned if not, otherwise, 'len' is returned.
+ /* If not the same as this byte, it must be smaller, doesn't overflow */
+ if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
+ return FALSE;
+ }
+ }
-It is written in such a way that if 'len' is set to less than a full
-character's length, it will test if the bytes ending there form the legal
-beginning of partial character.
+ /* Got to the end and all bytes are the same. If the input is a whole
+ * character, it doesn't overflow. And if it is a partial character,
+ * there's not enough information to tell, so assume doesn't overflow */
+ return FALSE;
+}
-*/
+PERL_STATIC_INLINE bool
+S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+{
+ /* Overlongs can occur whenever the number of continuation bytes
+ * changes. That means whenever the number of leading 1 bits in a start
+ * byte increases from the next lower start byte. That happens for start
+ * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
+ * illegal start bytes have already been excluded, so don't need to be
+ * tested here;
+ * ASCII platforms: C0, C1
+ * EBCDIC platforms C0, C1, C2, C3, C4, E0
+ *
+ * At least a second byte is required to determine if other sequences will
+ * be an overlong. */
+
+ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+ const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+ PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
+ assert(len > 1 && UTF8_IS_START(*s));
+
+ /* Each platform has overlongs after the start bytes given above (expressed
+ * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
+ * the logic is the same, except the E0 overlong has already been excluded
+ * on EBCDIC platforms. The values below were found by manually
+ * inspecting the UTF-8 patterns. See the tables in utf8.h and
+ * utfebcdic.h. */
+
+# ifdef EBCDIC
+# define F0_ABOVE_OVERLONG 0xB0
+# define F8_ABOVE_OVERLONG 0xA8
+# define FC_ABOVE_OVERLONG 0xA4
+# define FE_ABOVE_OVERLONG 0xA2
+# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
+ /* I8(0xfe) is FF */
+# else
+
+ if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
+ return TRUE;
+ }
+
+# define F0_ABOVE_OVERLONG 0x90
+# define F8_ABOVE_OVERLONG 0x88
+# define FC_ABOVE_OVERLONG 0x84
+# define FE_ABOVE_OVERLONG 0x82
+# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
+# endif
+
+
+ if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
+ || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
+ || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
+ || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
+ {
+ return TRUE;
+ }
+
+# if defined(UV_IS_QUAD) || defined(EBCDIC)
+
+ /* Check for the FF overlong. This happens only if all these bytes match;
+ * what comes after them doesn't matter. See tables in utf8.h,
+ * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
+ * instead.) */
+
+ if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
+ && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
+ sizeof(FF_OVERLONG_PREFIX) - 1)))
+ {
+ return TRUE;
+ }
+
+#endif
+
+ return FALSE;
+}
+
+#undef F0_ABOVE_OVERLONG
+#undef F8_ABOVE_OVERLONG
+#undef FC_ABOVE_OVERLONG
+#undef FE_ABOVE_OVERLONG
+#undef FF_OVERLONG_PREFIX
STRLEN
-Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len)
+Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
{
- const U8 *e;
- const U8 *x, *y;
+ STRLEN len;
+ const U8 *x;
- PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
+ /* A helper function that should not be called directly.
+ *
+ * This function returns non-zero if the string beginning at 's' and
+ * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
+ * code point; otherwise it returns 0. The examination stops after the
+ * first code point in 's' is validated, not looking at the rest of the
+ * input. If 'e' is such that there are not enough bytes to represent a
+ * complete code point, this function will return non-zero anyway, if the
+ * bytes it does have are well-formed UTF-8 as far as they go, and aren't
+ * excluded by 'flags'.
+ *
+ * A non-zero return gives the number of bytes required to represent the
+ * code point. Be aware that if the input is for a partial character, the
+ * return will be larger than 'e - s'.
+ *
+ * This function assumes that the code point represented is UTF-8 variant.
+ * The caller should have excluded this possibility before calling this
+ * function.
+ *
+ * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
+ * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
+ * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
+ * disallowed by the flags. If the input is only for a partial character,
+ * the function will return non-zero if there is any sequence of
+ * well-formed UTF-8 that, when appended to the input sequence, could
+ * result in an allowed code point; otherwise it returns 0. Non characters
+ * cannot be determined based on partial character input. But many of the
+ * other excluded types can be determined with just the first one or two
+ * bytes.
+ *
+ */
+ PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
+
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_ABOVE_31_BIT)));
+ assert(! UTF8_IS_INVARIANT(*s));
+
+ /* A variant char must begin with a start byte */
if (UNLIKELY(! UTF8_IS_START(*s))) {
return 0;
}
- e = s + len;
-
- for (x = s + 1; x < e; x++) {
- if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
- return 0;
- }
+ /* Examine a maximum of a single whole code point */
+ if (e - s > UTF8SKIP(s)) {
+ e = s + UTF8SKIP(s);
}
-#ifndef EBCDIC
+ len = e - s;
- /* Here is syntactically valid. Make sure this isn't the start of an
- * overlong. These values were found by manually inspecting the UTF-8
- * patterns. See the tables in utf8.h and utfebcdic.h */
+ if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
+ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
- /* This is not needed on modern perls where C0 and C1 are not considered
- * start bytes. */
-#if 0
- if (UNLIKELY(*s < 0xC2)) {
- return 0;
- }
+ /* The code below is derived from this table. Keep in mind that legal
+ * continuation bytes range between \x80..\xBF for UTF-8, and
+ * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
+ * Hence, we don't have to test the upper edge because if any of those
+ * are encountered, the sequence is malformed, and will fail elsewhere
+ * in this function.
+ * UTF-8 UTF-EBCDIC I8
+ * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
+ * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
+ * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
+ *
+ */
+
+#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
+# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
+# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
+
+# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
+ /* B6 and B7 */ \
+ && ((s1) & 0xFE ) == 0xB6)
+#else
+# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
+# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
+# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
#endif
- if (len > 1) {
- if ( (*s == 0xE0 && UNLIKELY(s[1] < 0xA0))
- || (*s == 0xF0 && UNLIKELY(s[1] < 0x90))
- || (*s == 0xF8 && UNLIKELY(s[1] < 0x88))
- || (*s == 0xFC && UNLIKELY(s[1] < 0x84))
- || (*s == 0xFE && UNLIKELY(s[1] < 0x82)))
- {
- return 0;
+ if ( (flags & UTF8_DISALLOW_SUPER)
+ && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
+ return 0; /* Above Unicode */
}
- if ((len > 6 && UNLIKELY(*s == 0xFF) && UNLIKELY(s[6] < 0x81))) {
- return 0;
- }
- }
-
-#else /* For EBCDIC, we use I8, which is the same on all code pages */
- {
- const U8 s0 = NATIVE_UTF8_TO_I8(*s);
- /* On modern perls C0-C4 aren't considered start bytes */
- if ( /* s0 < 0xC5 || */ s0 == 0xE0) {
- return 0;
+ if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
+ && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+ {
+ return 0; /* Above 31 bits */
}
- if (len >= 1) {
+ if (len > 1) {
const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
- if ( (s0 == 0xF0 && UNLIKELY(s1 < 0xB0))
- || (s0 == 0xF8 && UNLIKELY(s1 < 0xA8))
- || (s0 == 0xFC && UNLIKELY(s1 < 0xA4))
- || (s0 == 0xFE && UNLIKELY(s1 < 0x82)))
+ if ( (flags & UTF8_DISALLOW_SUPER)
+ && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
{
- return 0;
+ return 0; /* Above Unicode */
}
- if ((len > 7 && UNLIKELY(s0 == 0xFF) && UNLIKELY(s[7] < 0xA1))) {
- return 0;
+
+ if ( (flags & UTF8_DISALLOW_SURROGATE)
+ && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
+ {
+ return 0; /* Surrogate */
+ }
+
+ if ( (flags & UTF8_DISALLOW_NONCHAR)
+ && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
+ {
+ return 0; /* Noncharacter code point */
}
}
}
-#endif
+ /* Make sure that all that follows are continuation bytes */
+ for (x = s + 1; x < e; x++) {
+ if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+ return 0;
+ }
+ }
- /* Now see if this would overflow a UV on this platform. See if the UTF8
- * for this code point is larger than that for the highest representable
- * code point */
- y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+ /* Here is syntactically valid. Next, make sure this isn't the start of an
+ * overlong. */
+ if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
+ return 0;
+ }
- for (x = s; x < e; x++, y++) {
+ /* And finally, that the code point represented fits in a word on this
+ * platform */
+ if (does_utf8_overflow(s, e)) {
+ return 0;
+ }
- /* If the same at this byte, go on to the next */
- if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
- continue;
- }
+ return UTF8SKIP(s);
+}
- /* If this is larger, it overflows */
- if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
- return 0;
+STATIC char *
+S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
+{
+ /* Returns a mortalized C string that is a displayable copy of the 'len'
+ * bytes starting at 's', each in a \xXY format. */
+
+ const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
+ trailing NUL */
+ const U8 * const e = s + len;
+ char * output;
+ char * d;
+
+ PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
+
+ Newx(output, output_len, char);
+ SAVEFREEPV(output);
+
+ d = output;
+ for (; s < e; s++) {
+ const unsigned high_nibble = (*s & 0xF0) >> 4;
+ const unsigned low_nibble = (*s & 0x0F);
+
+ *d++ = '\\';
+ *d++ = 'x';
+
+ if (high_nibble < 10) {
+ *d++ = high_nibble + '0';
+ }
+ else {
+ *d++ = high_nibble - 10 + 'a';
}
- /* But if smaller, it won't */
- break;
+ if (low_nibble < 10) {
+ *d++ = low_nibble + '0';
+ }
+ else {
+ *d++ = low_nibble - 10 + 'a';
+ }
}
- return len;
+ *d = '\0';
+ return output;
+}
+
+PERL_STATIC_INLINE char *
+S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
+
+ /* How many bytes to print */
+ const STRLEN print_len,
+
+ /* Which one is the non-continuation */
+ const STRLEN non_cont_byte_pos,
+
+ /* How many bytes should there be? */
+ const STRLEN expect_len)
+{
+ /* Return the malformation warning text for an unexpected continuation
+ * byte. */
+
+ const char * const where = (non_cont_byte_pos == 1)
+ ? "immediately"
+ : Perl_form(aTHX_ "%d bytes",
+ (int) non_cont_byte_pos);
+
+ PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
+
+ /* We don't need to pass this parameter, but since it has already been
+ * calculated, it's likely faster to pass it; verify under DEBUGGING */
+ assert(expect_len == UTF8SKIP(s));
+
+ return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+ " %s after start byte 0x%02x; need %d bytes, got %d)",
+ malformed_text,
+ _byte_dump_string(s, print_len),
+ *(s + non_cont_byte_pos),
+ where,
+ *s,
+ (int) expect_len,
+ (int) non_cont_byte_pos);
}
/*
Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
const U8 * const s0 = s;
- U8 overflow_byte = '\0'; /* Save byte in case of overflow */
U8 * send;
UV uv = *s;
STRLEN expectlen;
bool overflowed = FALSE;
bool do_overlong_test = TRUE; /* May have to skip this test */
- const char* const malformed_text = "Malformed UTF-8 character";
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
return 0;
}
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
+ malformed_text));
}
goto malformed;
}
}
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (unexpected continuation byte 0x%02x,"
+ " with no preceding start byte)",
+ malformed_text,
+ _byte_dump_string(s0, 1), *s0));
}
curlen = 1;
goto malformed;
* Set a flag, but keep going in the loop, so that we absorb
* the rest of the bytes that comprise the character. */
overflowed = TRUE;
- overflow_byte = *s; /* Save for warning message's use */
}
uv = UTF8_ACCUMULATE(uv, *s);
}
if (UNLIKELY(unexpected_non_continuation)) {
if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
if (! (flags & UTF8_CHECK_ONLY)) {
- if (curlen == 1) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
- }
- else {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
- }
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
+ unexpected_non_continuation_text(s0,
+ send - s0,
+ s - s0,
+ (int) expectlen)));
}
goto malformed;
}
else if (UNLIKELY(curlen < expectlen)) {
if (! (flags & UTF8_ALLOW_SHORT)) {
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (too short; got %d byte%s, need %d)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ (int)curlen,
+ curlen == 1 ? "" : "s",
+ (int)expectlen));
}
goto malformed;
}
}
if (UNLIKELY(overflowed)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0)));
goto malformed;
}
* value, instead of the replacement character. This is because this
* value is actually well-defined. */
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+ const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ _byte_dump_string(tmpbuf, e - tmpbuf),
+ ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
+ points */
+ uv));
}
goto malformed;
}
|| ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
&& ckWARN_d(WARN_DEPRECATED))))
{
- if (UNICODE_IS_SURROGATE(uv)) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
* generation of the sv, since no warnings are raised under CHECK */
goto disallowed;
}
}
- else if ((uv > PERL_UNICODE_MAX)) {
+ else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
&& ckWARN_d(WARN_NON_UNICODE))
{
pack_warn = packWARN(WARN_DEPRECATED);
}
}
- else if (UNICODE_IS_NONCHAR(uv)) {
+ else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
&& ckWARN_d(WARN_NONCHAR))
{
}
if (sv) {
- outlier_ret = uv; /* Note we don't bother to convert to native,
- as all the outlier code points are the same
- in both ASCII and EBCDIC */
+ outlier_ret = UNI_TO_NATIVE(uv);
goto do_warn;
}
if (UTF8_IS_CONTINUATION(c1)) {
c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
} else {
+ /* diag_listed_as: Malformed UTF-8 character (%s) */
Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
- "Malformed UTF-8 character "
- "(unexpected non-continuation byte 0x%02x"
- ", immediately after start byte 0x%02x)"
- /* Dear diag.t, it's in the pod. */
- "%s%s", c1, c,
- PL_op ? " in " : "",
- PL_op ? OP_DESC(PL_op) : "");
+ "%s %s%s",
+ unexpected_non_continuation_text(u - 1, 2, 1, 2),
+ PL_op ? " in " : "",
+ PL_op ? OP_DESC(PL_op) : "");
return -2;
}
} else {
}
if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
STRLEN char_len;
- if (UTF8_IS_SUPER(s, e)) {
+ if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
if ( ckWARN_d(WARN_NON_UNICODE)
|| ( ckWARN_d(WARN_DEPRECATED)
#ifndef UV_IS_QUAD
ok = FALSE;
}
}
- else if (UTF8_IS_SURROGATE(s, e)) {
+ else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
if (ckWARN_d(WARN_SURROGATE)) {
/* This has a different warning than the one the called
* function would output, so can't just call it, unlike we
ok = FALSE;
}
}
- else if ((UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
+ else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
/* A side effect of this function will be to warn */
(void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_NONCHAR);
ok = FALSE;