#include "perl.h"
#include "invlist_inline.h"
+static const char malformed_text[] = "Malformed UTF-8 character";
static const char unees[] =
- "Malformed UTF-8 character (unexpected end of string)";
+ "Malformed UTF-8 character (unexpected end of string)";
static const char cp_above_legal_max[] =
"Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";
*/
/*
-=for apidoc is_utf8_invariant_string
-
-Returns true iff the first C<len> bytes of the string C<s> are the same
-regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
-EBCDIC machines). That is, if they are UTF-8 invariant. On ASCII-ish
-machines, all the ASCII characters and only the ASCII characters fit this
-definition. On EBCDIC machines, the ASCII-range characters are invariant, but
-so also are the C1 controls and C<\c?> (which isn't in the ASCII range on
-EBCDIC).
-
-If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
-use this option, that C<s> can't have embedded C<NUL> characters and has to
-have a terminating C<NUL> byte).
-
-See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
-
-=cut
-*/
-
-bool
-Perl_is_utf8_invariant_string(const U8 *s, STRLEN len)
-{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
-
- PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING;
-
- for (; x < send; ++x) {
- if (!UTF8_IS_INVARIANT(*x))
- return FALSE;
- }
-
- return TRUE;
-}
-
-/*
=for apidoc uvoffuni_to_utf8_flags
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
-three DISALLOW flags.
+three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
+allowed inputs to the strict UTF-8 traditionally defined by Unicode.
+Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
+C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
+above-Unicode and surrogate flags, but not the non-character ones, as
+defined in
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+See L<perlunicode/Noncharacter code points>.
Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
so using them is more problematic than other above-Unicode code points. Perl
return uvchr_to_utf8_flags(d, uv, flags);
}
-/*
-=for apidoc is_utf8_string
+PERL_STATIC_INLINE bool
+S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
+{
+ /* Returns TRUE if the first code point represented by the Perl-extended-
+ * UTF-8-encoded string starting at 's', and looking no further than 'e -
+ * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
+ *
+ * The function handles the case where the input bytes do not include all
+ * the ones necessary to represent a full character. That is, they may be
+ * the intial bytes of the representation of a code point, but possibly
+ * the final ones necessary for the complete representation may be beyond
+ * 'e - 1'.
+ *
+ * The function assumes that the sequence is well-formed UTF-8 as far as it
+ * goes, and is for a UTF-8 variant code point. If the sequence is
+ * incomplete, the function returns FALSE if there is any well-formed
+ * UTF-8 byte sequence that can complete it in such a way that a code point
+ * < 2**31 is produced; otherwise it returns TRUE.
+ *
+ * Getting this exactly right is slightly tricky, and has to be done in
+ * several places in this file, so is centralized here. It is based on the
+ * following table:
+ *
+ * U+7FFFFFFF (2 ** 31 - 1)
+ * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
+ * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
+ * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
+ * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
+ * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
+ * U+80000000 (2 ** 31):
+ * ASCII: \xFE\x82\x80\x80\x80\x80\x80
+ * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
+ * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+ * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+ * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+ * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
+ */
-Returns true if the first C<len> bytes of string C<s> form a valid
-UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
-using C<strlen(s)> (which means if you use this option, that C<s> can't have
-embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
-that all characters being ASCII constitute 'a valid UTF-8 string'.
+#ifdef EBCDIC
-See also L</is_utf8_invariant_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+ /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
+ const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
+ const STRLEN prefix_len = sizeof(prefix) - 1;
+ const STRLEN len = e - s;
+ const STRLEN cmp_len = MIN(prefix_len, len - 1);
-=cut
-*/
+#else
-bool
-Perl_is_utf8_string(const U8 *s, STRLEN len)
+ PERL_UNUSED_ARG(e);
+
+#endif
+
+ PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
+
+ assert(! UTF8_IS_INVARIANT(*s));
+
+#ifndef EBCDIC
+
+ /* Technically, a start byte of FE can be for a code point that fits into
+ * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
+ * malformation. */
+ return (*s >= 0xFE);
+
+#else
+
+ /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
+ * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
+ * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
+ * bits. */
+ if (*s != 0xFE || len == 1) {
+ return FALSE;
+ }
+
+ /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
+ * \x41 and \x42. */
+ return cBOOL(memGT(s + 1, prefix, cmp_len));
+
+#endif
+
+}
+
+PERL_STATIC_INLINE bool
+S_does_utf8_overflow(const U8 * const s, const U8 * e)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
+ const U8 *x;
+ const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+ /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
+ * platform, that is if it represents a code point larger than the highest
+ * representable code point. (For ASCII platforms, we could use memcmp()
+ * because we don't have to convert each byte to I8, but it's very rare
+ * input indeed that would approach overflow, so the loop below will likely
+ * only get executed once.
+ *
+ * 'e' must not be beyond a full character. If it is less than a full
+ * character, the function returns FALSE if there is any input beyond 'e'
+ * that could result in a non-overflowing code point */
+
+ PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
+ assert(s + UTF8SKIP(s) >= e);
+
+ for (x = s; x < e; x++, y++) {
- PERL_ARGS_ASSERT_IS_UTF8_STRING;
+ /* If this byte is larger than the corresponding highest UTF-8 byte, it
+ * overflows */
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
+ return TRUE;
+ }
- while (x < send) {
- STRLEN len = isUTF8_CHAR(x, send);
- if (UNLIKELY(! len)) {
+ /* If not the same as this byte, it must be smaller, doesn't overflow */
+ if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
return FALSE;
}
- x += len;
}
- return TRUE;
+ /* Got to the end and all bytes are the same. If the input is a whole
+ * character, it doesn't overflow. And if it is a partial character,
+ * there's not enough information to tell, so assume doesn't overflow */
+ return FALSE;
}
-/*
-Implemented as a macro in utf8.h
+PERL_STATIC_INLINE bool
+S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+{
+ /* Overlongs can occur whenever the number of continuation bytes
+ * changes. That means whenever the number of leading 1 bits in a start
+ * byte increases from the next lower start byte. That happens for start
+ * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
+ * illegal start bytes have already been excluded, so don't need to be
+ * tested here;
+ * ASCII platforms: C0, C1
+ * EBCDIC platforms C0, C1, C2, C3, C4, E0
+ *
+ * At least a second byte is required to determine if other sequences will
+ * be an overlong. */
+
+ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+ const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+ PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
+ assert(len > 1 && UTF8_IS_START(*s));
+
+ /* Each platform has overlongs after the start bytes given above (expressed
+ * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
+ * the logic is the same, except the E0 overlong has already been excluded
+ * on EBCDIC platforms. The values below were found by manually
+ * inspecting the UTF-8 patterns. See the tables in utf8.h and
+ * utfebcdic.h. */
+
+# ifdef EBCDIC
+# define F0_ABOVE_OVERLONG 0xB0
+# define F8_ABOVE_OVERLONG 0xA8
+# define FC_ABOVE_OVERLONG 0xA4
+# define FE_ABOVE_OVERLONG 0xA2
+# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
+ /* I8(0xfe) is FF */
+# else
+
+ if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
+ return TRUE;
+ }
+
+# define F0_ABOVE_OVERLONG 0x90
+# define F8_ABOVE_OVERLONG 0x88
+# define FC_ABOVE_OVERLONG 0x84
+# define FE_ABOVE_OVERLONG 0x82
+# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
+# endif
+
+
+ if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
+ || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
+ || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
+ || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
+ {
+ return TRUE;
+ }
-=for apidoc is_utf8_string_loc
+# if defined(UV_IS_QUAD) || defined(EBCDIC)
-Like L</is_utf8_string> but stores the location of the failure (in the
-case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>.
+ /* Check for the FF overlong. This happens only if all these bytes match;
+ * what comes after them doesn't matter. See tables in utf8.h,
+ * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
+ * instead.) */
-See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
+ if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
+ && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
+ sizeof(FF_OVERLONG_PREFIX) - 1)))
+ {
+ return TRUE;
+ }
-=for apidoc is_utf8_string_loclen
+#endif
-Like L</is_utf8_string>() but stores the location of the failure (in the
-case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>, and the number of UTF-8
-encoded characters in the C<el>.
+ return FALSE;
+}
-See also L</is_utf8_string_loc>() and L</is_utf8_string>().
+#undef F0_ABOVE_OVERLONG
+#undef F8_ABOVE_OVERLONG
+#undef FC_ABOVE_OVERLONG
+#undef FE_ABOVE_OVERLONG
+#undef FF_OVERLONG_PREFIX
-=cut
-*/
+STRLEN
+Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+{
+ STRLEN len;
+ const U8 *x;
-bool
-Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+ /* A helper function that should not be called directly.
+ *
+ * This function returns non-zero if the string beginning at 's' and
+ * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
+ * code point; otherwise it returns 0. The examination stops after the
+ * first code point in 's' is validated, not looking at the rest of the
+ * input. If 'e' is such that there are not enough bytes to represent a
+ * complete code point, this function will return non-zero anyway, if the
+ * bytes it does have are well-formed UTF-8 as far as they go, and aren't
+ * excluded by 'flags'.
+ *
+ * A non-zero return gives the number of bytes required to represent the
+ * code point. Be aware that if the input is for a partial character, the
+ * return will be larger than 'e - s'.
+ *
+ * This function assumes that the code point represented is UTF-8 variant.
+ * The caller should have excluded this possibility before calling this
+ * function.
+ *
+ * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
+ * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
+ * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
+ * disallowed by the flags. If the input is only for a partial character,
+ * the function will return non-zero if there is any sequence of
+ * well-formed UTF-8 that, when appended to the input sequence, could
+ * result in an allowed code point; otherwise it returns 0. Non characters
+ * cannot be determined based on partial character input. But many of the
+ * other excluded types can be determined with just the first one or two
+ * bytes.
+ *
+ */
+
+ PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
+
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_ABOVE_31_BIT)));
+ assert(! UTF8_IS_INVARIANT(*s));
+
+ /* A variant char must begin with a start byte */
+ if (UNLIKELY(! UTF8_IS_START(*s))) {
+ return 0;
+ }
+
+ /* Examine a maximum of a single whole code point */
+ if (e - s > UTF8SKIP(s)) {
+ e = s + UTF8SKIP(s);
+ }
+
+ len = e - s;
+
+ if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
+ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+
+ /* The code below is derived from this table. Keep in mind that legal
+ * continuation bytes range between \x80..\xBF for UTF-8, and
+ * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
+ * Hence, we don't have to test the upper edge because if any of those
+ * are encountered, the sequence is malformed, and will fail elsewhere
+ * in this function.
+ * UTF-8 UTF-EBCDIC I8
+ * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
+ * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
+ * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
+ *
+ */
+
+#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
+# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
+# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
+
+# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
+ /* B6 and B7 */ \
+ && ((s1) & 0xFE ) == 0xB6)
+#else
+# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
+# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
+# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
+#endif
+
+ if ( (flags & UTF8_DISALLOW_SUPER)
+ && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
+ return 0; /* Above Unicode */
+ }
+
+ if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
+ && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+ {
+ return 0; /* Above 31 bits */
+ }
+
+ if (len > 1) {
+ const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+ if ( (flags & UTF8_DISALLOW_SUPER)
+ && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
+ {
+ return 0; /* Above Unicode */
+ }
+
+ if ( (flags & UTF8_DISALLOW_SURROGATE)
+ && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
+ {
+ return 0; /* Surrogate */
+ }
+
+ if ( (flags & UTF8_DISALLOW_NONCHAR)
+ && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
+ {
+ return 0; /* Noncharacter code point */
+ }
+ }
+ }
+
+ /* Make sure that all that follows are continuation bytes */
+ for (x = s + 1; x < e; x++) {
+ if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+ return 0;
+ }
+ }
+
+ /* Here is syntactically valid. Next, make sure this isn't the start of an
+ * overlong. */
+ if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
+ return 0;
+ }
+
+ /* And finally, that the code point represented fits in a word on this
+ * platform */
+ if (does_utf8_overflow(s, e)) {
+ return 0;
+ }
+
+ return UTF8SKIP(s);
+}
+
+STATIC char *
+S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
- STRLEN outlen = 0;
+ /* Returns a mortalized C string that is a displayable copy of the 'len'
+ * bytes starting at 's', each in a \xXY format. */
+
+ const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
+ trailing NUL */
+ const U8 * const e = s + len;
+ char * output;
+ char * d;
- PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
+ PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
+
+ Newx(output, output_len, char);
+ SAVEFREEPV(output);
+
+ d = output;
+ for (; s < e; s++) {
+ const unsigned high_nibble = (*s & 0xF0) >> 4;
+ const unsigned low_nibble = (*s & 0x0F);
+
+ *d++ = '\\';
+ *d++ = 'x';
+
+ if (high_nibble < 10) {
+ *d++ = high_nibble + '0';
+ }
+ else {
+ *d++ = high_nibble - 10 + 'a';
+ }
- while (x < send) {
- STRLEN len = isUTF8_CHAR(x, send);
- if (UNLIKELY(! len)) {
- break;
+ if (low_nibble < 10) {
+ *d++ = low_nibble + '0';
+ }
+ else {
+ *d++ = low_nibble - 10 + 'a';
}
- x += len;
- outlen++;
}
- if (el)
- *el = outlen;
+ *d = '\0';
+ return output;
+}
- if (ep)
- *ep = x;
- return (x == send);
+PERL_STATIC_INLINE char *
+S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
+
+ /* How many bytes to print */
+ const STRLEN print_len,
+
+ /* Which one is the non-continuation */
+ const STRLEN non_cont_byte_pos,
+
+ /* How many bytes should there be? */
+ const STRLEN expect_len)
+{
+ /* Return the malformation warning text for an unexpected continuation
+ * byte. */
+
+ const char * const where = (non_cont_byte_pos == 1)
+ ? "immediately"
+ : Perl_form(aTHX_ "%d bytes",
+ (int) non_cont_byte_pos);
+
+ PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
+
+ /* We don't need to pass this parameter, but since it has already been
+ * calculated, it's likely faster to pass it; verify under DEBUGGING */
+ assert(expect_len == UTF8SKIP(s));
+
+ return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+ " %s after start byte 0x%02x; need %d bytes, got %d)",
+ malformed_text,
+ _byte_dump_string(s, print_len),
+ *(s + non_cont_byte_pos),
+ where,
+ *s,
+ (int) expect_len,
+ (int) non_cont_byte_pos);
}
/*
Note that this API requires disambiguation between successful decoding a C<NUL>
character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
-in both cases, 0 is returned. To disambiguate, upon a zero return, see if the
-first byte of C<s> is 0 as well. If so, the input was a C<NUL>; if not, the
-input had an error.
+in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
+be set to 1. To disambiguate, upon a zero return, see if the first byte of
+C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
+error.
Certain code points are considered problematic. These are Unicode surrogates,
Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
By default these are considered regular code points, but certain situations
-warrant special handling for them. If C<flags> contains
-C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all three classes are treated as
-malformations and handled as such. The flags C<UTF8_DISALLOW_SURROGATE>,
-C<UTF8_DISALLOW_NONCHAR>, and C<UTF8_DISALLOW_SUPER> (meaning above the legal
-Unicode maximum) can be set to disallow these categories individually.
-
-The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
+warrant special handling for them, which can be specified using the C<flags>
+parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
+three classes are treated as malformations and handled as such. The flags
+C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
+C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
+disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
+restricts the allowed inputs to the strict UTF-8 traditionally defined by
+Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
+definition given by
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+The difference between traditional strictness and C9 strictness is that the
+latter does not forbid non-character code points. (They are still discouraged,
+however.) For more discussion see L<perlunicode/Noncharacter code points>.
+
+The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
+C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
raised for their respective categories, but otherwise the code points are
considered valid (not malformations). To get a category to both be treated as
Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
const U8 * const s0 = s;
- U8 overflow_byte = '\0'; /* Save byte in case of overflow */
U8 * send;
UV uv = *s;
STRLEN expectlen;
bool overflowed = FALSE;
bool do_overlong_test = TRUE; /* May have to skip this test */
- const char* const malformed_text = "Malformed UTF-8 character";
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
return 0;
}
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
+ malformed_text));
}
goto malformed;
}
}
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (unexpected continuation byte 0x%02x,"
+ " with no preceding start byte)",
+ malformed_text,
+ _byte_dump_string(s0, 1), *s0));
}
curlen = 1;
goto malformed;
* Set a flag, but keep going in the loop, so that we absorb
* the rest of the bytes that comprise the character. */
overflowed = TRUE;
- overflow_byte = *s; /* Save for warning message's use */
}
uv = UTF8_ACCUMULATE(uv, *s);
}
if (UNLIKELY(unexpected_non_continuation)) {
if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
if (! (flags & UTF8_CHECK_ONLY)) {
- if (curlen == 1) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
- }
- else {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
- }
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
+ unexpected_non_continuation_text(s0,
+ send - s0,
+ s - s0,
+ (int) expectlen)));
}
goto malformed;
}
else if (UNLIKELY(curlen < expectlen)) {
if (! (flags & UTF8_ALLOW_SHORT)) {
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (too short; got %d byte%s, need %d)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ (int)curlen,
+ curlen == 1 ? "" : "s",
+ (int)expectlen));
}
goto malformed;
}
}
if (UNLIKELY(overflowed)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0)));
goto malformed;
}
* value, instead of the replacement character. This is because this
* value is actually well-defined. */
if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+ const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_
+ "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ _byte_dump_string(tmpbuf, e - tmpbuf),
+ ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
+ points */
+ uv));
}
goto malformed;
}
|| ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
&& ckWARN_d(WARN_DEPRECATED))))
{
- if (UNICODE_IS_SURROGATE(uv)) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
/* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
* generation of the sv, since no warnings are raised under CHECK */
goto disallowed;
}
}
- else if ((uv > PERL_UNICODE_MAX)) {
+ else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
&& ckWARN_d(WARN_NON_UNICODE))
{
/* The maximum code point ever specified by a standard was
* 2**31 - 1. Anything larger than that is a Perl extension that
* very well may not be understood by other applications (including
- * earlier perl versions on EBCDIC platforms). On ASCII platforms,
- * these code points are indicated by the first UTF-8 byte being
- * 0xFE or 0xFF. We test for these after the regular SUPER ones,
- * and before possibly bailing out, so that the slightly more dire
- * warning will override the regular one. */
- if (
-#ifndef EBCDIC
- (*s0 & 0xFE) == 0xFE /* matches both FE, FF */
-#else
- /* The I8 for 2**31 (U+80000000) is
- * \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
- * and it turns out that on all EBCDIC pages recognized that
- * the UTF-EBCDIC for that code point is
- * \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
- * For the next lower code point, the 1047 UTF-EBCDIC is
- * \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
- * The other code pages differ only in the bytes following
- * \x42. Thus the following works (the minimum continuation
- * byte is \x41). */
- *s0 == 0xFE && send - s0 > 7 && ( s0[1] > 0x41
- || s0[2] > 0x41
- || s0[3] > 0x41
- || s0[4] > 0x41
- || s0[5] > 0x41
- || s0[6] > 0x41
- || s0[7] > 0x42)
-#endif
- && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER
- |UTF8_DISALLOW_ABOVE_31_BIT)))
+ * earlier perl versions on EBCDIC platforms). We test for these
+ * after the regular SUPER ones, and before possibly bailing out,
+ * so that the slightly more dire warning will override the regular
+ * one. */
+ if ( (flags & (UTF8_WARN_ABOVE_31_BIT
+ |UTF8_WARN_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT))
+ && UNLIKELY(is_utf8_cp_above_31_bits(s0, send)))
{
if ( ! (flags & UTF8_CHECK_ONLY)
&& (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
pack_warn = packWARN(WARN_DEPRECATED);
}
}
- else if (UNICODE_IS_NONCHAR(uv)) {
+ else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
&& ckWARN_d(WARN_NONCHAR))
{
}
if (sv) {
- outlier_ret = uv; /* Note we don't bother to convert to native,
- as all the outlier code points are the same
- in both ASCII and EBCDIC */
+ outlier_ret = UNI_TO_NATIVE(uv);
goto do_warn;
}
unless those are turned off.
=cut
+
+Also implemented as a macro in utf8.h
+
*/
ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
-/*
+/* This is marked as deprecated
+ *
=for apidoc utf8_to_uvuni_buf
Only in very rare circumstances should code need to be dealing in Unicode
assert(send > s);
- /* Call the low level routine asking for checks */
- return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
+ /* Call the low level routine, asking for checks */
+ return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
}
/*
}
/*
-=for apidoc utf8_distance
-
-Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
-and C<b>.
-
-WARNING: use only if you *know* that the pointers point inside the
-same UTF-8 buffer.
-
-=cut
-*/
-
-IV
-Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
-{
- PERL_ARGS_ASSERT_UTF8_DISTANCE;
-
- return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
-}
-
-/*
-=for apidoc utf8_hop
-
-Return the UTF-8 pointer C<s> displaced by C<off> characters, either
-forward or backward.
-
-WARNING: do not use the following unless you *know* C<off> is within
-the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
-on the first byte of character or just after the last byte of a character.
-
-=cut
-*/
-
-U8 *
-Perl_utf8_hop(const U8 *s, SSize_t off)
-{
- PERL_ARGS_ASSERT_UTF8_HOP;
-
- /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
- * the bitops (especially ~) can create illegal UTF-8.
- * In other words: in Perl UTF-8 is not just for Unicode. */
-
- if (off >= 0) {
- while (off--)
- s += UTF8SKIP(s);
- }
- else {
- while (off++) {
- s--;
- while (UTF8_IS_CONTINUATION(*s))
- s--;
- }
- }
- return (U8 *)s;
-}
-
-/*
=for apidoc bytes_cmp_utf8
Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
if (UTF8_IS_CONTINUATION(c1)) {
c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
} else {
+ /* diag_listed_as: Malformed UTF-8 character (%s) */
Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
- "Malformed UTF-8 character "
- "(unexpected non-continuation byte 0x%02x"
- ", immediately after start byte 0x%02x)"
- /* Dear diag.t, it's in the pod. */
- "%s%s", c1, c,
- PL_op ? " in " : "",
- PL_op ? OP_DESC(PL_op) : "");
+ "%s %s%s",
+ unexpected_non_continuation_text(u - 1, 2, 1, 2),
+ PL_op ? " in " : "",
+ PL_op ? OP_DESC(PL_op) : "");
return -2;
}
} else {
}
if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
STRLEN char_len;
- if (UTF8_IS_SUPER(s, e)) {
+ if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
if ( ckWARN_d(WARN_NON_UNICODE)
|| ( ckWARN_d(WARN_DEPRECATED)
-#if defined(UV_IS_QUAD)
+#ifndef UV_IS_QUAD
+ && UNLIKELY(is_utf8_cp_above_31_bits(s, e))
+#else /* Below is 64-bit words */
/* 2**63 and up meet these conditions provided we have
* a 64-bit word. */
# ifdef EBCDIC
- && *s == 0xFE && e - s >= UTF8_MAXBYTES
- && s[1] >= 0x49
+ && *s == 0xFE
+ && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8
# else
- && *s == 0xFF && e -s >= UTF8_MAXBYTES
+ && *s == 0xFF
+ /* s[1] being above 0x80 overflows */
&& s[2] >= 0x88
# endif
-#else /* Below is 32-bit words */
- /* 2**31 and above meet these conditions on all EBCDIC
- * pages recognized for 32-bit platforms */
-# ifdef EBCDIC
- && *s == 0xFE && e - s >= UTF8_MAXBYTES
- && s[6] >= 0x43
-# else
- && *s >= 0xFE
-# endif
#endif
)) {
/* A side effect of this function will be to warn */
ok = FALSE;
}
}
- else if (UTF8_IS_SURROGATE(s, e)) {
+ else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
if (ckWARN_d(WARN_SURROGATE)) {
/* This has a different warning than the one the called
* function would output, so can't just call it, unlike we
ok = FALSE;
}
}
- else if ((UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
+ else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
/* A side effect of this function will be to warn */
(void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_NONCHAR);
ok = FALSE;