X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/83dc0f42cb8bfe955e45a5b44b989daddf87570a..2a67563b073377398a953d7df7d276295521bfa6:/utf8.c diff --git a/utf8.c b/utf8.c index f3edc25..3ded4a6 100644 --- a/utf8.c +++ b/utf8.c @@ -33,10 +33,12 @@ #include "perl.h" #include "invlist_inline.h" +static const char malformed_text[] = "Malformed UTF-8 character"; static const char unees[] = - "Malformed UTF-8 character (unexpected end of string)"; + "Malformed UTF-8 character (unexpected end of string)"; static const char cp_above_legal_max[] = - "Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf""; + "Use of code point 0x%" UVXf " is not allowed; " + "the permissible max is 0x%" UVXf; #define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX)) @@ -51,6 +53,54 @@ within non-zero characters. =cut */ +void +Perl__force_out_malformed_utf8_message(pTHX_ + const U8 *const p, /* First byte in UTF-8 sequence */ + const U8 * const e, /* Final byte in sequence (may include + multiple chars */ + const U32 flags, /* Flags to pass to utf8n_to_uvchr(), + usually 0, or some DISALLOW flags */ + const bool die_here) /* If TRUE, this function does not return */ +{ + /* This core-only function is to be called when a malformed UTF-8 character + * is found, in order to output the detailed information about the + * malformation before dieing. The reason it exists is for the occasions + * when such a malformation is fatal, but warnings might be turned off, so + * that normally they would not be actually output. This ensures that they + * do get output. Because a sequence may be malformed in more than one + * way, multiple messages may be generated, so we can't make them fatal, as + * that would cause the first one to die. + * + * Instead we pretend -W was passed to perl, then die afterwards. The + * flexibility is here to return to the caller so they can finish up and + * die themselves */ + U32 errors; + + PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE; + + ENTER; + SAVEI8(PL_dowarn); + SAVESPTR(PL_curcop); + + PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; + if (PL_curcop) { + PL_curcop->cop_warnings = pWARN_ALL; + } + + (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors); + + LEAVE; + + if (! errors) { + Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should" + " be called only when there are errors found"); + } + + if (die_here) { + Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + } +} + /* =for apidoc uvoffuni_to_utf8_flags @@ -71,7 +121,7 @@ For details, see the description for L. STMT_START { \ if (flags & UNICODE_WARN_SURROGATE) { \ Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \ - "UTF-16 surrogate U+%04"UVXf, uv); \ + "UTF-16 surrogate U+%04" UVXf, uv); \ } \ if (flags & UNICODE_DISALLOW_SURROGATE) { \ return NULL; \ @@ -82,7 +132,7 @@ For details, see the description for L. STMT_START { \ if (flags & UNICODE_WARN_NONCHAR) { \ Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \ - "Unicode non-character U+%04"UVXf" is not " \ + "Unicode non-character U+%04" UVXf " is not " \ "recommended for open interchange", uv); \ } \ if (flags & UNICODE_DISALLOW_NONCHAR) { \ @@ -97,7 +147,7 @@ For details, see the description for L. #define MASK UTF_CONTINUATION_MASK U8 * -Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) +Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) { PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; @@ -149,11 +199,8 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) * performance hit on these high EBCDIC code points. */ if (UNLIKELY(UNICODE_IS_SUPER(uv))) { - if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP); } if ( (flags & UNICODE_WARN_SUPER) || ( UNICODE_IS_ABOVE_31_BIT(uv) @@ -163,8 +210,8 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) /* Choose the more dire applicable warning */ (UNICODE_IS_ABOVE_31_BIT(uv)) - ? "Code point 0x%"UVXf" is not Unicode, and not portable" - : "Code point 0x%"UVXf" is not Unicode, may not be portable", + ? "Code point 0x%" UVXf " is not Unicode, and not portable" + : "Code point 0x%" UVXf " is not Unicode, may not be portable", uv); } if (flags & UNICODE_DISALLOW_SUPER @@ -382,11 +429,11 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) #ifdef EBCDIC - /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */ - const U8 * const prefix = "\x41\x41\x41\x41\x41\x41\x42"; + /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */ + const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42"; const STRLEN prefix_len = sizeof(prefix) - 1; const STRLEN len = e - s; - const cmp_len = MIN(prefix_len, len - 1); + const STRLEN cmp_len = MIN(prefix_len, len - 1); #else @@ -423,120 +470,376 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) } -/* +PERL_STATIC_INLINE bool +S_does_utf8_overflow(const U8 * const s, const U8 * e) +{ + const U8 *x; + const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; -A helper function for the macro isUTF8_CHAR(), which should be used instead of -this function. The macro will handle smaller code points directly saving time, -using this function as a fall-back for higher code points. This function -assumes that it is not called with an invariant character, and that -'s + len - 1' is within bounds of the string 's'. +#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) -Tests if the string C of at least length 'len' is a valid variant UTF-8 -character. 0 is returned if not, otherwise, 'len' is returned. + const STRLEN len = e - s; -It is written in such a way that if 'len' is set to less than a full -character's length, it will test if the bytes ending there form the legal -beginning of partial character. +#endif -*/ + /* Returns a boolean as to if this UTF-8 string would overflow a UV on this + * platform, that is if it represents a code point larger than the highest + * representable code point. (For ASCII platforms, we could use memcmp() + * because we don't have to convert each byte to I8, but it's very rare + * input indeed that would approach overflow, so the loop below will likely + * only get executed once. + * + * 'e' must not be beyond a full character. If it is less than a full + * character, the function returns FALSE if there is any input beyond 'e' + * that could result in a non-overflowing code point */ -STRLEN -Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len) -{ - const U8 *e; - const U8 *x, *y; + PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; + assert(s <= e && s + UTF8SKIP(s) >= e); - PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW; +#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) - if (UNLIKELY(! UTF8_IS_START(*s))) { - return 0; + /* On 32 bit ASCII machines, many overlongs that start with FF don't + * overflow */ + + if (isFF_OVERLONG(s, len)) { + const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84"; + return memGE(s, max_32_bit_overlong, + MIN(len, sizeof(max_32_bit_overlong) - 1)); } - e = s + len; +#endif - for (x = s + 1; x < e; x++) { - if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) { - return 0; + for (x = s; x < e; x++, y++) { + + /* If this byte is larger than the corresponding highest UTF-8 byte, it + * overflows */ + if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) { + return TRUE; + } + + /* If not the same as this byte, it must be smaller, doesn't overflow */ + if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) { + return FALSE; } } -#ifndef EBCDIC + /* Got to the end and all bytes are the same. If the input is a whole + * character, it doesn't overflow. And if it is a partial character, + * there's not enough information to tell, so assume doesn't overflow */ + return FALSE; +} - /* Here is syntactically valid. Make sure this isn't the start of an - * overlong. These values were found by manually inspecting the UTF-8 - * patterns. See the tables in utf8.h and utfebcdic.h */ +PERL_STATIC_INLINE bool +S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) +{ + /* Overlongs can occur whenever the number of continuation bytes + * changes. That means whenever the number of leading 1 bits in a start + * byte increases from the next lower start byte. That happens for start + * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following + * illegal start bytes have already been excluded, so don't need to be + * tested here; + * ASCII platforms: C0, C1 + * EBCDIC platforms C0, C1, C2, C3, C4, E0 + * + * At least a second byte is required to determine if other sequences will + * be an overlong. */ + + const U8 s0 = NATIVE_UTF8_TO_I8(s[0]); + const U8 s1 = NATIVE_UTF8_TO_I8(s[1]); + + PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK; + assert(len > 1 && UTF8_IS_START(*s)); + + /* Each platform has overlongs after the start bytes given above (expressed + * in I8 for EBCDIC). What constitutes an overlong varies by platform, but + * the logic is the same, except the E0 overlong has already been excluded + * on EBCDIC platforms. The values below were found by manually + * inspecting the UTF-8 patterns. See the tables in utf8.h and + * utfebcdic.h. */ + +# ifdef EBCDIC +# define F0_ABOVE_OVERLONG 0xB0 +# define F8_ABOVE_OVERLONG 0xA8 +# define FC_ABOVE_OVERLONG 0xA4 +# define FE_ABOVE_OVERLONG 0xA2 +# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41" + /* I8(0xfe) is FF */ +# else + + if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) { + return TRUE; + } + +# define F0_ABOVE_OVERLONG 0x90 +# define F8_ABOVE_OVERLONG 0x88 +# define FC_ABOVE_OVERLONG 0x84 +# define FE_ABOVE_OVERLONG 0x82 +# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80" +# endif + + + if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG)) + || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG)) + || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG)) + || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG))) + { + return TRUE; + } + + /* Check for the FF overlong */ + return isFF_OVERLONG(s, len); +} + +PERL_STATIC_INLINE bool +S_isFF_OVERLONG(const U8 * const s, const STRLEN len) +{ + PERL_ARGS_ASSERT_ISFF_OVERLONG; + + /* Check for the FF overlong. This happens only if all these bytes match; + * what comes after them doesn't matter. See tables in utf8.h, + * utfebcdic.h. */ + + return len >= sizeof(FF_OVERLONG_PREFIX) - 1 + && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX, + sizeof(FF_OVERLONG_PREFIX) - 1)); +} + +#undef F0_ABOVE_OVERLONG +#undef F8_ABOVE_OVERLONG +#undef FC_ABOVE_OVERLONG +#undef FE_ABOVE_OVERLONG +#undef FF_OVERLONG_PREFIX + +STRLEN +Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) +{ + STRLEN len; + const U8 *x; + + /* A helper function that should not be called directly. + * + * This function returns non-zero if the string beginning at 's' and + * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a + * code point; otherwise it returns 0. The examination stops after the + * first code point in 's' is validated, not looking at the rest of the + * input. If 'e' is such that there are not enough bytes to represent a + * complete code point, this function will return non-zero anyway, if the + * bytes it does have are well-formed UTF-8 as far as they go, and aren't + * excluded by 'flags'. + * + * A non-zero return gives the number of bytes required to represent the + * code point. Be aware that if the input is for a partial character, the + * return will be larger than 'e - s'. + * + * This function assumes that the code point represented is UTF-8 variant. + * The caller should have excluded this possibility before calling this + * function. + * + * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags + * accepted by L. If non-zero, this function will return + * 0 if the code point represented is well-formed Perl-extended-UTF-8, but + * disallowed by the flags. If the input is only for a partial character, + * the function will return non-zero if there is any sequence of + * well-formed UTF-8 that, when appended to the input sequence, could + * result in an allowed code point; otherwise it returns 0. Non characters + * cannot be determined based on partial character input. But many of the + * other excluded types can be determined with just the first one or two + * bytes. + * + */ + + PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER; - /* This is not needed on modern perls where C0 and C1 are not considered - * start bytes. */ -#if 0 - if (UNLIKELY(*s < 0xC2)) { + assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_DISALLOW_ABOVE_31_BIT))); + assert(! UTF8_IS_INVARIANT(*s)); + + /* A variant char must begin with a start byte */ + if (UNLIKELY(! UTF8_IS_START(*s))) { return 0; } + + /* Examine a maximum of a single whole code point */ + if (e - s > UTF8SKIP(s)) { + e = s + UTF8SKIP(s); + } + + len = e - s; + + if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) { + const U8 s0 = NATIVE_UTF8_TO_I8(s[0]); + + /* The code below is derived from this table. Keep in mind that legal + * continuation bytes range between \x80..\xBF for UTF-8, and + * \xA0..\xBF for I8. Anything above those aren't continuation bytes. + * Hence, we don't have to test the upper edge because if any of those + * are encountered, the sequence is malformed, and will fail elsewhere + * in this function. + * UTF-8 UTF-EBCDIC I8 + * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate + * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate + * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode + * + */ + +#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */ +# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA +# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2) + +# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \ + /* B6 and B7 */ \ + && ((s1) & 0xFE ) == 0xB6) +#else +# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5 +# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90) +# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0) #endif - if (len > 1) { - if ( (*s == 0xE0 && UNLIKELY(s[1] < 0xA0)) - || (*s == 0xF0 && UNLIKELY(s[1] < 0x90)) - || (*s == 0xF8 && UNLIKELY(s[1] < 0x88)) - || (*s == 0xFC && UNLIKELY(s[1] < 0x84)) - || (*s == 0xFE && UNLIKELY(s[1] < 0x82))) + if ( (flags & UTF8_DISALLOW_SUPER) + && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) { - return 0; + return 0; /* Above Unicode */ } - if ((len > 6 && UNLIKELY(*s == 0xFF) && UNLIKELY(s[6] < 0x81))) { - return 0; - } - } - -#else /* For EBCDIC, we use I8, which is the same on all code pages */ - { - const U8 s0 = NATIVE_UTF8_TO_I8(*s); - /* On modern perls C0-C4 aren't considered start bytes */ - if ( /* s0 < 0xC5 || */ s0 == 0xE0) { - return 0; + if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT) + && UNLIKELY(is_utf8_cp_above_31_bits(s, e))) + { + return 0; /* Above 31 bits */ } - if (len >= 1) { + if (len > 1) { const U8 s1 = NATIVE_UTF8_TO_I8(s[1]); - if ( (s0 == 0xF0 && UNLIKELY(s1 < 0xB0)) - || (s0 == 0xF8 && UNLIKELY(s1 < 0xA8)) - || (s0 == 0xFC && UNLIKELY(s1 < 0xA4)) - || (s0 == 0xFE && UNLIKELY(s1 < 0x82))) + if ( (flags & UTF8_DISALLOW_SUPER) + && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1))) { - return 0; + return 0; /* Above Unicode */ } - if ((len > 7 && UNLIKELY(s0 == 0xFF) && UNLIKELY(s[7] < 0xA1))) { - return 0; + + if ( (flags & UTF8_DISALLOW_SURROGATE) + && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1))) + { + return 0; /* Surrogate */ + } + + if ( (flags & UTF8_DISALLOW_NONCHAR) + && UNLIKELY(UTF8_IS_NONCHAR(s, e))) + { + return 0; /* Noncharacter code point */ } } } -#endif + /* Make sure that all that follows are continuation bytes */ + for (x = s + 1; x < e; x++) { + if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) { + return 0; + } + } - /* Now see if this would overflow a UV on this platform. See if the UTF8 - * for this code point is larger than that for the highest representable - * code point */ - y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; + /* Here is syntactically valid. Next, make sure this isn't the start of an + * overlong. */ + if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) { + return 0; + } - for (x = s; x < e; x++, y++) { + /* And finally, that the code point represented fits in a word on this + * platform */ + if (does_utf8_overflow(s, e)) { + return 0; + } - /* If the same at this byte, go on to the next */ - if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) { - continue; + return UTF8SKIP(s); +} + +char * +Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) +{ + /* Returns a mortalized C string that is a displayable copy of the 'len' + * bytes starting at 's'. 'format' gives how to display each byte. + * Currently, there are only two formats, so it is currently a bool: + * 0 \xab + * 1 ab (that is a space between two hex digit bytes) + */ + + const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a + trailing NUL */ + const U8 * const e = s + len; + char * output; + char * d; + + PERL_ARGS_ASSERT__BYTE_DUMP_STRING; + + Newx(output, output_len, char); + SAVEFREEPV(output); + + d = output; + for (; s < e; s++) { + const unsigned high_nibble = (*s & 0xF0) >> 4; + const unsigned low_nibble = (*s & 0x0F); + + if (format) { + *d++ = ' '; + } + else { + *d++ = '\\'; + *d++ = 'x'; } - /* If this is larger, it overflows */ - if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) { - return 0; + if (high_nibble < 10) { + *d++ = high_nibble + '0'; + } + else { + *d++ = high_nibble - 10 + 'a'; } - /* But if smaller, it won't */ - break; + if (low_nibble < 10) { + *d++ = low_nibble + '0'; + } + else { + *d++ = low_nibble - 10 + 'a'; + } } - return len; + *d = '\0'; + return output; +} + +PERL_STATIC_INLINE char * +S_unexpected_non_continuation_text(pTHX_ const U8 * const s, + + /* How many bytes to print */ + STRLEN print_len, + + /* Which one is the non-continuation */ + const STRLEN non_cont_byte_pos, + + /* How many bytes should there be? */ + const STRLEN expect_len) +{ + /* Return the malformation warning text for an unexpected continuation + * byte. */ + + const char * const where = (non_cont_byte_pos == 1) + ? "immediately" + : Perl_form(aTHX_ "%d bytes", + (int) non_cont_byte_pos); + + PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT; + + /* We don't need to pass this parameter, but since it has already been + * calculated, it's likely faster to pass it; verify under DEBUGGING */ + assert(expect_len == UTF8SKIP(s)); + + return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," + " %s after start byte 0x%02x; need %d bytes, got %d)", + malformed_text, + _byte_dump_string(s, print_len, 0), + *(s + non_cont_byte_pos), + where, + *s, + (int) expect_len, + (int) non_cont_byte_pos); } /* @@ -553,10 +856,13 @@ C bytes; C<*retlen> (if C isn't NULL) will be set to the length, in bytes, of that character. The value of C determines the behavior when C does not point to a -well-formed UTF-8 character. If C is 0, when a malformation is found, -zero is returned and C<*retlen> is set so that (S + C<*retlen>>) is the -next possible position in C that could begin a non-malformed character. -Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised. +well-formed UTF-8 character. If C is 0, encountering a malformation +causes zero to be returned and C<*retlen> is set so that (S + C<*retlen>>) +is the next possible position in C that could begin a non-malformed +character. Also, if UTF-8 warnings haven't been lexically disabled, a warning +is raised. Some UTF-8 input sequences may contain multiple malformations. +This function tries to find every possible one in each call, so multiple +warnings can be raised for each sequence. Various ALLOW flags can be set in C to allow (and not warn on) individual types of malformations, such as the sequence being overlong (that @@ -564,10 +870,10 @@ is, when there is a shorter sequence that can express the same code point; overlong sequences are expressly forbidden in the UTF-8 standard due to potential security issues). Another malformation example is the first byte of a character not being a legal first byte. See F for the list of such -flags. For allowed 0 length strings, this function returns 0; for allowed -overlong sequences, the computed code point is returned; for all other allowed -malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no -determinable reasonable value. +flags. Even if allowed, this function generally returns the Unicode +REPLACEMENT CHARACTER when it encounters a malformation. There are flags in +F to override this behavior for the overlong malformations, but don't +do that except for very specialized purposes. The C flag overrides the behavior when a non-allowed (by other flags) malformation is found. If this flag is set, the routine assumes that @@ -579,7 +885,7 @@ character, and an error return (unless the C flag is set), as in both cases, 0 is returned, and, depending on the malformation, C may be set to 1. To disambiguate, upon a zero return, see if the first byte of C is 0 as well. If so, the input was a C; if not, the input had an -error. +error. Or you can use C>. Certain code points are considered problematic. These are Unicode surrogates, Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF. @@ -609,14 +915,14 @@ C is also specified.) It is now deprecated to have very high code points (above C on the platforms) and this function will raise a deprecation warning for these (unless -such warnings are turned off). This value, is typically 0x7FFF_FFFF (2**31 -1) +such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1) in a 32-bit word. Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, so using them is more problematic than other above-Unicode code points. Perl invented an extension to UTF-8 to represent the ones above 2**36-1, so it is likely that non-Perl languages will not be able to read files that contain -these that written by the perl interpreter; nor would Perl understand files +these; nor would Perl understand files written by something that uses a different extension. For these reasons, there is a separate set of flags that can warn and/or disallow these extremely high code points, even if other above-Unicode ones are accepted. These are the @@ -642,27 +948,147 @@ use and those yet to be assigned, are never considered malformed and never warn. =cut + +Also implemented as a macro in utf8.h */ UV -Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) +Perl_utf8n_to_uvchr(pTHX_ const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags) { - const U8 * const s0 = s; - U8 overflow_byte = '\0'; /* Save byte in case of overflow */ - U8 * send; - UV uv = *s; - STRLEN expectlen; - SV* sv = NULL; - UV outlier_ret = 0; /* return value when input is in error or problematic - */ - UV pack_warn = 0; /* Save result of packWARN() for later */ - bool unexpected_non_continuation = FALSE; - bool overflowed = FALSE; - bool do_overlong_test = TRUE; /* May have to skip this test */ + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; + + return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL); +} - const char* const malformed_text = "Malformed UTF-8 character"; +/* - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; +=for apidoc utf8n_to_uvchr_error + +THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. +Most code should use L() rather than call this directly. + +This function is for code that needs to know what the precise malformation(s) +are when an error is found. + +It is like C> but it takes an extra parameter placed after +all the others, C. If this parameter is 0, this function behaves +identically to C>. Otherwise, C should be a pointer +to a C variable, which this function sets to indicate any errors found. +Upon return, if C<*errors> is 0, there were no errors found. Otherwise, +C<*errors> is the bit-wise C of the bits described in the list below. Some +of these bits will be set if a malformation is found, even if the input +C parameter indicates that the given malformation is allowed; those +exceptions are noted: + +=over 4 + +=item C + +The code point represented by the input UTF-8 sequence occupies more than 31 +bits. +This bit is set only if the input C parameter contains either the +C or the C flags. + +=item C + +The input sequence was malformed in that the first byte was a a UTF-8 +continuation byte. + +=item C + +The input C parameter was 0. + +=item C + +The input sequence was malformed in that there is some other sequence that +evaluates to the same code point, but that sequence is shorter than this one. + +=item C + +The code point represented by the input UTF-8 sequence is for a Unicode +non-character code point. +This bit is set only if the input C parameter contains either the +C or the C flags. + +=item C + +The input sequence was malformed in that a non-continuation type byte was found +in a position where only a continuation type one should be. + +=item C + +The input sequence was malformed in that it is for a code point that is not +representable in the number of bits available in a UV on the current platform. + +=item C + +The input sequence was malformed in that C is smaller than required for +a complete sequence. In other words, the input is for a partial character +sequence. + +=item C + +The input sequence was malformed in that it is for a non-Unicode code point; +that is, one above the legal Unicode maximum. +This bit is set only if the input C parameter contains either the +C or the C flags. + +=item C + +The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate +code point. +This bit is set only if the input C parameter contains either the +C or the C flags. + +=back + +To do your own error handling, call this function with the C +flag to suppress any warnings, and then examine the C<*errors> return. + +=cut +*/ + +UV +Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors) +{ + const U8 * const s0 = s; + U8 * send = NULL; /* (initialized to silence compilers' wrong + warning) */ + U32 possible_problems = 0; /* A bit is set here for each potential problem + found as we go along */ + UV uv = *s; + STRLEN expectlen = 0; /* How long should this sequence be? + (initialized to silence compilers' wrong + warning) */ + STRLEN avail_len = 0; /* When input is too short, gives what that is */ + U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL; + this gets set and discarded */ + + /* The below are used only if there is both an overlong malformation and a + * too short one. Otherwise the first two are set to 's0' and 'send', and + * the third not used at all */ + U8 * adjusted_s0 = (U8 *) s0; + U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong + warning) */ + U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this + routine; see [perl #130921] */ + UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */ + + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; + + if (errors) { + *errors = 0; + } + else { + errors = &discard_errors; + } /* The order of malformation tests here is important. We should consume as * few bytes as possible in order to not skip any valid character. This is @@ -682,21 +1108,20 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) * returning to the caller C<*retlen> pointing to the very next byte (one * which is actually part of of the overflowing sequence), that could look * legitimate to the caller, which could discard the initial partial - * sequence and process the rest, inappropriately */ + * sequence and process the rest, inappropriately. + * + * Some possible input sequences are malformed in more than one way. This + * function goes to lengths to try to find all of them. This is necessary + * for correctness, as the inputs may allow one malformation but not + * another, and if we abandon searching for others after finding the + * allowed one, we could allow in something that shouldn't have been. + */ - /* Zero length strings, if allowed, of necessity are zero */ if (UNLIKELY(curlen == 0)) { - if (retlen) { - *retlen = 0; - } - - if (flags & UTF8_ALLOW_EMPTY) { - return 0; - } - if (! (flags & UTF8_CHECK_ONLY)) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text)); - } - goto malformed; + possible_problems |= UTF8_GOT_EMPTY; + curlen = 0; + uv = UNICODE_REPLACEMENT; + goto ready_to_handle_errors; } expectlen = UTF8SKIP(s); @@ -716,134 +1141,127 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) /* A continuation character can't start a valid sequence */ if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { - if (flags & UTF8_ALLOW_CONTINUATION) { - if (retlen) { - *retlen = 1; - } - return UNICODE_REPLACEMENT; - } - - if (! (flags & UTF8_CHECK_ONLY)) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0)); - } - curlen = 1; - goto malformed; + possible_problems |= UTF8_GOT_CONTINUATION; + curlen = 1; + uv = UNICODE_REPLACEMENT; + goto ready_to_handle_errors; } /* Here is not a continuation byte, nor an invariant. The only thing left - * is a start byte (possibly for an overlong) */ + * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START + * because it excludes start bytes like \xC0 that always lead to + * overlongs.) */ /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits * that indicate the number of bytes in the character's whole UTF-8 * sequence, leaving just the bits that are part of the value. */ uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); - /* Now, loop through the remaining bytes in the character's sequence, - * accumulating each into the working value as we go. Be sure to not look - * past the end of the input string */ - send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen); + /* Setup the loop end point, making sure to not look past the end of the + * input string, and flag it as too short if the size isn't big enough. */ + send = (U8*) s0; + if (UNLIKELY(curlen < expectlen)) { + possible_problems |= UTF8_GOT_SHORT; + avail_len = curlen; + send += curlen; + } + else { + send += expectlen; + } + adjusted_send = send; + /* Now, loop through the remaining bytes in the character's sequence, + * accumulating each into the working value as we go. */ for (s = s0 + 1; s < send; s++) { if (LIKELY(UTF8_IS_CONTINUATION(*s))) { - if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) { - - /* The original implementors viewed this malformation as more - * serious than the others (though I, khw, don't understand - * why, since other malformations also give very very wrong - * results), so there is no way to turn off checking for it. - * Set a flag, but keep going in the loop, so that we absorb - * the rest of the bytes that comprise the character. */ - overflowed = TRUE; - overflow_byte = *s; /* Save for warning message's use */ - } uv = UTF8_ACCUMULATE(uv, *s); - } - else { - /* Here, found a non-continuation before processing all expected - * bytes. This byte begins a new character, so quit, even if - * allowing this malformation. */ - unexpected_non_continuation = TRUE; - break; - } + continue; + } + + /* Here, found a non-continuation before processing all expected bytes. + * This byte indicates the beginning of a new character, so quit, even + * if allowing this malformation. */ + possible_problems |= UTF8_GOT_NON_CONTINUATION; + break; } /* End of loop through the character's bytes */ /* Save how many bytes were actually in the character */ curlen = s - s0; - /* The loop above finds two types of malformations: non-continuation and/or - * overflow. The non-continuation malformation is really a too-short - * malformation, as it means that the current character ended before it was - * expected to (being terminated prematurely by the beginning of the next - * character, whereas in the too-short malformation there just are too few - * bytes available to hold the character. In both cases, the check below - * that we have found the expected number of bytes would fail if executed.) - * Thus the non-continuation malformation is really unnecessary, being a - * subset of the too-short malformation. But there may be existing - * applications that are expecting the non-continuation type, so we retain - * it, and return it in preference to the too-short malformation. (If this - * code were being written from scratch, the two types might be collapsed - * into one.) I, khw, am also giving priority to returning the - * non-continuation and too-short malformations over overflow when multiple - * ones are present. I don't know of any real reason to prefer one over - * the other, except that it seems to me that multiple-byte errors trumps - * errors from a single byte */ - if (UNLIKELY(unexpected_non_continuation)) { - if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) { - if (! (flags & UTF8_CHECK_ONLY)) { - if (curlen == 1) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0)); - } - else { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen)); - } - } - goto malformed; - } - uv = UNICODE_REPLACEMENT; - - /* Skip testing for overlongs, as the REPLACEMENT may not be the same - * as what the original expectations were. */ - do_overlong_test = FALSE; - if (retlen) { - *retlen = curlen; - } - } - else if (UNLIKELY(curlen < expectlen)) { - if (! (flags & UTF8_ALLOW_SHORT)) { - if (! (flags & UTF8_CHECK_ONLY)) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0)); - } - goto malformed; - } - uv = UNICODE_REPLACEMENT; - do_overlong_test = FALSE; - if (retlen) { - *retlen = curlen; - } - } + /* Note that there are two types of too-short malformation. One is when + * there is actual wrong data before the normal termination of the + * sequence. The other is that the sequence wasn't complete before the end + * of the data we are allowed to look at, based on the input 'curlen'. + * This means that we were passed data for a partial character, but it is + * valid as far as we saw. The other is definitely invalid. This + * distinction could be important to a caller, so the two types are kept + * separate. + * + * A convenience macro that matches either of the too-short conditions. */ +# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) + + if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { + uv_so_far = uv; + uv = UNICODE_REPLACEMENT; + } + + /* Check for overflow */ + if (UNLIKELY(does_utf8_overflow(s0, send))) { + possible_problems |= UTF8_GOT_OVERFLOW; + uv = UNICODE_REPLACEMENT; + } + + /* Check for overlong. If no problems so far, 'uv' is the correct code + * point value. Simply see if it is expressible in fewer bytes. Otherwise + * we must look at the UTF-8 byte sequence itself to see if it is for an + * overlong */ + if ( ( LIKELY(! possible_problems) + && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) + || ( UNLIKELY( possible_problems) + && ( UNLIKELY(! UTF8_IS_START(*s0)) + || ( curlen > 1 + && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0, + send - s0)))))) + { + possible_problems |= UTF8_GOT_LONG; + + if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { + UV min_uv = uv_so_far; + STRLEN i; + + /* Here, the input is both overlong and is missing some trailing + * bytes. There is no single code point it could be for, but there + * may be enough information present to determine if what we have + * so far is for an unallowed code point, such as for a surrogate. + * The code below has the intelligence to determine this, but just + * for non-overlong UTF-8 sequences. What we do here is calculate + * the smallest code point the input could represent if there were + * no too short malformation. Then we compute and save the UTF-8 + * for that, which is what the code below looks at instead of the + * raw input. It turns out that the smallest such code point is + * all we need. */ + for (i = curlen; i < expectlen; i++) { + min_uv = UTF8_ACCUMULATE(min_uv, + I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK)); + } - if (UNLIKELY(overflowed)) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0)); - goto malformed; + adjusted_s0 = temp_char_buf; + adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); + } } - if (do_overlong_test - && expectlen > (STRLEN) OFFUNISKIP(uv) - && ! (flags & UTF8_ALLOW_LONG)) - { - /* The overlong malformation has lower precedence than the others. - * Note that if this malformation is allowed, we return the actual - * value, instead of the replacement character. This is because this - * value is actually well-defined. */ - if (! (flags & UTF8_CHECK_ONLY)) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0)); - } - goto malformed; - } + /* Now check that the input isn't for a problematic code point not allowed + * by the input parameters. */ + /* isn't problematic if < this */ + if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST) + || ( UNLIKELY(possible_problems) - /* Here, the input is considered to be well-formed, but it still could be a - * problematic code point that is not allowed by the input parameters. */ - if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */ + /* if overflow, we know without looking further + * precisely which of the problematic types it is, + * and we deal with those in the overflow handling + * code */ + && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) + && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0))) && ((flags & ( UTF8_DISALLOW_NONCHAR |UTF8_DISALLOW_SURROGATE |UTF8_DISALLOW_SUPER @@ -852,153 +1270,456 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) |UTF8_WARN_SURROGATE |UTF8_WARN_SUPER |UTF8_WARN_ABOVE_31_BIT)) + /* In case of a malformation, 'uv' is not valid, and has + * been changed to something in the Unicode range. + * Currently we don't output a deprecation message if there + * is already a malformation, so we don't have to special + * case the test immediately below */ || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED)))) { - if (UNICODE_IS_SURROGATE(uv)) { - - /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary - * generation of the sv, since no warnings are raised under CHECK */ - if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE - && ckWARN_d(WARN_SURROGATE)) - { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv)); - pack_warn = packWARN(WARN_SURROGATE); - } - if (flags & UTF8_DISALLOW_SURROGATE) { - goto disallowed; - } - } - else if ((uv > PERL_UNICODE_MAX)) { - if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER - && ckWARN_d(WARN_NON_UNICODE)) - { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ - "Code point 0x%04"UVXf" is not Unicode, may not be portable", - uv)); - pack_warn = packWARN(WARN_NON_UNICODE); - } + /* If there were no malformations, or the only malformation is an + * overlong, 'uv' is valid */ + if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) { + if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { + possible_problems |= UTF8_GOT_SURROGATE; + } + else if (UNLIKELY(uv > PERL_UNICODE_MAX)) { + possible_problems |= UTF8_GOT_SUPER; + } + else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) { + possible_problems |= UTF8_GOT_NONCHAR; + } + } + else { /* Otherwise, need to look at the source UTF-8, possibly + adjusted to be non-overlong */ - /* The maximum code point ever specified by a standard was - * 2**31 - 1. Anything larger than that is a Perl extension that - * very well may not be understood by other applications (including - * earlier perl versions on EBCDIC platforms). We test for these - * after the regular SUPER ones, and before possibly bailing out, - * so that the slightly more dire warning will override the regular - * one. */ - if ( (flags & (UTF8_WARN_ABOVE_31_BIT - |UTF8_WARN_SUPER - |UTF8_DISALLOW_ABOVE_31_BIT)) - && UNLIKELY(is_utf8_cp_above_31_bits(s0, send))) + if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0) + >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) { - if ( ! (flags & UTF8_CHECK_ONLY) - && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER)) - && ckWARN_d(WARN_UTF8)) + possible_problems |= UTF8_GOT_SUPER; + } + else if (curlen > 1) { + if (UNLIKELY(IS_UTF8_2_BYTE_SUPER( + NATIVE_UTF8_TO_I8(*adjusted_s0), + NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))))) { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ - "Code point 0x%"UVXf" is not Unicode, and not portable", - uv)); - pack_warn = packWARN(WARN_UTF8); + possible_problems |= UTF8_GOT_SUPER; } - if (flags & UTF8_DISALLOW_ABOVE_31_BIT) { - goto disallowed; + else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE( + NATIVE_UTF8_TO_I8(*adjusted_s0), + NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))))) + { + possible_problems |= UTF8_GOT_SURROGATE; } } - if (flags & UTF8_DISALLOW_SUPER) { - goto disallowed; - } + /* We need a complete well-formed UTF-8 character to discern + * non-characters, so can't look for them here */ + } + } - /* The deprecated warning overrides any non-deprecated one */ - if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED)) - { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ cp_above_legal_max, - uv, MAX_NON_DEPRECATED_CP)); - pack_warn = packWARN(WARN_DEPRECATED); + ready_to_handle_errors: + + /* At this point: + * curlen contains the number of bytes in the sequence that + * this call should advance the input by. + * avail_len gives the available number of bytes passed in, but + * only if this is less than the expected number of + * bytes, based on the code point's start byte. + * possible_problems' is 0 if there weren't any problems; otherwise a bit + * is set in it for each potential problem found. + * uv contains the code point the input sequence + * represents; or if there is a problem that prevents + * a well-defined value from being computed, it is + * some subsitute value, typically the REPLACEMENT + * CHARACTER. + * s0 points to the first byte of the character + * send points to just after where that (potentially + * partial) character ends + * adjusted_s0 normally is the same as s0, but in case of an + * overlong for which the UTF-8 matters below, it is + * the first byte of the shortest form representation + * of the input. + * adjusted_send normally is the same as 'send', but if adjusted_s0 + * is set to something other than s0, this points one + * beyond its end + */ + + if (UNLIKELY(possible_problems)) { + bool disallowed = FALSE; + const U32 orig_problems = possible_problems; + + while (possible_problems) { /* Handle each possible problem */ + UV pack_warn = 0; + char * message = NULL; + + /* Each 'if' clause handles one problem. They are ordered so that + * the first ones' messages will be displayed before the later + * ones; this is kinda in decreasing severity order */ + if (possible_problems & UTF8_GOT_OVERFLOW) { + + /* Overflow means also got a super and above 31 bits, but we + * handle all three cases here */ + possible_problems + &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT); + *errors |= UTF8_GOT_OVERFLOW; + + /* But the API says we flag all errors found */ + if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) { + *errors |= UTF8_GOT_SUPER; + } + if (flags + & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) + { + *errors |= UTF8_GOT_ABOVE_31_BIT; + } + + /* Disallow if any of the three categories say to */ + if ( ! (flags & UTF8_ALLOW_OVERFLOW) + || (flags & ( UTF8_DISALLOW_SUPER + |UTF8_DISALLOW_ABOVE_31_BIT))) + { + disallowed = TRUE; + } + + + /* Likewise, warn if any say to, plus if deprecation warnings + * are on, because this code point is above IV_MAX */ + if ( ckWARN_d(WARN_DEPRECATED) + || ! (flags & UTF8_ALLOW_OVERFLOW) + || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT))) + { + + /* The warnings code explicitly says it doesn't handle the + * case of packWARN2 and two categories which have + * parent-child relationship. Even if it works now to + * raise the warning if either is enabled, it wouldn't + * necessarily do so in the future. We output (only) the + * most dire warning*/ + if (! (flags & UTF8_CHECK_ONLY)) { + if (ckWARN_d(WARN_UTF8)) { + pack_warn = packWARN(WARN_UTF8); + } + else if (ckWARN_d(WARN_NON_UNICODE)) { + pack_warn = packWARN(WARN_NON_UNICODE); + } + if (pack_warn) { + message = Perl_form(aTHX_ "%s: %s (overflows)", + malformed_text, + _byte_dump_string(s0, send - s0, 0)); + } + } + } } - } - else if (UNICODE_IS_NONCHAR(uv)) { - if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR - && ckWARN_d(WARN_NONCHAR)) - { - sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv)); - pack_warn = packWARN(WARN_NONCHAR); - } - if (flags & UTF8_DISALLOW_NONCHAR) { - goto disallowed; - } - } + else if (possible_problems & UTF8_GOT_EMPTY) { + possible_problems &= ~UTF8_GOT_EMPTY; + *errors |= UTF8_GOT_EMPTY; + + if (! (flags & UTF8_ALLOW_EMPTY)) { + + /* This so-called malformation is now treated as a bug in + * the caller. If you have nothing to decode, skip calling + * this function */ + assert(0); + + disallowed = TRUE; + if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + pack_warn = packWARN(WARN_UTF8); + message = Perl_form(aTHX_ "%s (empty string)", + malformed_text); + } + } + } + else if (possible_problems & UTF8_GOT_CONTINUATION) { + possible_problems &= ~UTF8_GOT_CONTINUATION; + *errors |= UTF8_GOT_CONTINUATION; + + if (! (flags & UTF8_ALLOW_CONTINUATION)) { + disallowed = TRUE; + if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + pack_warn = packWARN(WARN_UTF8); + message = Perl_form(aTHX_ + "%s: %s (unexpected continuation byte 0x%02x," + " with no preceding start byte)", + malformed_text, + _byte_dump_string(s0, 1, 0), *s0); + } + } + } + else if (possible_problems & UTF8_GOT_SHORT) { + possible_problems &= ~UTF8_GOT_SHORT; + *errors |= UTF8_GOT_SHORT; + + if (! (flags & UTF8_ALLOW_SHORT)) { + disallowed = TRUE; + if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + pack_warn = packWARN(WARN_UTF8); + message = Perl_form(aTHX_ + "%s: %s (too short; %d byte%s available, need %d)", + malformed_text, + _byte_dump_string(s0, send - s0, 0), + (int)avail_len, + avail_len == 1 ? "" : "s", + (int)expectlen); + } + } - if (sv) { - outlier_ret = uv; /* Note we don't bother to convert to native, - as all the outlier code points are the same - in both ASCII and EBCDIC */ - goto do_warn; - } + } + else if (possible_problems & UTF8_GOT_NON_CONTINUATION) { + possible_problems &= ~UTF8_GOT_NON_CONTINUATION; + *errors |= UTF8_GOT_NON_CONTINUATION; + + if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { + disallowed = TRUE; + if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + + /* If we don't know for sure that the input length is + * valid, avoid as much as possible reading past the + * end of the buffer */ + int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN) + ? s - s0 + : send - s0; + pack_warn = packWARN(WARN_UTF8); + message = Perl_form(aTHX_ "%s", + unexpected_non_continuation_text(s0, + printlen, + s - s0, + (int) expectlen)); + } + } + } + else if (possible_problems & UTF8_GOT_LONG) { + possible_problems &= ~UTF8_GOT_LONG; + *errors |= UTF8_GOT_LONG; - /* Here, this is not considered a malformed character, so drop through - * to return it */ - } + if (flags & UTF8_ALLOW_LONG) { - return UNI_TO_NATIVE(uv); + /* We don't allow the actual overlong value, unless the + * special extra bit is also set */ + if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE + & ~UTF8_ALLOW_LONG))) + { + uv = UNICODE_REPLACEMENT; + } + } + else { + disallowed = TRUE; + + if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + pack_warn = packWARN(WARN_UTF8); + + /* These error types cause 'uv' to be something that + * isn't what was intended, so can't use it in the + * message. The other error types either can't + * generate an overlong, or else the 'uv' is valid */ + if (orig_problems & + (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) + { + message = Perl_form(aTHX_ + "%s: %s (any UTF-8 sequence that starts" + " with \"%s\" is overlong which can and" + " should be represented with a" + " different, shorter sequence)", + malformed_text, + _byte_dump_string(s0, send - s0, 0), + _byte_dump_string(s0, curlen, 0)); + } + else { + U8 tmpbuf[UTF8_MAXBYTES+1]; + const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf, + uv, 0); + message = Perl_form(aTHX_ + "%s: %s (overlong; instead use %s to represent" + " U+%0*" UVXf ")", + malformed_text, + _byte_dump_string(s0, send - s0, 0), + _byte_dump_string(tmpbuf, e - tmpbuf, 0), + ((uv < 256) ? 2 : 4), /* Field width of 2 for + small code points */ + uv); + } + } + } + } + else if (possible_problems & UTF8_GOT_SURROGATE) { + possible_problems &= ~UTF8_GOT_SURROGATE; - /* There are three cases which get to beyond this point. In all 3 cases: - * if not null points to a string to print as a warning. - * is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't - * set. - * is what return value to use if UTF8_CHECK_ONLY isn't set. - * This is done by initializing it to 0, and changing it only - * for case 1). - * The 3 cases are: - * 1) The input is valid but problematic, and to be warned about. The - * return value is the resultant code point; <*retlen> is set to - * , the number of bytes that comprise the code point. - * contains the result of packWARN() for the warning - * types. The entry point for this case is the label ; - * 2) The input is a valid code point but disallowed by the parameters to - * this function. The return value is 0. If UTF8_CHECK_ONLY is set, - * <*relen> is -1; otherwise it is , the number of bytes that - * comprise the code point. contains the result of - * packWARN() for the warning types. The entry point for this case is - * the label . - * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY - * is set, <*relen> is -1; otherwise it is , the number of - * bytes that comprise the malformation. All such malformations are - * assumed to be warning type . The entry point for this case - * is the label . - */ + if (flags & UTF8_WARN_SURROGATE) { + *errors |= UTF8_GOT_SURROGATE; + + if ( ! (flags & UTF8_CHECK_ONLY) + && ckWARN_d(WARN_SURROGATE)) + { + pack_warn = packWARN(WARN_SURROGATE); + + /* These are the only errors that can occur with a + * surrogate when the 'uv' isn't valid */ + if (orig_problems & UTF8_GOT_TOO_SHORT) { + message = Perl_form(aTHX_ + "UTF-16 surrogate (any UTF-8 sequence that" + " starts with \"%s\" is for a surrogate)", + _byte_dump_string(s0, curlen, 0)); + } + else { + message = Perl_form(aTHX_ + "UTF-16 surrogate U+%04" UVXf, uv); + } + } + } + + if (flags & UTF8_DISALLOW_SURROGATE) { + disallowed = TRUE; + *errors |= UTF8_GOT_SURROGATE; + } + } + else if (possible_problems & UTF8_GOT_SUPER) { + possible_problems &= ~UTF8_GOT_SUPER; + + if (flags & UTF8_WARN_SUPER) { + *errors |= UTF8_GOT_SUPER; + + if ( ! (flags & UTF8_CHECK_ONLY) + && ckWARN_d(WARN_NON_UNICODE)) + { + pack_warn = packWARN(WARN_NON_UNICODE); + + if (orig_problems & UTF8_GOT_TOO_SHORT) { + message = Perl_form(aTHX_ + "Any UTF-8 sequence that starts with" + " \"%s\" is for a non-Unicode code point," + " may not be portable", + _byte_dump_string(s0, curlen, 0)); + } + else { + message = Perl_form(aTHX_ + "Code point 0x%04" UVXf " is not" + " Unicode, may not be portable", + uv); + } + } + } + + /* The maximum code point ever specified by a standard was + * 2**31 - 1. Anything larger than that is a Perl extension + * that very well may not be understood by other applications + * (including earlier perl versions on EBCDIC platforms). We + * test for these after the regular SUPER ones, and before + * possibly bailing out, so that the slightly more dire warning + * will override the regular one. */ + if ( (flags & (UTF8_WARN_ABOVE_31_BIT + |UTF8_WARN_SUPER + |UTF8_DISALLOW_ABOVE_31_BIT)) + && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT) + && UNLIKELY(is_utf8_cp_above_31_bits( + adjusted_s0, + adjusted_send))) + || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT)) + && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv))))) + { + if ( ! (flags & UTF8_CHECK_ONLY) + && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER)) + && ckWARN_d(WARN_UTF8)) + { + pack_warn = packWARN(WARN_UTF8); + + if (orig_problems & UTF8_GOT_TOO_SHORT) { + message = Perl_form(aTHX_ + "Any UTF-8 sequence that starts with" + " \"%s\" is for a non-Unicode code" + " point, and is not portable", + _byte_dump_string(s0, curlen, 0)); + } + else { + message = Perl_form(aTHX_ + "Code point 0x%" UVXf " is not Unicode," + " and not portable", + uv); + } + } + + if (flags & ( UTF8_WARN_ABOVE_31_BIT + |UTF8_DISALLOW_ABOVE_31_BIT)) + { + *errors |= UTF8_GOT_ABOVE_31_BIT; - malformed: + if (flags & UTF8_DISALLOW_ABOVE_31_BIT) { + disallowed = TRUE; + } + } + } - if (sv && ckWARN_d(WARN_UTF8)) { - pack_warn = packWARN(WARN_UTF8); - } + if (flags & UTF8_DISALLOW_SUPER) { + *errors |= UTF8_GOT_SUPER; + disallowed = TRUE; + } - disallowed: + /* The deprecated warning overrides any non-deprecated one. If + * there are other problems, a deprecation message is not + * really helpful, so don't bother to raise it in that case. + * This also keeps the code from having to handle the case + * where 'uv' is not valid. */ + if ( ! (orig_problems + & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) + && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, + MAX_NON_DEPRECATED_CP); + } + } + else if (possible_problems & UTF8_GOT_NONCHAR) { + possible_problems &= ~UTF8_GOT_NONCHAR; - if (flags & UTF8_CHECK_ONLY) { - if (retlen) - *retlen = ((STRLEN) -1); - return 0; - } + if (flags & UTF8_WARN_NONCHAR) { + *errors |= UTF8_GOT_NONCHAR; - do_warn: + if ( ! (flags & UTF8_CHECK_ONLY) + && ckWARN_d(WARN_NONCHAR)) + { + /* The code above should have guaranteed that we don't + * get here with errors other than overlong */ + assert (! (orig_problems + & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR))); + + pack_warn = packWARN(WARN_NONCHAR); + message = Perl_form(aTHX_ "Unicode non-character" + " U+%04" UVXf " is not recommended" + " for open interchange", uv); + } + } - if (pack_warn) { /* was initialized to 0, and changed only - if warnings are to be raised. */ - const char * const string = SvPVX_const(sv); + if (flags & UTF8_DISALLOW_NONCHAR) { + disallowed = TRUE; + *errors |= UTF8_GOT_NONCHAR; + } + } /* End of looking through the possible flags */ + + /* Display the message (if any) for the problem being handled in + * this iteration of the loop */ + if (message) { + if (PL_op) + Perl_warner(aTHX_ pack_warn, "%s in %s", message, + OP_DESC(PL_op)); + else + Perl_warner(aTHX_ pack_warn, "%s", message); + } + } /* End of 'while (possible_problems)' */ - if (PL_op) - Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op)); - else - Perl_warner(aTHX_ pack_warn, "%s", string); - } + /* Since there was a possible problem, the returned length may need to + * be changed from the one stored at the beginning of this function. + * Instead of trying to figure out if that's needed, just do it. */ + if (retlen) { + *retlen = curlen; + } - if (retlen) { - *retlen = curlen; + if (disallowed) { + if (flags & UTF8_CHECK_ONLY && retlen) { + *retlen = ((STRLEN) -1); + } + return 0; + } } - return outlier_ret; + return UNI_TO_NATIVE(uv); } /* @@ -1030,10 +1751,12 @@ Also implemented as a macro in utf8.h UV Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) { + PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; + assert(s < send); return utf8n_to_uvchr(s, send - s, retlen, - ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* This is marked as deprecated @@ -1149,14 +1872,12 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) if (UTF8_IS_CONTINUATION(c1)) { c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1); } else { + /* diag_listed_as: Malformed UTF-8 character%s */ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), - "Malformed UTF-8 character " - "(unexpected non-continuation byte 0x%02x" - ", immediately after start byte 0x%02x)" - /* Dear diag.t, it's in the pod. */ - "%s%s", c1, c, - PL_op ? " in " : "", - PL_op ? OP_DESC(PL_op) : ""); + "%s %s%s", + unexpected_non_continuation_text(u - 1, 2, 1, 2), + PL_op ? " in " : "", + PL_op ? OP_DESC(PL_op) : ""); return -2; } } else { @@ -1236,13 +1957,20 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) /* =for apidoc bytes_from_utf8 -Converts a string C of length C from UTF-8 into native byte encoding. -Unlike L but like L, returns a pointer to -the newly-created string, and updates C to contain the new -length. Returns the original string if no conversion occurs, C -is unchanged. Do nothing if C points to 0. Sets C to -0 if C is converted or consisted entirely of characters that are invariant -in UTF-8 (i.e., US-ASCII on non-EBCDIC machines). +Converts a potentially UTF-8 encoded string C of length C into native +byte encoding. On input, the boolean C<*is_utf8> gives whether or not C is +actually encoded in UTF-8. + +Unlike L but like L, this is non-destructive of +the input string. + +Do nothing if C<*is_utf8> is 0, or if there are code points in the string +not expressible in native byte encoding. In these cases, C<*is_utf8> and +C<*len> are unchanged, and the return value is the original C. + +Otherwise, C<*is_utf8> is set to 0, and the return value is a pointer to a +newly created string containing a downgraded copy of C, and whose length is +returned in C<*len>, updated. =cut */ @@ -1253,7 +1981,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) U8 *d; const U8 *start = s; const U8 *send; - I32 count = 0; + Size_t count = 0; PERL_ARGS_ASSERT_BYTES_FROM_UTF8; PERL_UNUSED_CONTEXT; @@ -1347,7 +2075,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) PERL_ARGS_ASSERT_UTF16_TO_UTF8; if (bytelen & 1) - Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); + Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen); pend = p + bytelen; @@ -1421,7 +2149,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; if (bytelen & 1) - Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf, + Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf, (UV)bytelen); while (s < send) { @@ -1438,7 +2166,7 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(classnum, tmpbuf); + return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf)); } /* Internal function so we can deprecate the external one, and call @@ -1459,7 +2187,7 @@ Perl__is_uni_perl_idcont(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idcont(tmpbuf); + return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); } bool @@ -1467,7 +2195,7 @@ Perl__is_uni_perl_idstart(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idstart(tmpbuf); + return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); } UV @@ -1577,7 +2305,7 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) } STATIC U8 -S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp) +S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy) { /* We have the latin1-range values compiled into the core, so just use * those, converting the result to UTF-8. Since the result is always just @@ -1585,6 +2313,8 @@ S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp) U8 converted = toLOWER_LATIN1(c); + PERL_UNUSED_ARG(dummy); + if (p != NULL) { if (NATIVE_BYTE_IS_INVARIANT(converted)) { *p = converted; @@ -1607,7 +2337,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) PERL_ARGS_ASSERT_TO_UNI_LOWER; if (c < 256) { - return to_lower_latin1((U8) c, p, lenp); + return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ ); } uvchr_to_utf8(p, c); @@ -1710,13 +2440,13 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) uvchr_to_utf8(p, c); return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL); } - else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with + else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with the special flags. */ U8 utf8_c[UTF8_MAXBYTES + 1]; needs_full_generality: uvchr_to_utf8(utf8_c, c); - return _to_utf8_fold_flags(utf8_c, p, lenp, flags); + return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c), p, lenp, flags); } } @@ -1745,16 +2475,46 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, * character without reading beyond the end, and pass that number on to the * validating routine */ if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) { - if (ckWARN_d(WARN_UTF8)) { - Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8), - "Passing malformed UTF-8 to \"%s\" is deprecated", swashname); - if (ckWARN(WARN_UTF8)) { /* This will output details as to the - what the malformation is */ - utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL); - } - } - return FALSE; + _force_out_malformed_utf8_message(p, p + UTF8SKIP(p), + _UTF8_NO_CONFIDENCE_IN_CURLEN, + 1 /* Die */ ); + NOT_REACHED; /* NOTREACHED */ } + + if (!*swash) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + *swash = _core_swash_init("utf8", + + /* Only use the name if there is no inversion + * list; otherwise will go out to disk */ + (invlist) ? "" : swashname, + + &PL_sv_undef, 1, 0, invlist, &flags); + } + + return swash_fetch(*swash, p, TRUE) != 0; +} + +PERL_STATIC_INLINE bool +S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, SV **swash, + const char *const swashname, SV* const invlist) +{ + /* returns a boolean giving whether or not the UTF8-encoded character that + * starts at

, and extending no further than is in the swash + * indicated by . contains a pointer to where the swash + * indicated by is to be stored; which this routine will do, so + * that future calls will look at <*swash> and only generate a swash if it + * is not null. is NULL or an inversion list that defines the + * swash. If not null, it saves time during initialization of the swash. + */ + + PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN; + + if (! isUTF8_CHAR(p, e)) { + _force_out_malformed_utf8_message(p, e, 0, 1); + NOT_REACHED; /* NOTREACHED */ + } + if (!*swash) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; *swash = _core_swash_init("utf8", @@ -1769,30 +2529,166 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, return swash_fetch(*swash, p, TRUE) != 0; } +STATIC void +S_warn_on_first_deprecated_use(pTHX_ const char * const name, + const char * const alternative, + const bool use_locale, + const char * const file, + const unsigned line) +{ + const char * key; + + PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE; + + if (ckWARN_d(WARN_DEPRECATED)) { + + key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line); + if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) { + if (! PL_seen_deprecated_macro) { + PL_seen_deprecated_macro = newHV(); + } + if (! hv_store(PL_seen_deprecated_macro, key, + strlen(key), &PL_sv_undef, 0)) + { + Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); + } + + if (instr(file, "mathoms.c")) { + Perl_warner(aTHX_ WARN_DEPRECATED, + "In %s, line %d, starting in Perl v5.30, %s()" + " will be removed. Avoid this message by" + " converting to use %s().\n", + file, line, name, alternative); + } + else { + Perl_warner(aTHX_ WARN_DEPRECATED, + "In %s, line %d, starting in Perl v5.30, %s() will" + " require an additional parameter. Avoid this" + " message by converting to use %s().\n", + file, line, name, alternative); + } + } + } +} + bool -Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) +Perl__is_utf8_FOO(pTHX_ U8 classnum, + const U8 *p, + const char * const name, + const char * const alternative, + const bool use_utf8, + const bool use_locale, + const char * const file, + const unsigned line) { PERL_ARGS_ASSERT__IS_UTF8_FOO; + warn_on_first_deprecated_use(name, alternative, use_locale, file, line); + + if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) { + + switch (classnum) { + case _CC_WORDCHAR: + case _CC_DIGIT: + case _CC_ALPHA: + case _CC_LOWER: + case _CC_UPPER: + case _CC_PUNCT: + case _CC_PRINT: + case _CC_ALPHANUMERIC: + case _CC_GRAPH: + case _CC_CASED: + + return is_utf8_common(p, + &PL_utf8_swash_ptrs[classnum], + swash_property_names[classnum], + PL_XPosix_ptrs[classnum]); + + case _CC_SPACE: + return is_XPERLSPACE_high(p); + case _CC_BLANK: + return is_HORIZWS_high(p); + case _CC_XDIGIT: + return is_XDIGIT_high(p); + case _CC_CNTRL: + return 0; + case _CC_ASCII: + return 0; + case _CC_VERTSPACE: + return is_VERTWS_high(p); + case _CC_IDFIRST: + if (! PL_utf8_perl_idstart) { + PL_utf8_perl_idstart + = _new_invlist_C_array(_Perl_IDStart_invlist); + } + return is_utf8_common(p, &PL_utf8_perl_idstart, + "_Perl_IDStart", NULL); + case _CC_IDCONT: + if (! PL_utf8_perl_idcont) { + PL_utf8_perl_idcont + = _new_invlist_C_array(_Perl_IDCont_invlist); + } + return is_utf8_common(p, &PL_utf8_perl_idcont, + "_Perl_IDCont", NULL); + } + } + + /* idcont is the same as wordchar below 256 */ + if (classnum == _CC_IDCONT) { + classnum = _CC_WORDCHAR; + } + else if (classnum == _CC_IDFIRST) { + if (*p == '_') { + return TRUE; + } + classnum = _CC_ALPHA; + } + + if (! use_locale) { + if (! use_utf8 || UTF8_IS_INVARIANT(*p)) { + return _generic_isCC(*p, classnum); + } + + return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum); + } + else { + if (! use_utf8 || UTF8_IS_INVARIANT(*p)) { + return isFOO_lc(classnum, *p); + } + + return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 ))); + } + + NOT_REACHED; /* NOTREACHED */ +} + +bool +Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p, + const U8 * const e) +{ + PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN; + assert(classnum < _FIRST_NON_SWASH_CC); - return is_utf8_common(p, - &PL_utf8_swash_ptrs[classnum], - swash_property_names[classnum], - PL_XPosix_ptrs[classnum]); + return is_utf8_common_with_len(p, + e, + &PL_utf8_swash_ptrs[classnum], + swash_property_names[classnum], + PL_XPosix_ptrs[classnum]); } bool -Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) +Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e) { SV* invlist = NULL; - PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN; if (! PL_utf8_perl_idstart) { invlist = _new_invlist_C_array(_Perl_IDStart_invlist); } - return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist); + return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart, + "_Perl_IDStart", invlist); } bool @@ -1806,16 +2702,17 @@ Perl__is_utf8_xidstart(pTHX_ const U8 *p) } bool -Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) +Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e) { SV* invlist = NULL; - PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN; if (! PL_utf8_perl_idcont) { invlist = _new_invlist_C_array(_Perl_IDCont_invlist); } - return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist); + return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont, + "_Perl_IDCont", invlist); } bool @@ -1842,50 +2739,6 @@ Perl__is_utf8_mark(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL); } -/* -=for apidoc to_utf8_case - -Instead use the appropriate one of L, -L, -L, -or L. - -C

contains the pointer to the UTF-8 string encoding -the character that is being converted. This routine assumes that the character -at C

is well-formed. - -C is a pointer to the character buffer to put the -conversion result to. C is a pointer to the length -of the result. - -C is a pointer to the swash to use. - -Both the special and normal mappings are stored in F, -and loaded by C, using F. C (usually, -but not always, a multicharacter mapping), is tried first. - -C is a string, normally C or C<"">. C means to not use -any special mappings; C<""> means to use the special mappings. Values other -than these two are treated as the name of the hash containing the special -mappings, like C<"utf8::ToSpecLower">. - -C is a string like C<"ToLower"> which means the swash -C<%utf8::ToLower>. - -Code points above the platform's C will raise a deprecation warning, -unless those are turned off. - -=cut */ - -UV -Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, - SV **swashp, const char *normal, const char *special) -{ - PERL_ARGS_ASSERT_TO_UTF8_CASE; - - return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special); -} - /* change namve uv1 to 'from' */ STATIC UV S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, @@ -1947,7 +2800,7 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, if (ckWARN_d(WARN_SURROGATE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; Perl_warner(aTHX_ packWARN(WARN_SURROGATE), - "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1); + "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04" UVXf, desc, uv1); } goto cases_to_self; } @@ -1960,16 +2813,14 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, } if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { - if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv1, + MAX_NON_DEPRECATED_CP); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1); + "Operation \"%s\" returns its argument for non-Unicode code point 0x%04" UVXf, desc, uv1); } goto cases_to_self; } @@ -2107,8 +2958,8 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), - "Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; " - "resolved to \"\\x{%"UVXf"}\".", + "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8 locale; " + "resolved to \"\\x{%" UVXf "}\".", OP_DESC(PL_op), original, original); @@ -2116,10 +2967,177 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c return original; } +STATIC U32 +S_check_and_deprecate(pTHX_ const U8 *p, + const U8 **e, + const unsigned int type, /* See below */ + const bool use_locale, /* Is this a 'LC_' + macro call? */ + const char * const file, + const unsigned line) +{ + /* This is a temporary function to deprecate the unsafe calls to the case + * changing macros and functions. It keeps all the special stuff in just + * one place. + * + * It updates *e with the pointer to the end of the input string. If using + * the old-style macros, *e is NULL on input, and so this function assumes + * the input string is long enough to hold the entire UTF-8 sequence, and + * sets *e accordingly, but it then returns a flag to pass the + * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid + * using the full length if possible. + * + * It also does the assert that *e > p when *e is not NULL. This should be + * migrated to the callers when this function gets deleted. + * + * The 'type' parameter is used for the caller to specify which case + * changing function this is called from: */ + +# define DEPRECATE_TO_UPPER 0 +# define DEPRECATE_TO_TITLE 1 +# define DEPRECATE_TO_LOWER 2 +# define DEPRECATE_TO_FOLD 3 + + U32 utf8n_flags = 0; + const char * name; + const char * alternative; + + PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE; + + if (*e == NULL) { + utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN; + *e = p + UTF8SKIP(p); + + /* For mathoms.c calls, we use the function name we know is stored + * there. It could be part of a larger path */ + if (type == DEPRECATE_TO_UPPER) { + name = instr(file, "mathoms.c") + ? "to_utf8_upper" + : "toUPPER_utf8"; + alternative = "toUPPER_utf8_safe"; + } + else if (type == DEPRECATE_TO_TITLE) { + name = instr(file, "mathoms.c") + ? "to_utf8_title" + : "toTITLE_utf8"; + alternative = "toTITLE_utf8_safe"; + } + else if (type == DEPRECATE_TO_LOWER) { + name = instr(file, "mathoms.c") + ? "to_utf8_lower" + : "toLOWER_utf8"; + alternative = "toLOWER_utf8_safe"; + } + else if (type == DEPRECATE_TO_FOLD) { + name = instr(file, "mathoms.c") + ? "to_utf8_fold" + : "toFOLD_utf8"; + alternative = "toFOLD_utf8_safe"; + } + else Perl_croak(aTHX_ "panic: Unexpected case change type"); + + warn_on_first_deprecated_use(name, alternative, use_locale, file, line); + } + else { + assert (p < *e); + } + + return utf8n_flags; +} + +/* The process for changing the case is essentially the same for the four case + * change types, except there are complications for folding. Otherwise the + * difference is only which case to change to. To make sure that they all do + * the same thing, the bodies of the functions are extracted out into the + * following two macros. The functions are written with the same variable + * names, and these are known and used inside these macros. It would be + * better, of course, to have inline functions to do it, but since different + * macros are called, depending on which case is being changed to, this is not + * feasible in C (to khw's knowledge). Two macros are created so that the fold + * function can start with the common start macro, then finish with its special + * handling; while the other three cases can just use the common end macro. + * + * The algorithm is to use the proper (passed in) macro or function to change + * the case for code points that are below 256. The macro is used if using + * locale rules for the case change; the function if not. If the code point is + * above 255, it is computed from the input UTF-8, and another macro is called + * to do the conversion. If necessary, the output is converted to UTF-8. If + * using a locale, we have to check that the change did not cross the 255/256 + * boundary, see check_locale_boundary_crossing() for further details. + * + * The macros are split with the correct case change for the below-256 case + * stored into 'result', and in the middle of an else clause for the above-255 + * case. At that point in the 'else', 'result' is not the final result, but is + * the input code point calculated from the UTF-8. The fold code needs to + * realize all this and take it from there. + * + * If you read the two macros as sequential, it's easier to understand what's + * going on. */ +#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \ + L1_func_extra_param) \ + \ + if (flags & (locale_flags)) { \ + /* Treat a UTF-8 locale as not being in locale at all */ \ + if (IN_UTF8_CTYPE_LOCALE) { \ + flags &= ~(locale_flags); \ + } \ + else { \ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ + } \ + } \ + \ + if (UTF8_IS_INVARIANT(*p)) { \ + if (flags & (locale_flags)) { \ + result = LC_L1_change_macro(*p); \ + } \ + else { \ + return L1_func(*p, ustrp, lenp, L1_func_extra_param); \ + } \ + } \ + else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \ + if (flags & (locale_flags)) { \ + result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \ + *(p+1))); \ + } \ + else { \ + return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \ + ustrp, lenp, L1_func_extra_param); \ + } \ + } \ + else { /* malformed UTF-8 or ord above 255 */ \ + STRLEN len_result; \ + result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \ + if (len_result == (STRLEN) -1) { \ + _force_out_malformed_utf8_message(p, e, utf8n_flags, \ + 1 /* Die */ ); \ + } + +#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \ + result = change_macro(result, p, ustrp, lenp); \ + \ + if (flags & (locale_flags)) { \ + result = check_locale_boundary_crossing(p, result, ustrp, lenp); \ + } \ + return result; \ + } \ + \ + /* Here, used locale rules. Convert back to UTF-8 */ \ + if (UTF8_IS_INVARIANT(result)) { \ + *ustrp = (U8) result; \ + *lenp = 1; \ + } \ + else { \ + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \ + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \ + *lenp = 2; \ + } \ + \ + return result; + /* =for apidoc to_utf8_upper -Instead use L. +Instead use L. =cut */ @@ -2128,67 +3146,30 @@ Instead use L. * be used. */ UV -Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) +Perl__to_utf8_upper_flags(pTHX_ const U8 *p, + const U8 *e, + U8* ustrp, + STRLEN *lenp, + bool flags, + const char * const file, + const int line) { UV result; + const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER, + cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; - if (flags) { - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; - } - else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - } - } - - if (UTF8_IS_INVARIANT(*p)) { - if (flags) { - result = toUPPER_LC(*p); - } - else { - return _to_upper_title_latin1(*p, ustrp, lenp, 'S'); - } - } - else if UTF8_IS_DOWNGRADEABLE_START(*p) { - if (flags) { - U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); - result = toUPPER_LC(c); - } - else { - return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), - ustrp, lenp, 'S'); - } - } - else { /* UTF-8, ord above 255 */ - result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); - - if (flags) { - result = check_locale_boundary_crossing(p, result, ustrp, lenp); - } - return result; - } - - /* Here, used locale rules. Convert back to UTF-8 */ - if (UTF8_IS_INVARIANT(result)) { - *ustrp = (U8) result; - *lenp = 1; - } - else { - *ustrp = UTF8_EIGHT_BIT_HI((U8) result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); - *lenp = 2; - } - - return result; + /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */ + /* 2nd char of uc(U+DF) is 'S' */ + CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S'); + CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE); } /* =for apidoc to_utf8_title -Instead use L. +Instead use L. =cut */ @@ -2199,67 +3180,29 @@ Instead use L. */ UV -Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) +Perl__to_utf8_title_flags(pTHX_ const U8 *p, + const U8 *e, + U8* ustrp, + STRLEN *lenp, + bool flags, + const char * const file, + const int line) { UV result; + const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE, + cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; - if (flags) { - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; - } - else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - } - } - - if (UTF8_IS_INVARIANT(*p)) { - if (flags) { - result = toUPPER_LC(*p); - } - else { - return _to_upper_title_latin1(*p, ustrp, lenp, 's'); - } - } - else if UTF8_IS_DOWNGRADEABLE_START(*p) { - if (flags) { - U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); - result = toUPPER_LC(c); - } - else { - return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), - ustrp, lenp, 's'); - } - } - else { /* UTF-8, ord above 255 */ - result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); - - if (flags) { - result = check_locale_boundary_crossing(p, result, ustrp, lenp); - } - return result; - } - - /* Here, used locale rules. Convert back to UTF-8 */ - if (UTF8_IS_INVARIANT(result)) { - *ustrp = (U8) result; - *lenp = 1; - } - else { - *ustrp = UTF8_EIGHT_BIT_HI((U8) result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); - *lenp = 2; - } - - return result; + /* 2nd char of ucfirst(U+DF) is 's' */ + CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's'); + CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE); } /* =for apidoc to_utf8_lower -Instead use L. +Instead use L. =cut */ @@ -2269,68 +3212,28 @@ Instead use L. */ UV -Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) +Perl__to_utf8_lower_flags(pTHX_ const U8 *p, + const U8 *e, + U8* ustrp, + STRLEN *lenp, + bool flags, + const char * const file, + const int line) { UV result; + const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER, + cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; - if (flags) { - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; - } - else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - } - } - - if (UTF8_IS_INVARIANT(*p)) { - if (flags) { - result = toLOWER_LC(*p); - } - else { - return to_lower_latin1(*p, ustrp, lenp); - } - } - else if UTF8_IS_DOWNGRADEABLE_START(*p) { - if (flags) { - U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); - result = toLOWER_LC(c); - } - else { - return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), - ustrp, lenp); - } - } - else { /* UTF-8, ord above 255 */ - result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); - - if (flags) { - result = check_locale_boundary_crossing(p, result, ustrp, lenp); - } - - return result; - } - - /* Here, used locale rules. Convert back to UTF-8 */ - if (UTF8_IS_INVARIANT(result)) { - *ustrp = (U8) result; - *lenp = 1; - } - else { - *ustrp = UTF8_EIGHT_BIT_HI((U8) result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); - *lenp = 2; - } - - return result; + CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */) + CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE) } /* =for apidoc to_utf8_fold -Instead use L. +Instead use L. =cut */ @@ -2345,9 +3248,17 @@ Instead use L. */ UV -Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) +Perl__to_utf8_fold_flags(pTHX_ const U8 *p, + const U8 *e, + U8* ustrp, + STRLEN *lenp, + U8 flags, + const char * const file, + const int line) { UV result; + const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD, + cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; @@ -2356,38 +3267,10 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) assert(p != ustrp); /* Otherwise overwrites */ - if (flags & FOLD_FLAGS_LOCALE) { - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags &= ~FOLD_FLAGS_LOCALE; - } - else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - } - } + CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1, + ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII))); - if (UTF8_IS_INVARIANT(*p)) { - if (flags & FOLD_FLAGS_LOCALE) { - result = toFOLD_LC(*p); - } - else { - return _to_fold_latin1(*p, ustrp, lenp, - flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); - } - } - else if UTF8_IS_DOWNGRADEABLE_START(*p) { - if (flags & FOLD_FLAGS_LOCALE) { - U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); - result = toFOLD_LC(c); - } - else { - return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), - ustrp, lenp, - flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); - } - } - else { /* UTF-8, ord above 255 */ - result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL); + result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL); if (flags & FOLD_FLAGS_LOCALE) { @@ -2722,7 +3605,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m CORE_SWASH_INIT_RETURN(NULL); } Perl_croak(aTHX_ - "Can't find Unicode property definition \"%"SVf"\"", + "Can't find Unicode property definition \"%" SVf "\"", SVfARG(retval)); NOT_REACHED; /* NOTREACHED */ } @@ -2782,6 +3665,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m /* Add the passed-in inversion list, which invalidates the one * already stored in the swash */ invlist_in_swash_is_valid = FALSE; + SvREADONLY_off(swash_invlist); /* Turned on again below */ _invlist_union(invlist, swash_invlist, &swash_invlist); } else { @@ -2811,6 +3695,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m else SvREFCNT_inc_simple_void_NN(swash_invlist); } + /* The result is immutable. Forbid attempts to change it. */ SvREADONLY_on(swash_invlist); /* Use the inversion list stand-alone if small enough */ @@ -2991,7 +3876,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents) Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, " - "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf, + "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf, svp, tmps, (UV)slen, (UV)needents); } @@ -3024,7 +3909,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) ((UV) tmps[off + 3]); } Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, " - "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents); + "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents); NORETURN_FUNCTION_END; } @@ -3181,7 +4066,7 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) PERL_ARGS_ASSERT_SWATCH_GET; if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { - Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf, + Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf, (UV)bits); } @@ -3360,7 +4245,7 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) otherbits = (STRLEN)SvUV(*otherbitssvp); if (bits < otherbits) Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, " - "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits); + "bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits); /* The "other" swatch must be destroyed after. */ other = swatch_get(*othersvp, start, span); @@ -3373,7 +4258,7 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) if (bits == 1 && otherbits == 1) { if (slen != olen) Perl_croak(aTHX_ "panic: swatch_get found swatch length " - "mismatch, slen=%"UVuf", olen=%"UVuf, + "mismatch, slen=%" UVuf ", olen=%" UVuf, (UV)slen, (UV)olen); switch (opc) { @@ -3535,7 +4420,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* Must have at least 8 bits to get the mappings */ if (bits != 8 && bits != 16 && bits != 32) { - Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf, + Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %" UVuf, (UV)bits); } @@ -3564,7 +4449,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) "unexpectedly is not a string, flags=%lu", (unsigned long)SvFLAGS(sv_to)); } - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/ + /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/ /* Each key in the inverse list is a mapped-to value, and the key's * hash value is a list of the strings (each in UTF-8) that map to @@ -3610,12 +4495,12 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) while ((from_list = (AV *) hv_iternextsv(specials_inverse, &char_to, &to_len))) { - if (av_tindex_nomg(from_list) > 0) { + if (av_tindex_skip_len_mg(from_list) > 0) { SSize_t i; /* We iterate over all combinations of i,j to place each code * point on each list */ - for (i = 0; i <= av_tindex_nomg(from_list); i++) { + for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) { SSize_t j; AV* i_list = newAV(); SV** entryp = av_fetch(from_list, i, FALSE); @@ -3632,7 +4517,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) } /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ - for (j = 0; j <= av_tindex_nomg(from_list); j++) { + for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) { entryp = av_fetch(from_list, j, FALSE); if (entryp == NULL) { Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); @@ -3643,7 +4528,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) (U8*) SvPVX(*entryp), (U8*) SvPVX(*entryp) + SvCUR(*entryp), 0))); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/ + /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/ } } } @@ -3708,7 +4593,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* Look through list to see if this inverse mapping already is * listed, or if there is a mapping to itself already */ - for (i = 0; i <= av_tindex_nomg(list); i++) { + for (i = 0; i <= av_tindex_skip_len_mg(list); i++) { SV** entryp = av_fetch(list, i, FALSE); SV* entry; UV uv; @@ -3717,7 +4602,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) } entry = *entryp; uv = SvUV(entry); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, uv));*/ + /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));*/ if (uv == val) { found_key = TRUE; } @@ -3735,14 +4620,14 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* Make sure there is a mapping to itself on the list */ if (! found_key) { av_push(list, newSVuv(val)); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/ + /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));*/ } /* Simply add the value to the list */ if (! found_inverse) { av_push(list, newSVuv(inverse)); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/ + /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));*/ } /* swatch_get() increments the value of val for each element in the @@ -3835,7 +4720,6 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) invlist = _new_invlist(0); } else { - while (isSPACE(*l)) l++; l = (U8 *) after_atou; /* Get the 0th element, which is needed to setup the inversion list */ @@ -3850,7 +4734,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) /* Then just populate the rest of the input */ while (elements-- > 0) { if (l > lend) { - Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements); + Perl_croak(aTHX_ "panic: Expecting %" UVuf " more elements than available", elements); } while (isSPACE(*l)) l++; if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) { @@ -3949,7 +4833,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) if (bits != otherbits || bits != 1) { Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean " - "properties, bits=%"UVuf", otherbits=%"UVuf, + "properties, bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits); } @@ -4028,8 +4912,7 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) return FALSE; } if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) { - STRLEN char_len; - if (UTF8_IS_SUPER(s, e)) { + if (UNLIKELY(UTF8_IS_SUPER(s, e))) { if ( ckWARN_d(WARN_NON_UNICODE) || ( ckWARN_d(WARN_DEPRECATED) #ifndef UV_IS_QUAD @@ -4048,24 +4931,24 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) #endif )) { /* A side effect of this function will be to warn */ - (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_SUPER); + (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER); ok = FALSE; } } - else if (UTF8_IS_SURROGATE(s, e)) { + else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) { if (ckWARN_d(WARN_SURROGATE)) { /* This has a different warning than the one the called * function would output, so can't just call it, unlike we * do for the non-chars and above-unicodes */ - UV uv = utf8_to_uvchr_buf(s, e, &char_len); + UV uv = utf8_to_uvchr_buf(s, e, NULL); Perl_warner(aTHX_ packWARN(WARN_SURROGATE), - "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv); + "Unicode surrogate U+%04" UVXf " is illegal in UTF-8", uv); ok = FALSE; } } - else if ((UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) { + else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) { /* A side effect of this function will be to warn */ - (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_NONCHAR); + (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR); ok = FALSE; } } @@ -4102,7 +4985,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f PERL_ARGS_ASSERT_PV_UNI_DISPLAY; - sv_setpvs(dsv, ""); + SvPVCLEAR(dsv); SvUTF8_off(dsv); for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { UV u; @@ -4148,7 +5031,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f } } if (!ok) - Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); + Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u); } if (truncated) sv_catpvs(dsv, "..."); @@ -4361,7 +5244,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c *foldbuf1 = toFOLD(*p1); } else if (u1) { - _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder); + _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder); } else { /* Not UTF-8, get UTF-8 fold */ _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder); @@ -4385,7 +5268,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c *foldbuf2 = toFOLD(*p2); } else if (u2) { - _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder); + _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder); } else { _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);