X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/c551bb019448883c2c8b94191e223158c6d40b29..80c72aa9ab6134a852bea9edb1282bafafbc671b:/utf8.c diff --git a/utf8.c b/utf8.c index f748c20..87017fa 100644 --- a/utf8.c +++ b/utf8.c @@ -37,9 +37,10 @@ static const char malformed_text[] = "Malformed UTF-8 character"; static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; static const char cp_above_legal_max[] = - "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf; + "Use of code point 0x%" UVXf " is not allowed; the" + " permissible max is 0x%" UVXf; -#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX)) +#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX)) /* =head1 Unicode Support @@ -52,6 +53,54 @@ within non-zero characters. =cut */ +void +Perl__force_out_malformed_utf8_message(pTHX_ + const U8 *const p, /* First byte in UTF-8 sequence */ + const U8 * const e, /* Final byte in sequence (may include + multiple chars */ + const U32 flags, /* Flags to pass to utf8n_to_uvchr(), + usually 0, or some DISALLOW flags */ + const bool die_here) /* If TRUE, this function does not return */ +{ + /* This core-only function is to be called when a malformed UTF-8 character + * is found, in order to output the detailed information about the + * malformation before dieing. The reason it exists is for the occasions + * when such a malformation is fatal, but warnings might be turned off, so + * that normally they would not be actually output. This ensures that they + * do get output. Because a sequence may be malformed in more than one + * way, multiple messages may be generated, so we can't make them fatal, as + * that would cause the first one to die. + * + * Instead we pretend -W was passed to perl, then die afterwards. The + * flexibility is here to return to the caller so they can finish up and + * die themselves */ + U32 errors; + + PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE; + + ENTER; + SAVEI8(PL_dowarn); + SAVESPTR(PL_curcop); + + PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; + if (PL_curcop) { + PL_curcop->cop_warnings = pWARN_ALL; + } + + (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors); + + LEAVE; + + if (! errors) { + Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should" + " be called only when there are errors found"); + } + + if (die_here) { + Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + } +} + /* =for apidoc uvoffuni_to_utf8_flags @@ -68,11 +117,21 @@ For details, see the description for L. =cut */ +/* All these formats take a single UV code point argument */ +const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf; +const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf + " is not recommended for open interchange"; +const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode," + " may not be portable"; +const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \ + " Unicode, requires a Perl extension," \ + " and so is not portable"; + #define HANDLE_UNICODE_SURROGATE(uv, flags) \ STMT_START { \ if (flags & UNICODE_WARN_SURROGATE) { \ Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \ - "UTF-16 surrogate U+%04" UVXf, uv); \ + surrogate_cp_format, uv); \ } \ if (flags & UNICODE_DISALLOW_SURROGATE) { \ return NULL; \ @@ -83,8 +142,7 @@ For details, see the description for L. STMT_START { \ if (flags & UNICODE_WARN_NONCHAR) { \ Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \ - "Unicode non-character U+%04" UVXf " is not " \ - "recommended for open interchange", uv); \ + nonchar_cp_format, uv); \ } \ if (flags & UNICODE_DISALLOW_NONCHAR) { \ return NULL; \ @@ -98,7 +156,7 @@ For details, see the description for L. #define MASK UTF_CONTINUATION_MASK U8 * -Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) +Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) { PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS; @@ -150,27 +208,24 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) * performance hit on these high EBCDIC code points. */ if (UNLIKELY(UNICODE_IS_SUPER(uv))) { - if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP); } - if ( (flags & UNICODE_WARN_SUPER) - || ( UNICODE_IS_ABOVE_31_BIT(uv) - && (flags & UNICODE_WARN_ABOVE_31_BIT))) + if ( (flags & UNICODE_WARN_SUPER) + || ( (flags & UNICODE_WARN_PERL_EXTENDED) + && UNICODE_IS_PERL_EXTENDED(uv))) { Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), /* Choose the more dire applicable warning */ - (UNICODE_IS_ABOVE_31_BIT(uv)) - ? "Code point 0x%" UVXf " is not Unicode, and not portable" - : "Code point 0x%" UVXf " is not Unicode, may not be portable", + (UNICODE_IS_PERL_EXTENDED(uv)) + ? perl_extended_cp_format + : super_cp_format, uv); } - if (flags & UNICODE_DISALLOW_SUPER - || ( UNICODE_IS_ABOVE_31_BIT(uv) - && (flags & UNICODE_DISALLOW_ABOVE_31_BIT))) + if ( (flags & UNICODE_DISALLOW_SUPER) + || ( (flags & UNICODE_DISALLOW_PERL_EXTENDED) + && UNICODE_IS_PERL_EXTENDED(uv))) { return NULL; } @@ -238,9 +293,8 @@ is the recommended wide native character-aware way of saying *(d++) = uv; -This function accepts any UV as input, but very high code points (above -C on the platform) will raise a deprecation warning. This is -typically 0x7FFF_FFFF in a 32-bit word. +This function accepts any code point from 0..C as input. +C is typically 0x7FFF_FFFF in a 32-bit word. It is possible to forbid or warn on non-Unicode code points, or those that may be problematic by using L. @@ -275,9 +329,8 @@ This is the Unicode-aware way of saying *(d++) = uv; -If C is 0, this function accepts any UV as input, but very high code -points (above C for the platform) will raise a deprecation warning. -This is typically 0x7FFF_FFFF in a 32-bit word. +If C is 0, this function accepts any code point from 0..C as +input. C is typically 0x7FFF_FFFF in a 32-bit word. Specifying C can further restrict what is allowed and not warned on, as follows: @@ -306,30 +359,27 @@ defined in L. See L. -Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, -so using them is more problematic than other above-Unicode code points. Perl -invented an extension to UTF-8 to represent the ones above 2**36-1, so it is -likely that non-Perl languages will not be able to read files that contain -these that written by the perl interpreter; nor would Perl understand files -written by something that uses a different extension. For these reasons, there -is a separate set of flags that can warn and/or disallow these extremely high -code points, even if other above-Unicode ones are accepted. These are the -C and C flags. These -are entirely independent from the deprecation warning for code points above -C. On 32-bit machines, it will eventually be forbidden to have any -code point that needs more than 31 bits to represent. When that happens, -effectively the C flag will always be set on -32-bit machines. (Of course C will treat all -above-Unicode code points, including these, as malformations; and -C warns on these.) - -On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing -extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower -than on ASCII. Prior to that, code points 2**31 and higher were simply -unrepresentable, and a different, incompatible method was used to represent -code points between 2**30 and 2**31 - 1. The flags C -and C have the same function as on ASCII -platforms, warning and disallowing 2**31 and higher. +Extremely high code points were never specified in any standard, and require an +extension to UTF-8 to express, which Perl does. It is likely that programs +written in something other than Perl would not be able to read files that +contain these; nor would Perl understand files written by something that uses a +different extension. For these reasons, there is a separate set of flags that +can warn and/or disallow these extremely high code points, even if other +above-Unicode ones are accepted. They are the C +and C flags. For more information see +L>. Of course C will +treat all above-Unicode code points, including these, as malformations. (Note +that the Unicode standard considers anything above 0x10FFFF to be illegal, but +there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1)) + +A somewhat misleadingly named synonym for C is +retained for backward compatibility: C. Similarly, +C is usable instead of the more accurately named +C. The names are misleading because these +flags can apply to code points that actually do fit in 31 bits. This happens +on EBCDIC platforms, and sometimes when the L> is also present. The new names accurately +describe the situation in all cases. =cut */ @@ -343,8 +393,12 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) return uvchr_to_utf8_flags(d, uv, flags); } -PERL_STATIC_INLINE bool -S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) +#ifndef UV_IS_QUAD + +STATIC int +S_is_utf8_cp_above_31_bits(const U8 * const s, + const U8 * const e, + const bool consider_overlongs) { /* Returns TRUE if the first code point represented by the Perl-extended- * UTF-8-encoded string starting at 's', and looking no further than 'e - @@ -356,147 +410,170 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) * the final ones necessary for the complete representation may be beyond * 'e - 1'. * - * The function assumes that the sequence is well-formed UTF-8 as far as it - * goes, and is for a UTF-8 variant code point. If the sequence is - * incomplete, the function returns FALSE if there is any well-formed - * UTF-8 byte sequence that can complete it in such a way that a code point - * < 2**31 is produced; otherwise it returns TRUE. + * The function also can handle the case where the input is an overlong + * sequence. If 'consider_overlongs' is 0, the function assumes the + * input is not overlong, without checking, and will return based on that + * assumption. If this parameter is 1, the function will go to the trouble + * of figuring out if it actually evaluates to above or below 31 bits. * - * Getting this exactly right is slightly tricky, and has to be done in - * several places in this file, so is centralized here. It is based on the - * following table: - * - * U+7FFFFFFF (2 ** 31 - 1) - * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF - * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73 - * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72 - * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75 - * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF - * U+80000000 (2 ** 31): - * ASCII: \xFE\x82\x80\x80\x80\x80\x80 - * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13 - * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0 + * The sequence is otherwise assumed to be well-formed, without checking. */ -#ifdef EBCDIC - - /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */ - const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42"; - const STRLEN prefix_len = sizeof(prefix) - 1; const STRLEN len = e - s; - const STRLEN cmp_len = MIN(prefix_len, len - 1); + int is_overlong; -#else + PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS; - PERL_UNUSED_ARG(e); + assert(! UTF8_IS_INVARIANT(*s) && e > s); -#endif - - PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS; +#ifdef EBCDIC - assert(! UTF8_IS_INVARIANT(*s)); + PERL_UNUSED_ARG(consider_overlongs); -#ifndef EBCDIC + /* On the EBCDIC code pages we handle, only the native start byte 0xFE can + * mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can + * also be the start byte for a 31-bit code point; we need at least 2 + * bytes, and maybe up through 8 bytes, to determine that. (It can also be + * the start byte for an overlong sequence, but for 30-bit or smaller code + * points, so we don't have to worry about overlongs on EBCDIC.) */ + if (*s != 0xFE) { + return 0; + } - /* Technically, a start byte of FE can be for a code point that fits into - * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong - * malformation. */ - return (*s >= 0xFE); + if (len == 1) { + return -1; + } #else - /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or - * larger code point (0xFF is an invariant). For 0xFE, we need at least 2 - * bytes, and maybe up through 8 bytes, to be sure if the value is above 31 - * bits. */ - if (*s != 0xFE || len == 1) { - return FALSE; + /* On ASCII, FE and FF are the only start bytes that can evaluate to + * needing more than 31 bits. */ + if (LIKELY(*s < 0xFE)) { + return 0; } - /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are - * \x41 and \x42. */ - return cBOOL(memGT(s + 1, prefix, cmp_len)); + /* What we have left are FE and FF. Both of these require more than 31 + * bits unless they are for overlongs. */ + if (! consider_overlongs) { + return 1; + } -#endif + /* Here, we have FE or FF. If the input isn't overlong, it evaluates to + * above 31 bits. But we need more than one byte to discern this, so if + * passed just the start byte, it could be an overlong evaluating to + * smaller */ + if (len == 1) { + return -1; + } -} + /* Having excluded len==1, and knowing that FE and FF are both valid start + * bytes, we can call the function below to see if the sequence is + * overlong. (We don't need the full generality of the called function, + * but for these huge code points, speed shouldn't be a consideration, and + * the compiler does have enough information, since it's static to this + * file, to optimize to just the needed parts.) */ + is_overlong = is_utf8_overlong_given_start_byte_ok(s, len); -PERL_STATIC_INLINE bool -S_does_utf8_overflow(const U8 * const s, const U8 * e) -{ - const U8 *x; - const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; + /* If it isn't overlong, more than 31 bits are required. */ + if (is_overlong == 0) { + return 1; + } -#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) + /* If it is indeterminate if it is overlong, return that */ + if (is_overlong < 0) { + return -1; + } - const STRLEN len = e - s; + /* Here is overlong. Such a sequence starting with FE is below 31 bits, as + * the max it can be is 2**31 - 1 */ + if (*s == 0xFE) { + return 0; + } #endif - /* Returns a boolean as to if this UTF-8 string would overflow a UV on this - * platform, that is if it represents a code point larger than the highest - * representable code point. (For ASCII platforms, we could use memcmp() - * because we don't have to convert each byte to I8, but it's very rare - * input indeed that would approach overflow, so the loop below will likely - * only get executed once. - * - * 'e' must not be beyond a full character. If it is less than a full - * character, the function returns FALSE if there is any input beyond 'e' - * that could result in a non-overflowing code point */ + /* Here, ASCII and EBCDIC rejoin: + * On ASCII: We have an overlong sequence starting with FF + * On EBCDIC: We have a sequence starting with FE. */ - PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; - assert(s <= e && s + UTF8SKIP(s) >= e); + { /* For C89, use a block so the declaration can be close to its use */ -#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) +#ifdef EBCDIC - /* On 32 bit ASCII machines, many overlongs that start with FF don't - * overflow */ + /* U+7FFFFFFF (2 ** 31 - 1) + * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13 + * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73 + * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72 + * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75 + * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF + * U+80000000 (2 ** 31): + * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0 + * + * and since we know that *s = \xfe, any continuation sequcence + * following it that is gt the below is above 31 bits + [0] [1] [2] [3] [4] [5] [6] */ + const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42"; - if (isFF_OVERLONG(s, len)) { - const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84"; - return memGE(s, max_32_bit_overlong, - MIN(len, sizeof(max_32_bit_overlong) - 1)); - } +#else -#endif + /* FF overlong for U+7FFFFFFF (2 ** 31 - 1) + * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF + * FF overlong for U+80000000 (2 ** 31): + * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80 + * and since we know that *s = \xff, any continuation sequcence + * following it that is gt the below is above 30 bits + [0] [1] [2] [3] [4] [5] [6] */ + const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81"; - for (x = s; x < e; x++, y++) { - /* If this byte is larger than the corresponding highest UTF-8 byte, it - * overflows */ - if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) { - return TRUE; +#endif + const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1; + const STRLEN cmp_len = MIN(conts_len, len - 1); + + /* Now compare the continuation bytes in s with the ones we have + * compiled in that are for the largest 30 bit code point. If we have + * enough bytes available to determine the answer, or the bytes we do + * have differ from them, we can compare the two to get a definitive + * answer (Note that in UTF-EBCDIC, the two lowest possible + * continuation bytes are \x41 and \x42.) */ + if (cmp_len >= conts_len || memNE(s + 1, + conts_for_highest_30_bit, + cmp_len)) + { + return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len)); } - /* If not the same as this byte, it must be smaller, doesn't overflow */ - if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) { - return FALSE; - } + /* Here, all the bytes we have are the same as the highest 30-bit code + * point, but we are missing so many bytes that we can't make the + * determination */ + return -1; } - - /* Got to the end and all bytes are the same. If the input is a whole - * character, it doesn't overflow. And if it is a partial character, - * there's not enough information to tell, so assume doesn't overflow */ - return FALSE; } -PERL_STATIC_INLINE bool +#endif + +PERL_STATIC_INLINE int S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) { - /* Overlongs can occur whenever the number of continuation bytes - * changes. That means whenever the number of leading 1 bits in a start - * byte increases from the next lower start byte. That happens for start - * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following - * illegal start bytes have already been excluded, so don't need to be - * tested here; + /* Returns an int indicating whether or not the UTF-8 sequence from 's' to + * 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if + * it isn't, and -1 if there isn't enough information to tell. This last + * return value can happen if the sequence is incomplete, missing some + * trailing bytes that would form a complete character. If there are + * enough bytes to make a definitive decision, this function does so. + * Usually 2 bytes sufficient. + * + * Overlongs can occur whenever the number of continuation bytes changes. + * That means whenever the number of leading 1 bits in a start byte + * increases from the next lower start byte. That happens for start bytes + * C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal + * start bytes have already been excluded, so don't need to be tested here; * ASCII platforms: C0, C1 * EBCDIC platforms C0, C1, C2, C3, C4, E0 - * - * At least a second byte is required to determine if other sequences will - * be an overlong. */ + */ const U8 s0 = NATIVE_UTF8_TO_I8(s[0]); const U8 s1 = NATIVE_UTF8_TO_I8(s[1]); @@ -521,7 +598,7 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) # else if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) { - return TRUE; + return 1; } # define F0_ABOVE_OVERLONG 0x90 @@ -537,27 +614,178 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len) || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG)) || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG))) { - return TRUE; + return 1; } /* Check for the FF overlong */ return isFF_OVERLONG(s, len); } -PERL_STATIC_INLINE bool +PERL_STATIC_INLINE int S_isFF_OVERLONG(const U8 * const s, const STRLEN len) { + /* Returns an int indicating whether or not the UTF-8 sequence from 's' to + * 'e' - 1 is an overlong beginning with \xFF. It returns 1 if it is; 0 if + * it isn't, and -1 if there isn't enough information to tell. This last + * return value can happen if the sequence is incomplete, missing some + * trailing bytes that would form a complete character. If there are + * enough bytes to make a definitive decision, this function does so. */ + PERL_ARGS_ASSERT_ISFF_OVERLONG; - /* Check for the FF overlong. This happens only if all these bytes match; - * what comes after them doesn't matter. See tables in utf8.h, + /* To be an FF overlong, all the available bytes must match */ + if (LIKELY(memNE(s, FF_OVERLONG_PREFIX, + MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1)))) + { + return 0; + } + + /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must + * be there; what comes after them doesn't matter. See tables in utf8.h, * utfebcdic.h. */ + if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) { + return 1; + } + + /* The missing bytes could cause the result to go one way or the other, so + * the result is indeterminate */ + return -1; +} + +#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */ +# ifdef EBCDIC /* Actually is I8 */ +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# else +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# endif +#endif + +PERL_STATIC_INLINE int +S_does_utf8_overflow(const U8 * const s, + const U8 * e, + const bool consider_overlongs) +{ + /* Returns an int indicating whether or not the UTF-8 sequence from 's' to + * 'e' - 1 would overflow an IV on this platform; that is if it represents + * a code point larger than the highest representable code point. It + * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't + * enough information to tell. This last return value can happen if the + * sequence is incomplete, missing some trailing bytes that would form a + * complete character. If there are enough bytes to make a definitive + * decision, this function does so. + * + * If 'consider_overlongs' is TRUE, the function checks for the possibility + * that the sequence is an overlong that doesn't overflow. Otherwise, it + * assumes the sequence is not an overlong. This can give different + * results only on ASCII 32-bit platforms. + * + * (For ASCII platforms, we could use memcmp() because we don't have to + * convert each byte to I8, but it's very rare input indeed that would + * approach overflow, so the loop below will likely only get executed once.) + * + * 'e' - 1 must not be beyond a full character. */ + + + PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; + assert(s <= e && s + UTF8SKIP(s) >= e); + +#if ! defined(UV_IS_QUAD) + + return is_utf8_cp_above_31_bits(s, e, consider_overlongs); + +#else + + PERL_UNUSED_ARG(consider_overlongs); + + { + const STRLEN len = e - s; + const U8 *x; + const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; + + for (x = s; x < e; x++, y++) { + + if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) { + continue; + } + + /* If this byte is larger than the corresponding highest UTF-8 + * byte, the sequence overflow; otherwise the byte is less than, + * and so the sequence doesn't overflow */ + return NATIVE_UTF8_TO_I8(*x) > *y; + + } + + /* Got to the end and all bytes are the same. If the input is a whole + * character, it doesn't overflow. And if it is a partial character, + * there's not enough information to tell */ + if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) { + return -1; + } + + return 0; + } + +#endif - return len >= sizeof(FF_OVERLONG_PREFIX) - 1 - && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX, - sizeof(FF_OVERLONG_PREFIX) - 1)); } +#if 0 + +/* This is the portions of the above function that deal with UV_MAX instead of + * IV_MAX. They are left here in case we want to combine them so that internal + * uses can have larger code points. The only logic difference is that the + * 32-bit EBCDIC platform is treate like the 64-bit, and the 32-bit ASCII has + * different logic. + */ + +/* Anything larger than this will overflow the word if it were converted into a UV */ +#if defined(UV_IS_QUAD) +# ifdef EBCDIC /* Actually is I8 */ +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# else +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# endif +#else /* 32-bit */ +# ifdef EBCDIC +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF" +# else +# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF" +# endif +#endif + +#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) + + /* On 32 bit ASCII machines, many overlongs that start with FF don't + * overflow */ + if (consider_overlongs && isFF_OVERLONG(s, len) > 0) { + + /* To be such an overlong, the first bytes of 's' must match + * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we + * don't have any additional bytes available, the sequence, when + * completed might or might not fit in 32 bits. But if we have that + * next byte, we can tell for sure. If it is <= 0x83, then it does + * fit. */ + if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) { + return -1; + } + + return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83; + } + +/* Starting with the #else, the rest of the function is identical except + * 1. we need to move the 'len' declaration to be global to the function + * 2. the endif move to just after the UNUSED_ARG. + * An empty endif is given just below to satisfy the preprocessor + */ +#endif + +#endif + #undef F0_ABOVE_OVERLONG #undef F8_ABOVE_OVERLONG #undef FC_ABOVE_OVERLONG @@ -586,8 +814,8 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) * return will be larger than 'e - s'. * * This function assumes that the code point represented is UTF-8 variant. - * The caller should have excluded this possibility before calling this - * function. + * The caller should have excluded the possibility of it being invariant + * before calling this function. * * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags * accepted by L. If non-zero, this function will return @@ -605,7 +833,7 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE - |UTF8_DISALLOW_ABOVE_31_BIT))); + |UTF8_DISALLOW_PERL_EXTENDED))); assert(! UTF8_IS_INVARIANT(*s)); /* A variant char must begin with a start byte */ @@ -623,17 +851,29 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) { const U8 s0 = NATIVE_UTF8_TO_I8(s[0]); - /* The code below is derived from this table. Keep in mind that legal - * continuation bytes range between \x80..\xBF for UTF-8, and - * \xA0..\xBF for I8. Anything above those aren't continuation bytes. - * Hence, we don't have to test the upper edge because if any of those - * are encountered, the sequence is malformed, and will fail elsewhere - * in this function. + /* Here, we are disallowing some set of largish code points, and the + * first byte indicates the sequence is for a code point that could be + * in the excluded set. We generally don't have to look beyond this or + * the second byte to see if the sequence is actually for one of the + * excluded classes. The code below is derived from this table: + * * UTF-8 UTF-EBCDIC I8 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode * + * Keep in mind that legal continuation bytes range between \x80..\xBF + * for UTF-8, and \xA0..\xBF for I8. Anything above those aren't + * continuation bytes. Hence, we don't have to test the upper edge + * because if any of those is encountered, the sequence is malformed, + * and would fail elsewhere in this function. + * + * The code here likewise assumes that there aren't other + * malformations; again the function should fail elsewhere because of + * these. For example, an overlong beginning with FC doesn't actually + * have to be a super; it could actually represent a small code point, + * even U+0000. But, since overlongs (and other malformations) are + * illegal, the function should return FALSE in either case. */ #ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */ @@ -643,21 +883,24 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) # define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \ /* B6 and B7 */ \ && ((s1) & 0xFE ) == 0xB6) +# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF)) #else # define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5 # define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90) # define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0) +# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE) #endif if ( (flags & UTF8_DISALLOW_SUPER) - && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) { + && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) + { return 0; /* Above Unicode */ } - if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT) - && UNLIKELY(is_utf8_cp_above_31_bits(s, e))) + if ( (flags & UTF8_DISALLOW_PERL_EXTENDED) + && UNLIKELY(isUTF8_PERL_EXTENDED(s))) { - return 0; /* Above 31 bits */ + return 0; } if (len > 1) { @@ -692,28 +935,36 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) /* Here is syntactically valid. Next, make sure this isn't the start of an * overlong. */ - if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) { + if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) { return 0; } /* And finally, that the code point represented fits in a word on this * platform */ - if (does_utf8_overflow(s, e)) { + if (0 < does_utf8_overflow(s, e, + 0 /* Don't consider overlongs */ + )) + { return 0; } return UTF8SKIP(s); } -STATIC char * -S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len) +char * +Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format) { /* Returns a mortalized C string that is a displayable copy of the 'len' - * bytes starting at 's', each in a \xXY format. */ + * bytes starting at 'start'. 'format' gives how to display each byte. + * Currently, there are only two formats, so it is currently a bool: + * 0 \xab + * 1 ab (that is a space between two hex digit bytes) + */ const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a trailing NUL */ - const U8 * const e = s + len; + const U8 * s = start; + const U8 * const e = start + len; char * output; char * d; @@ -723,12 +974,19 @@ S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len) SAVEFREEPV(output); d = output; - for (; s < e; s++) { + for (s = start; s < e; s++) { const unsigned high_nibble = (*s & 0xF0) >> 4; const unsigned low_nibble = (*s & 0x0F); - *d++ = '\\'; - *d++ = 'x'; + if (format) { + if (s > start) { + *d++ = ' '; + } + } + else { + *d++ = '\\'; + *d++ = 'x'; + } if (high_nibble < 10) { *d++ = high_nibble + '0'; @@ -768,7 +1026,6 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, ? "immediately" : Perl_form(aTHX_ "%d bytes", (int) non_cont_byte_pos); - unsigned int i; PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT; @@ -776,22 +1033,10 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, * calculated, it's likely faster to pass it; verify under DEBUGGING */ assert(expect_len == UTF8SKIP(s)); - /* It is possible that utf8n_to_uvchr() was called incorrectly, with a - * length that is larger than is actually available in the buffer. If we - * print all the bytes based on that length, we will read past the buffer - * end. Often, the strings are NUL terminated, so to lower the chances of - * this happening, print the malformed bytes only up through any NUL. */ - for (i = 1; i < print_len; i++) { - if (*(s + i) == '\0') { - print_len = i + 1; /* +1 gets the NUL printed */ - break; - } - } - return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," " %s after start byte 0x%02x; need %d bytes, got %d)", malformed_text, - _byte_dump_string(s, print_len), + _byte_dump_string(s, print_len, 0), *(s + non_cont_byte_pos), where, *s, @@ -819,7 +1064,7 @@ is the next possible position in C that could begin a non-malformed character. Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised. Some UTF-8 input sequences may contain multiple malformations. This function tries to find every possible one in each call, so multiple -warnings can be raised for each sequence. +warnings can be raised for the same sequence. Various ALLOW flags can be set in C to allow (and not warn on) individual types of malformations, such as the sequence being overlong (that @@ -827,10 +1072,10 @@ is, when there is a shorter sequence that can express the same code point; overlong sequences are expressly forbidden in the UTF-8 standard due to potential security issues). Another malformation example is the first byte of a character not being a legal first byte. See F for the list of such -flags. For allowed 0 length strings, this function returns 0; for allowed -overlong sequences, the computed code point is returned; for all other allowed -malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no -determinable reasonable value. +flags. Even if allowed, this function generally returns the Unicode +REPLACEMENT CHARACTER when it encounters a malformation. There are flags in +F to override this behavior for the overlong malformations, but don't +do that except for very specialized purposes. The C flag overrides the behavior when a non-allowed (by other flags) malformation is found. If this flag is set, the routine assumes that @@ -870,35 +1115,29 @@ a malformation and raise a warning, specify both the WARN and DISALLOW flags. (But note that warnings are not raised if lexically disabled nor if C is also specified.) -It is now deprecated to have very high code points (above C on the -platforms) and this function will raise a deprecation warning for these (unless -such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1) -in a 32-bit word. +Extremely high code points were never specified in any standard, and require an +extension to UTF-8 to express, which Perl does. It is likely that programs +written in something other than Perl would not be able to read files that +contain these; nor would Perl understand files written by something that uses a +different extension. For these reasons, there is a separate set of flags that +can warn and/or disallow these extremely high code points, even if other +above-Unicode ones are accepted. They are the C and +C flags. For more information see +L>. Of course C will treat all +above-Unicode code points, including these, as malformations. +(Note that the Unicode standard considers anything above 0x10FFFF to be +illegal, but there are standards predating it that allow up to 0x7FFF_FFFF +(2**31 -1)) + +A somewhat misleadingly named synonym for C is +retained for backward compatibility: C. Similarly, +C is usable instead of the more accurately named +C. The names are misleading because these flags +can apply to code points that actually do fit in 31 bits. This happens on +EBCDIC platforms, and sometimes when the L> is also present. The new names accurately +describe the situation in all cases. -Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, -so using them is more problematic than other above-Unicode code points. Perl -invented an extension to UTF-8 to represent the ones above 2**36-1, so it is -likely that non-Perl languages will not be able to read files that contain -these; nor would Perl understand files -written by something that uses a different extension. For these reasons, there -is a separate set of flags that can warn and/or disallow these extremely high -code points, even if other above-Unicode ones are accepted. These are the -C and C flags. These -are entirely independent from the deprecation warning for code points above -C. On 32-bit machines, it will eventually be forbidden to have any -code point that needs more than 31 bits to represent. When that happens, -effectively the C flag will always be set on -32-bit machines. (Of course C will treat all -above-Unicode code points, including these, as malformations; and -C warns on these.) - -On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing -extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower -than on ASCII. Prior to that, code points 2**31 and higher were simply -unrepresentable, and a different, incompatible method was used to represent -code points between 2**30 and 2**31 - 1. The flags C -and C have the same function as on ASCII -platforms, warning and disallowing 2**31 and higher. All other code points corresponding to Unicode characters, including private use and those yet to be assigned, are never considered malformed and never @@ -937,17 +1176,41 @@ to a C variable, which this function sets to indicate any errors found. Upon return, if C<*errors> is 0, there were no errors found. Otherwise, C<*errors> is the bit-wise C of the bits described in the list below. Some of these bits will be set if a malformation is found, even if the input -C parameter indicates that the given malformation is allowed; the +C parameter indicates that the given malformation is allowed; those exceptions are noted: =over 4 -=item C +=item C -The code point represented by the input UTF-8 sequence occupies more than 31 -bits. -This bit is set only if the input C parameter contains either the -C or the C flags. +The input sequence is not standard UTF-8, but a Perl extension. This bit is +set only if the input C parameter contains either the +C or the C flags. + +Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, +and so some extension must be used to express them. Perl uses a natural +extension to UTF-8 to represent the ones up to 2**36-1, and invented a further +extension to represent even higher ones, so that any code point that fits in a +64-bit word can be represented. Text using these extensions is not likely to +be portable to non-Perl code. We lump both of these extensions together and +refer to them as Perl extended UTF-8. There exist other extensions that people +have invented, incompatible with Perl's. + +On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing +extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower +than on ASCII. Prior to that, code points 2**31 and higher were simply +unrepresentable, and a different, incompatible method was used to represent +code points between 2**30 and 2**31 - 1. + +On both platforms, ASCII and EBCDIC, C is set if +Perl extended UTF-8 is used. + +In earlier Perls, this bit was named C, which you still +may use for backward compatibility. That name is misleading, as this flag may +be set when the code point actually does fit in 31 bits. This happens on +EBCDIC platforms, and sometimes when the L> is also present. The new name accurately +describes the situation in all cases. =item C @@ -963,6 +1226,9 @@ The input C parameter was 0. The input sequence was malformed in that there is some other sequence that evaluates to the same code point, but that sequence is shorter than this one. +Until Unicode 3.1, it was legal for programs to accept this malformation, but +it was discovered that this created security issues. + =item C The code point represented by the input UTF-8 sequence is for a Unicode @@ -978,7 +1244,7 @@ in a position where only a continuation type one should be. =item C The input sequence was malformed in that it is for a code point that is not -representable in the number of bits available in a UV on the current platform. +representable in the number of bits available in an IV on the current platform. =item C @@ -1002,6 +1268,9 @@ C or the C flags. =back +To do your own error handling, call this function with the C +flag to suppress any warnings, and then examine the C<*errors> return. + =cut */ @@ -1029,8 +1298,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * too short one. Otherwise the first two are set to 's0' and 'send', and * the third not used at all */ U8 * adjusted_s0 = (U8 *) s0; - U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong - warning) */ + U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this + routine; see [perl #130921] */ UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */ PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; @@ -1072,8 +1341,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (UNLIKELY(curlen == 0)) { possible_problems |= UTF8_GOT_EMPTY; curlen = 0; - uv = 0; /* XXX It could be argued that this should be - UNICODE_REPLACEMENT? */ + uv = UNICODE_REPLACEMENT; goto ready_to_handle_errors; } @@ -1101,7 +1369,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, } /* Here is not a continuation byte, nor an invariant. The only thing left - * is a start byte (possibly for an overlong) */ + * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START + * because it excludes start bytes like \xC0 that always lead to + * overlongs.) */ /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits * that indicate the number of bytes in the character's whole UTF-8 @@ -1119,7 +1389,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else { send += expectlen; } - adjusted_send = send; /* Now, loop through the remaining bytes in the character's sequence, * accumulating each into the working value as we go. */ @@ -1139,14 +1408,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Save how many bytes were actually in the character */ curlen = s - s0; - /* A convenience macro that matches either of the too-short conditions. */ -# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) - - if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { - uv_so_far = uv; - uv = UNICODE_REPLACEMENT; - } - /* Note that there are two types of too-short malformation. One is when * there is actual wrong data before the normal termination of the * sequence. The other is that the sequence wasn't complete before the end @@ -1154,10 +1415,22 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * This means that we were passed data for a partial character, but it is * valid as far as we saw. The other is definitely invalid. This * distinction could be important to a caller, so the two types are kept - * separate. */ + * separate. + * + * A convenience macro that matches either of the too-short conditions. */ +# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) - /* Check for overflow */ - if (UNLIKELY(does_utf8_overflow(s0, send))) { + if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { + uv_so_far = uv; + uv = UNICODE_REPLACEMENT; + } + + /* Check for overflow. The algorithm requires us to not look past the end + * of the current character, even if partial, so the upper limit is 's' */ + if (UNLIKELY(0 < does_utf8_overflow(s0, s, + 1 /* Do consider overlongs */ + ))) + { possible_problems |= UTF8_GOT_OVERFLOW; uv = UNICODE_REPLACEMENT; } @@ -1168,15 +1441,22 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * overlong */ if ( ( LIKELY(! possible_problems) && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) - || ( UNLIKELY( possible_problems) + || ( UNLIKELY(possible_problems) && ( UNLIKELY(! UTF8_IS_START(*s0)) || ( curlen > 1 - && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0, - send - s0)))))) + && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0, + s - s0)))))) { possible_problems |= UTF8_GOT_LONG; - if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { + if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT) + + /* The calculation in the 'true' branch of this 'if' + * below won't work if overflows, and isn't needed + * anyway. Further below we handle all overflow + * cases */ + && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) + { UV min_uv = uv_so_far; STRLEN i; @@ -1184,47 +1464,48 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * bytes. There is no single code point it could be for, but there * may be enough information present to determine if what we have * so far is for an unallowed code point, such as for a surrogate. - * The code below has the intelligence to determine this, but just - * for non-overlong UTF-8 sequences. What we do here is calculate - * the smallest code point the input could represent if there were - * no too short malformation. Then we compute and save the UTF-8 - * for that, which is what the code below looks at instead of the - * raw input. It turns out that the smallest such code point is - * all we need. */ + * The code further below has the intelligence to determine this, + * but just for non-overlong UTF-8 sequences. What we do here is + * calculate the smallest code point the input could represent if + * there were no too short malformation. Then we compute and save + * the UTF-8 for that, which is what the code below looks at + * instead of the raw input. It turns out that the smallest such + * code point is all we need. */ for (i = curlen; i < expectlen; i++) { min_uv = UTF8_ACCUMULATE(min_uv, I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK)); } - Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8); - SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get - to free it ourselves if - warnings are made fatal */ - adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); + adjusted_s0 = temp_char_buf; + (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); } } - /* Now check that the input isn't for a problematic code point not allowed - * by the input parameters. */ - /* isn't problematic if < this */ - if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST) + /* Here, we have found all the possible problems, except for when the input + * is for a problematic code point not allowed by the input parameters. */ + + /* uv is valid for overlongs */ + if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) + + /* isn't problematic if < this */ + && uv >= UNICODE_SURROGATE_FIRST) || ( UNLIKELY(possible_problems) - && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0))) + + /* if overflow, we know without looking further + * precisely which of the problematic types it is, + * and we deal with those in the overflow handling + * code */ + && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) + && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) + || UNLIKELY(isUTF8_PERL_EXTENDED(s0))))) && ((flags & ( UTF8_DISALLOW_NONCHAR |UTF8_DISALLOW_SURROGATE |UTF8_DISALLOW_SUPER - |UTF8_DISALLOW_ABOVE_31_BIT + |UTF8_DISALLOW_PERL_EXTENDED |UTF8_WARN_NONCHAR |UTF8_WARN_SURROGATE |UTF8_WARN_SUPER - |UTF8_WARN_ABOVE_31_BIT)) - /* In case of a malformation, 'uv' is not valid, and has - * been changed to something in the Unicode range. - * Currently we don't output a deprecation message if there - * is already a malformation, so we don't have to special - * case the test immediately below */ - || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)))) + |UTF8_WARN_PERL_EXTENDED)))) { /* If there were no malformations, or the only malformation is an * overlong, 'uv' is valid */ @@ -1283,15 +1564,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, * some subsitute value, typically the REPLACEMENT * CHARACTER. * s0 points to the first byte of the character - * send points to just after where that (potentially - * partial) character ends - * adjusted_s0 normally is the same as s0, but in case of an - * overlong for which the UTF-8 matters below, it is - * the first byte of the shortest form representation - * of the input. - * adjusted_send normally is the same as 'send', but if adjusted_s0 - * is set to something other than s0, this points one - * beyond its end + * s points to just after were we left off processing + * the character + * send points to just after where that character should + * end, based on how many bytes the start byte tells + * us should be in it, but no further than s0 + + * avail_len */ if (UNLIKELY(possible_problems)) { @@ -1304,41 +1582,58 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Each 'if' clause handles one problem. They are ordered so that * the first ones' messages will be displayed before the later - * ones; this is kinda in decreasing severity order */ + * ones; this is kinda in decreasing severity order. But the + * overlong must come last, as it changes 'uv' looked at by the + * others */ if (possible_problems & UTF8_GOT_OVERFLOW) { - /* Overflow means also got a super and above 31 bits, but we - * handle all three cases here */ + /* Overflow means also got a super and are using Perl's + * extended UTF-8, but we handle all three cases here */ possible_problems - &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT); + &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED); *errors |= UTF8_GOT_OVERFLOW; /* But the API says we flag all errors found */ if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) { *errors |= UTF8_GOT_SUPER; } - if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) { - *errors |= UTF8_GOT_ABOVE_31_BIT; + if (flags + & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED)) + { + *errors |= UTF8_GOT_PERL_EXTENDED; } - disallowed = TRUE; + /* Disallow if any of the three categories say to */ + if ( ! (flags & UTF8_ALLOW_OVERFLOW) + || (flags & ( UTF8_DISALLOW_SUPER + |UTF8_DISALLOW_PERL_EXTENDED))) + { + disallowed = TRUE; + } - /* The warnings code explicitly says it doesn't handle the case - * of packWARN2 and two categories which have parent-child - * relationship. Even if it works now to raise the warning if - * either is enabled, it wouldn't necessarily do so in the - * future. We output (only) the most dire warning*/ - if (! (flags & UTF8_CHECK_ONLY)) { - if (ckWARN_d(WARN_UTF8)) { - pack_warn = packWARN(WARN_UTF8); - } - else if (ckWARN_d(WARN_NON_UNICODE)) { - pack_warn = packWARN(WARN_NON_UNICODE); - } - if (pack_warn) { - message = Perl_form(aTHX_ "%s: %s (overflows)", - malformed_text, - _byte_dump_string(s0, send - s0)); + /* Likewise, warn if any say to */ + if ( ! (flags & UTF8_ALLOW_OVERFLOW) + || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED))) + { + + /* The warnings code explicitly says it doesn't handle the + * case of packWARN2 and two categories which have + * parent-child relationship. Even if it works now to + * raise the warning if either is enabled, it wouldn't + * necessarily do so in the future. We output (only) the + * most dire warning */ + if (! (flags & UTF8_CHECK_ONLY)) { + if (ckWARN_d(WARN_UTF8)) { + pack_warn = packWARN(WARN_UTF8); + } + else if (ckWARN_d(WARN_NON_UNICODE)) { + pack_warn = packWARN(WARN_NON_UNICODE); + } + if (pack_warn) { + message = Perl_form(aTHX_ "%s: %s (overflows)", + malformed_text, + _byte_dump_string(s0, curlen, 0)); + } } } } @@ -1347,6 +1642,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_EMPTY; if (! (flags & UTF8_ALLOW_EMPTY)) { + + /* This so-called malformation is now treated as a bug in + * the caller. If you have nothing to decode, skip calling + * this function */ + assert(0); + disallowed = TRUE; if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { pack_warn = packWARN(WARN_UTF8); @@ -1367,7 +1668,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, "%s: %s (unexpected continuation byte 0x%02x," " with no preceding start byte)", malformed_text, - _byte_dump_string(s0, 1), *s0); + _byte_dump_string(s0, 1, 0), *s0); } } } @@ -1380,12 +1681,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ - "%s: %s (too short; %d byte%s available, need %d)", - malformed_text, - _byte_dump_string(s0, send - s0), - (int)avail_len, - avail_len == 1 ? "" : "s", - (int)expectlen); + "%s: %s (too short; %d byte%s available, need %d)", + malformed_text, + _byte_dump_string(s0, send - s0, 0), + (int)avail_len, + avail_len == 1 ? "" : "s", + (int)expectlen); } } @@ -1397,58 +1698,22 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { disallowed = TRUE; if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + + /* If we don't know for sure that the input length is + * valid, avoid as much as possible reading past the + * end of the buffer */ + int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN) + ? s - s0 + : send - s0; pack_warn = packWARN(WARN_UTF8); message = Perl_form(aTHX_ "%s", unexpected_non_continuation_text(s0, - send - s0, + printlen, s - s0, (int) expectlen)); } } } - else if (possible_problems & UTF8_GOT_LONG) { - possible_problems &= ~UTF8_GOT_LONG; - *errors |= UTF8_GOT_LONG; - - if (! (flags & UTF8_ALLOW_LONG)) { - disallowed = TRUE; - - if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { - pack_warn = packWARN(WARN_UTF8); - - /* These error types cause 'uv' to be something that - * isn't what was intended, so can't use it in the - * message. The other error types either can't - * generate an overlong, or else the 'uv' is valid */ - if (orig_problems & - (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) - { - message = Perl_form(aTHX_ - "%s: %s (any UTF-8 sequence that starts" - " with \"%s\" is overlong which can and" - " should be represented with a" - " different, shorter sequence)", - malformed_text, - _byte_dump_string(s0, send - s0), - _byte_dump_string(s0, curlen)); - } - else { - U8 tmpbuf[UTF8_MAXBYTES+1]; - const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf, - uv, 0); - message = Perl_form(aTHX_ - "%s: %s (overlong; instead use %s to represent" - " U+%0*" UVXf ")", - malformed_text, - _byte_dump_string(s0, send - s0), - _byte_dump_string(tmpbuf, e - tmpbuf), - ((uv < 256) ? 2 : 4), /* Field width of 2 for - small code points */ - uv); - } - } - } - } else if (possible_problems & UTF8_GOT_SURROGATE) { possible_problems &= ~UTF8_GOT_SURROGATE; @@ -1466,11 +1731,10 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, message = Perl_form(aTHX_ "UTF-16 surrogate (any UTF-8 sequence that" " starts with \"%s\" is for a surrogate)", - _byte_dump_string(s0, curlen)); + _byte_dump_string(s0, curlen, 0)); } else { - message = Perl_form(aTHX_ - "UTF-16 surrogate U+%04" UVXf, uv); + message = Perl_form(aTHX_ surrogate_cp_format, uv); } } } @@ -1496,59 +1760,52 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, "Any UTF-8 sequence that starts with" " \"%s\" is for a non-Unicode code point," " may not be portable", - _byte_dump_string(s0, curlen)); + _byte_dump_string(s0, curlen, 0)); } else { - message = Perl_form(aTHX_ - "Code point 0x%04" UVXf " is not" - " Unicode, may not be portable", - uv); + message = Perl_form(aTHX_ super_cp_format, uv); } } } - /* The maximum code point ever specified by a standard was - * 2**31 - 1. Anything larger than that is a Perl extension - * that very well may not be understood by other applications - * (including earlier perl versions on EBCDIC platforms). We - * test for these after the regular SUPER ones, and before - * possibly bailing out, so that the slightly more dire warning - * will override the regular one. */ - if ( (flags & (UTF8_WARN_ABOVE_31_BIT - |UTF8_WARN_SUPER - |UTF8_DISALLOW_ABOVE_31_BIT)) - && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT) - && UNLIKELY(is_utf8_cp_above_31_bits( - adjusted_s0, - adjusted_send))) - || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT)) - && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv))))) - { + /* Test for Perl's extended UTF-8 after the regular SUPER ones, + * and before possibly bailing out, so that the more dire + * warning will override the regular one. */ + if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) { if ( ! (flags & UTF8_CHECK_ONLY) - && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER)) - && ckWARN_d(WARN_UTF8)) + && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER)) + && ckWARN_d(WARN_NON_UNICODE)) { - pack_warn = packWARN(WARN_UTF8); + pack_warn = packWARN(WARN_NON_UNICODE); - if (orig_problems & UTF8_GOT_TOO_SHORT) { + /* If it is an overlong that evaluates to a code point + * that doesn't have to use the Perl extended UTF-8, it + * still used it, and so we output a message that + * doesn't refer to the code point. The same is true + * if there was a SHORT malformation where the code + * point is not valid. In that case, 'uv' will have + * been set to the REPLACEMENT CHAR, and the message + * below without the code point in it will be selected + * */ + if (UNICODE_IS_PERL_EXTENDED(uv)) { message = Perl_form(aTHX_ - "Any UTF-8 sequence that starts with" - " \"%s\" is for a non-Unicode code" - " point, and is not portable", - _byte_dump_string(s0, curlen)); + perl_extended_cp_format, uv); } else { message = Perl_form(aTHX_ - "Code point 0x%" UVXf " is not Unicode," - " and not portable", - uv); + "Any UTF-8 sequence that starts with" + " \"%s\" is a Perl extension, and" + " so is not portable", + _byte_dump_string(s0, curlen, 0)); } } - if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) { - *errors |= UTF8_GOT_ABOVE_31_BIT; + if (flags & ( UTF8_WARN_PERL_EXTENDED + |UTF8_DISALLOW_PERL_EXTENDED)) + { + *errors |= UTF8_GOT_PERL_EXTENDED; - if (flags & UTF8_DISALLOW_ABOVE_31_BIT) { + if (flags & UTF8_DISALLOW_PERL_EXTENDED) { disallowed = TRUE; } } @@ -1558,21 +1815,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_SUPER; disallowed = TRUE; } - - /* The deprecated warning overrides any non-deprecated one. If - * there are other problems, a deprecation message is not - * really helpful, so don't bother to raise it in that case. - * This also keeps the code from having to handle the case - * where 'uv' is not valid. */ - if ( ! (orig_problems - & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) - && UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - message = Perl_form(aTHX_ cp_above_legal_max, - uv, MAX_NON_DEPRECATED_CP); - pack_warn = packWARN(WARN_DEPRECATED); - } } else if (possible_problems & UTF8_GOT_NONCHAR) { possible_problems &= ~UTF8_GOT_NONCHAR; @@ -1589,9 +1831,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR))); pack_warn = packWARN(WARN_NONCHAR); - message = Perl_form(aTHX_ "Unicode non-character" - " U+%04" UVXf " is not recommended" - " for open interchange", uv); + message = Perl_form(aTHX_ nonchar_cp_format, uv); } } @@ -1599,6 +1839,69 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, disallowed = TRUE; *errors |= UTF8_GOT_NONCHAR; } + } + else if (possible_problems & UTF8_GOT_LONG) { + possible_problems &= ~UTF8_GOT_LONG; + *errors |= UTF8_GOT_LONG; + + if (flags & UTF8_ALLOW_LONG) { + + /* We don't allow the actual overlong value, unless the + * special extra bit is also set */ + if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE + & ~UTF8_ALLOW_LONG))) + { + uv = UNICODE_REPLACEMENT; + } + } + else { + disallowed = TRUE; + + if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) { + pack_warn = packWARN(WARN_UTF8); + + /* These error types cause 'uv' to be something that + * isn't what was intended, so can't use it in the + * message. The other error types either can't + * generate an overlong, or else the 'uv' is valid */ + if (orig_problems & + (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) + { + message = Perl_form(aTHX_ + "%s: %s (any UTF-8 sequence that starts" + " with \"%s\" is overlong which can and" + " should be represented with a" + " different, shorter sequence)", + malformed_text, + _byte_dump_string(s0, send - s0, 0), + _byte_dump_string(s0, curlen, 0)); + } + else { + U8 tmpbuf[UTF8_MAXBYTES+1]; + const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf, + uv, 0); + /* Don't use U+ for non-Unicode code points, which + * includes those in the Latin1 range */ + const char * preface = ( uv > PERL_UNICODE_MAX +#ifdef EBCDIC + || uv <= 0xFF +#endif + ) + ? "0x" + : "U+"; + message = Perl_form(aTHX_ + "%s: %s (overlong; instead use %s to represent" + " %s%0*" UVXf ")", + malformed_text, + _byte_dump_string(s0, send - s0, 0), + _byte_dump_string(tmpbuf, e - tmpbuf, 0), + preface, + ((uv < 256) ? 2 : 4), /* Field width of 2 for + small code points */ + UNI_TO_NATIVE(uv)); + } + } + } } /* End of looking through the possible flags */ /* Display the message (if any) for the problem being handled in @@ -1610,7 +1913,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, else Perl_warner(aTHX_ pack_warn, "%s", message); } - } /* End of 'while (possible_problems) {' */ + } /* End of 'while (possible_problems)' */ /* Since there was a possible problem, the returned length may need to * be changed from the one stored at the beginning of this function. @@ -1646,9 +1949,6 @@ the next possible position in C that could begin a non-malformed character. See L for details on when the REPLACEMENT CHARACTER is returned. -Code points above the platform's C will raise a deprecation warning, -unless those are turned off. - =cut Also implemented as a macro in utf8.h @@ -1659,10 +1959,12 @@ Also implemented as a macro in utf8.h UV Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) { + PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; + assert(s < send); return utf8n_to_uvchr(s, send - s, retlen, - ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* This is marked as deprecated @@ -1686,9 +1988,6 @@ is set (if C isn't NULL) so that (S + C<*retlen>>) is the next possible position in C that could begin a non-malformed character. See L for details on when the REPLACEMENT CHARACTER is returned. -Code points above the platform's C will raise a deprecation warning, -unless those are turned off. - =cut */ @@ -1699,7 +1998,6 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) assert(send > s); - /* Call the low level routine, asking for checks */ return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen)); } @@ -1707,8 +2005,8 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) =for apidoc utf8_length Return the length of the UTF-8 char encoded string C in characters. -Stops at C (inclusive). If C s> or if the scan would end -up past C, croaks. +Stops at C (i.e. the C<*e> byte does not form part of the character). +If C s> or if the scan would end up past C, it croaks. =cut */ @@ -1780,10 +2078,10 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) } else { /* diag_listed_as: Malformed UTF-8 character%s */ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), - "%s %s%s", - unexpected_non_continuation_text(u - 1, 2, 1, 2), - PL_op ? " in " : "", - PL_op ? OP_DESC(PL_op) : ""); + "%s %s%s", + unexpected_non_continuation_text(u - 2, 2, 1, 2), + PL_op ? " in " : "", + PL_op ? OP_DESC(PL_op) : ""); return -2; } } else { @@ -1813,10 +2111,14 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) /* =for apidoc utf8_to_bytes -Converts a string C of length C from UTF-8 into native byte encoding. +Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding. Unlike L, this over-writes the original string, and -updates C to contain the new length. -Returns zero on failure, setting C to -1. +updates C<*lenp> to contain the new length. +Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1. + +Upon successful return, the number of variants in the string can be computed by +having saved the value of C<*lenp> before the call, and subtracting the +after-call value of C<*lenp> from it. If you need a copy of the string, see L. @@ -1824,107 +2126,207 @@ If you need a copy of the string, see L. */ U8 * -Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) +Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp) { - U8 * const save = s; - U8 * const send = s + *len; - U8 *d; + U8 * first_variant; PERL_ARGS_ASSERT_UTF8_TO_BYTES; PERL_UNUSED_CONTEXT; - /* ensure valid UTF-8 and chars < 256 before updating string */ - while (s < send) { - if (! UTF8_IS_INVARIANT(*s)) { - if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { - *len = ((STRLEN) -1); - return 0; + /* This is a no-op if no variants at all in the input */ + if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) { + return s; + } + + { + U8 * const save = s; + U8 * const send = s + *lenp; + U8 * d; + + /* Nothing before the first variant needs to be changed, so start the real + * work there */ + s = first_variant; + while (s < send) { + if (! UTF8_IS_INVARIANT(*s)) { + if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { + *lenp = ((STRLEN) -1); + return 0; + } + s++; } s++; } - s++; - } - d = s = save; - while (s < send) { - U8 c = *s++; - if (! UTF8_IS_INVARIANT(c)) { - /* Then it is two-byte encoded */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); - s++; - } - *d++ = c; + /* Is downgradable, so do it */ + d = s = first_variant; + while (s < send) { + U8 c = *s++; + if (! UVCHR_IS_INVARIANT(c)) { + /* Then it is two-byte encoded */ + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); + s++; + } + *d++ = c; + } + *d = '\0'; + *lenp = d - save; + + return save; } - *d = '\0'; - *len = d - save; - return save; } /* =for apidoc bytes_from_utf8 -Converts a string C of length C from UTF-8 into native byte encoding. -Unlike L but like L, returns a pointer to -the newly-created string, and updates C to contain the new -length. Returns the original string if no conversion occurs, C -is unchanged. Do nothing if C points to 0. Sets C to -0 if C is converted or consisted entirely of characters that are invariant -in UTF-8 (i.e., US-ASCII on non-EBCDIC machines). +Converts a potentially UTF-8 encoded string C of length C<*lenp> into native +byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C is +actually encoded in UTF-8. + +Unlike L but like L, this is non-destructive of +the input string. + +Do nothing if C<*is_utf8p> is 0, or if there are code points in the string +not expressible in native byte encoding. In these cases, C<*is_utf8p> and +C<*lenp> are unchanged, and the return value is the original C. + +Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a +newly created string containing a downgraded copy of C, and whose length is +returned in C<*lenp>, updated. The new string is C-terminated. + +Upon successful return, the number of variants in the string can be computed by +having saved the value of C<*lenp> before the call, and subtracting the +after-call value of C<*lenp> from it. =cut + +There is a macro that avoids this function call, but this is retained for +anyone who calls it with the Perl_ prefix */ + +U8 * +Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p) +{ + PERL_ARGS_ASSERT_BYTES_FROM_UTF8; + PERL_UNUSED_CONTEXT; + + return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL); +} + +/* +No = here because currently externally undocumented +for apidoc bytes_from_utf8_loc + +Like C()>, but takes an extra parameter, a pointer to where +to store the location of the first character in C<"s"> that cannot be +converted to non-UTF8. + +If that parameter is C, this function behaves identically to +C. + +Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to +C, except it also sets C<*first_non_downgradable> to C. + +Otherwise, the function returns a newly created C-terminated string +containing the non-UTF8 equivalent of the convertible first portion of +C<"s">. C<*lenp> is set to its length, not including the terminating C. +If the entire input string was converted, C<*is_utf8p> is set to a FALSE value, +and C<*first_non_downgradable> is set to C. + +Otherwise, C<*first_non_downgradable> set to point to the first byte of the +first character in the original string that wasn't converted. C<*is_utf8p> is +unchanged. Note that the new string may have length 0. + +Another way to look at it is, if C<*first_non_downgradable> is non-C and +C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and +converts as many characters in it as possible stopping at the first one it +finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is +set to point to that. The function returns the portion that could be converted +in a newly created C-terminated string, and C<*lenp> is set to its length, +not including the terminating C. If the very first character in the +original could not be converted, C<*lenp> will be 0, and the new string will +contain just a single C. If the entire input string was converted, +C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C. + +Upon successful return, the number of variants in the converted portion of the +string can be computed by having saved the value of C<*lenp> before the call, +and subtracting the after-call value of C<*lenp> from it. + +=cut + + */ U8 * -Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) +Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted) { U8 *d; - const U8 *start = s; - const U8 *send; - I32 count = 0; + const U8 *original = s; + U8 *converted_start; + const U8 *send = s + *lenp; - PERL_ARGS_ASSERT_BYTES_FROM_UTF8; - PERL_UNUSED_CONTEXT; - if (!*is_utf8) - return (U8 *)start; - - /* ensure valid UTF-8 and chars < 256 before converting string */ - for (send = s + *len; s < send;) { - if (! UTF8_IS_INVARIANT(*s)) { - if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) { - return (U8 *)start; + PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC; + + if (! *is_utf8p) { + if (first_unconverted) { + *first_unconverted = NULL; + } + + return (U8 *) original; + } + + Newx(d, (*lenp) + 1, U8); + + converted_start = d; + while (s < send) { + U8 c = *s++; + if (! UTF8_IS_INVARIANT(c)) { + + /* Then it is multi-byte encoded. If the code point is above 0xFF, + * have to stop now */ + if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) { + if (first_unconverted) { + *first_unconverted = s - 1; + goto finish_and_return; + } + else { + Safefree(converted_start); + return (U8 *) original; + } } - count++; + + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); s++; - } - s++; + } + *d++ = c; } - *is_utf8 = FALSE; - - Newx(d, (*len) - count + 1, U8); - s = start; start = d; - while (s < send) { - U8 c = *s++; - if (! UTF8_IS_INVARIANT(c)) { - /* Then it is two-byte encoded */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s); - s++; - } - *d++ = c; + /* Here, converted the whole of the input */ + *is_utf8p = FALSE; + if (first_unconverted) { + *first_unconverted = NULL; } + + finish_and_return: *d = '\0'; - *len = d - start; - return (U8 *)start; + *lenp = d - converted_start; + + /* Trim unused space */ + Renew(converted_start, *lenp + 1, U8); + + return converted_start; } /* =for apidoc bytes_to_utf8 -Converts a string C of length C bytes from the native encoding into +Converts a string C of length C<*lenp> bytes from the native encoding into UTF-8. -Returns a pointer to the newly-created string, and sets C to +Returns a pointer to the newly-created string, and sets C<*lenp> to reflect the new length in bytes. +Upon successful return, the number of variants in the string can be computed by +having saved the value of C<*lenp> before the call, and subtracting it from the +after-call value of C<*lenp>. + A C character will be written after the end of the string. If you want to convert to UTF-8 from encodings other than @@ -1934,36 +2336,47 @@ see L(). =cut */ -/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will - likewise need duplication. */ - U8* -Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) +Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) { - const U8 * const send = s + (*len); + const U8 * const send = s + (*lenp); U8 *d; U8 *dst; PERL_ARGS_ASSERT_BYTES_TO_UTF8; PERL_UNUSED_CONTEXT; - Newx(d, (*len) * 2 + 1, U8); + Newx(d, (*lenp) * 2 + 1, U8); dst = d; while (s < send) { append_utf8_from_native_byte(*s, &d); s++; } + *d = '\0'; - *len = d-dst; + *lenp = d-dst; + + /* Trim unused space */ + Renew(dst, *lenp + 1, U8); + return dst; } /* - * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. + * Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian), + * use utf16_to_utf8_reversed(). * - * Destination must be pre-extended to 3/2 source. Do not use in-place. - * We optimize for native, for obvious reasons. */ + * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes. + * UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes. + * UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes. + * + * These functions don't check for overflow. The worst case is every code + * point in the input is 2 bytes, and requires 4 bytes on output. (If the code + * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the + * destination must be pre-extended to 2 times the source length. + * + * Do not use in-place. We optimize for native, for obvious reasons. */ U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) @@ -1974,7 +2387,8 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) PERL_ARGS_ASSERT_UTF16_TO_UTF8; if (bytelen & 1) - Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen); + Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, + (UV)bytelen); pend = p + bytelen; @@ -1990,10 +2404,12 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv)); continue; } + #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST #define LAST_HIGH_SURROGATE 0xDBFF #define FIRST_LOW_SURROGATE 0xDC00 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST +#define FIRST_IN_PLANE1 0x10000 /* This assumes that most uses will be in the first Unicode plane, not * needing surrogates */ @@ -2012,13 +2428,13 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) } p += 2; uv = ((uv - FIRST_HIGH_SURROGATE) << 10) - + (low - FIRST_LOW_SURROGATE) + 0x10000; + + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1; } } #ifdef EBCDIC d = uvoffuni_to_utf8_flags(d, uv, 0); #else - if (uv < 0x10000) { + if (uv < FIRST_IN_PLANE1) { *d++ = (U8)(( uv >> 12) | 0xe0); *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); *d++ = (U8)(( uv & 0x3f) | 0x80); @@ -2065,7 +2481,7 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO(classnum, tmpbuf); + return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf)); } /* Internal function so we can deprecate the external one, and call @@ -2086,7 +2502,7 @@ Perl__is_uni_perl_idcont(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idcont(tmpbuf); + return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); } bool @@ -2094,11 +2510,12 @@ Perl__is_uni_perl_idstart(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idstart(tmpbuf); + return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); } UV -Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s) +Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, + const char S_or_s) { /* We have the latin1-range values compiled into the core, so just use * those, converting the result to UTF-8. The only difference between upper @@ -2140,7 +2557,9 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ return 'S'; #endif default: - Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); + Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect" + " '%c' to map to '%c'", + c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS); NOT_REACHED; /* NOTREACHED */ } } @@ -2159,15 +2578,20 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_ * needs to have space for UTF8_MAXBYTES_CASE+1 bytes * LENP will be set to the length in bytes of the string of changed characters * - * The functions return the ordinal of the first character in the string of OUTP */ -#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "") -#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "") -#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "") + * The functions return the ordinal of the first character in the string of + * OUTP */ +#define CALL_UPPER_CASE(uv, s, d, lenp) \ + _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "") +#define CALL_TITLE_CASE(uv, s, d, lenp) \ + _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "") +#define CALL_LOWER_CASE(uv, s, d, lenp) \ + _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "") /* This additionally has the input parameter 'specials', which if non-zero will * cause this to use the specials hash for folding (meaning get full case * folding); otherwise, when zero, this implies a simple case fold */ -#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL) +#define CALL_FOLD_CASE(uv, s, d, lenp, specials) \ +_to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) @@ -2204,7 +2628,7 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) } STATIC U8 -S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp) +S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy) { /* We have the latin1-range values compiled into the core, so just use * those, converting the result to UTF-8. Since the result is always just @@ -2212,6 +2636,8 @@ S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp) U8 converted = toLOWER_LATIN1(c); + PERL_UNUSED_ARG(dummy); + if (p != NULL) { if (NATIVE_BYTE_IS_INVARIANT(converted)) { *p = converted; @@ -2234,7 +2660,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) PERL_ARGS_ASSERT_TO_UNI_LOWER; if (c < 256) { - return to_lower_latin1((U8) c, p, lenp); + return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ ); } uvchr_to_utf8(p, c); @@ -2242,7 +2668,8 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) } UV -Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) +Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, + const unsigned int flags) { /* Corresponds to to_lower_latin1(); bits meanings: * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited @@ -2337,13 +2764,14 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) uvchr_to_utf8(p, c); return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL); } - else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with + else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with the special flags. */ U8 utf8_c[UTF8_MAXBYTES + 1]; needs_full_generality: uvchr_to_utf8(utf8_c, c); - return _to_utf8_fold_flags(utf8_c, p, lenp, flags); + return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c), + p, lenp, flags); } } @@ -2372,16 +2800,47 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, * character without reading beyond the end, and pass that number on to the * validating routine */ if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) { - if (ckWARN_d(WARN_UTF8)) { - Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8), - "Passing malformed UTF-8 to \"%s\" is deprecated", swashname); - if (ckWARN(WARN_UTF8)) { /* This will output details as to the - what the malformation is */ - utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL); - } - } - return FALSE; + _force_out_malformed_utf8_message(p, p + UTF8SKIP(p), + _UTF8_NO_CONFIDENCE_IN_CURLEN, + 1 /* Die */ ); + NOT_REACHED; /* NOTREACHED */ + } + + if (!*swash) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + *swash = _core_swash_init("utf8", + + /* Only use the name if there is no inversion + * list; otherwise will go out to disk */ + (invlist) ? "" : swashname, + + &PL_sv_undef, 1, 0, invlist, &flags); + } + + return swash_fetch(*swash, p, TRUE) != 0; +} + +PERL_STATIC_INLINE bool +S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, + SV **swash, const char *const swashname, + SV* const invlist) +{ + /* returns a boolean giving whether or not the UTF8-encoded character that + * starts at

, and extending no further than is in the swash + * indicated by . contains a pointer to where the swash + * indicated by is to be stored; which this routine will do, so + * that future calls will look at <*swash> and only generate a swash if it + * is not null. is NULL or an inversion list that defines the + * swash. If not null, it saves time during initialization of the swash. + */ + + PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN; + + if (! isUTF8_CHAR(p, e)) { + _force_out_malformed_utf8_message(p, e, 0, 1); + NOT_REACHED; /* NOTREACHED */ } + if (!*swash) { U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; *swash = _core_swash_init("utf8", @@ -2396,30 +2855,166 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, return swash_fetch(*swash, p, TRUE) != 0; } +STATIC void +S_warn_on_first_deprecated_use(pTHX_ const char * const name, + const char * const alternative, + const bool use_locale, + const char * const file, + const unsigned line) +{ + const char * key; + + PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE; + + if (ckWARN_d(WARN_DEPRECATED)) { + + key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line); + if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) { + if (! PL_seen_deprecated_macro) { + PL_seen_deprecated_macro = newHV(); + } + if (! hv_store(PL_seen_deprecated_macro, key, + strlen(key), &PL_sv_undef, 0)) + { + Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); + } + + if (instr(file, "mathoms.c")) { + Perl_warner(aTHX_ WARN_DEPRECATED, + "In %s, line %d, starting in Perl v5.30, %s()" + " will be removed. Avoid this message by" + " converting to use %s().\n", + file, line, name, alternative); + } + else { + Perl_warner(aTHX_ WARN_DEPRECATED, + "In %s, line %d, starting in Perl v5.30, %s() will" + " require an additional parameter. Avoid this" + " message by converting to use %s().\n", + file, line, name, alternative); + } + } + } +} + bool -Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p) +Perl__is_utf8_FOO(pTHX_ U8 classnum, + const U8 * const p, + const char * const name, + const char * const alternative, + const bool use_utf8, + const bool use_locale, + const char * const file, + const unsigned line) { PERL_ARGS_ASSERT__IS_UTF8_FOO; + warn_on_first_deprecated_use(name, alternative, use_locale, file, line); + + if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) { + + switch (classnum) { + case _CC_WORDCHAR: + case _CC_DIGIT: + case _CC_ALPHA: + case _CC_LOWER: + case _CC_UPPER: + case _CC_PUNCT: + case _CC_PRINT: + case _CC_ALPHANUMERIC: + case _CC_GRAPH: + case _CC_CASED: + + return is_utf8_common(p, + &PL_utf8_swash_ptrs[classnum], + swash_property_names[classnum], + PL_XPosix_ptrs[classnum]); + + case _CC_SPACE: + return is_XPERLSPACE_high(p); + case _CC_BLANK: + return is_HORIZWS_high(p); + case _CC_XDIGIT: + return is_XDIGIT_high(p); + case _CC_CNTRL: + return 0; + case _CC_ASCII: + return 0; + case _CC_VERTSPACE: + return is_VERTWS_high(p); + case _CC_IDFIRST: + if (! PL_utf8_perl_idstart) { + PL_utf8_perl_idstart + = _new_invlist_C_array(_Perl_IDStart_invlist); + } + return is_utf8_common(p, &PL_utf8_perl_idstart, + "_Perl_IDStart", NULL); + case _CC_IDCONT: + if (! PL_utf8_perl_idcont) { + PL_utf8_perl_idcont + = _new_invlist_C_array(_Perl_IDCont_invlist); + } + return is_utf8_common(p, &PL_utf8_perl_idcont, + "_Perl_IDCont", NULL); + } + } + + /* idcont is the same as wordchar below 256 */ + if (classnum == _CC_IDCONT) { + classnum = _CC_WORDCHAR; + } + else if (classnum == _CC_IDFIRST) { + if (*p == '_') { + return TRUE; + } + classnum = _CC_ALPHA; + } + + if (! use_locale) { + if (! use_utf8 || UTF8_IS_INVARIANT(*p)) { + return _generic_isCC(*p, classnum); + } + + return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum); + } + else { + if (! use_utf8 || UTF8_IS_INVARIANT(*p)) { + return isFOO_lc(classnum, *p); + } + + return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 ))); + } + + NOT_REACHED; /* NOTREACHED */ +} + +bool +Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p, + const U8 * const e) +{ + PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN; + assert(classnum < _FIRST_NON_SWASH_CC); - return is_utf8_common(p, - &PL_utf8_swash_ptrs[classnum], - swash_property_names[classnum], - PL_XPosix_ptrs[classnum]); + return is_utf8_common_with_len(p, + e, + &PL_utf8_swash_ptrs[classnum], + swash_property_names[classnum], + PL_XPosix_ptrs[classnum]); } bool -Perl__is_utf8_perl_idstart(pTHX_ const U8 *p) +Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e) { SV* invlist = NULL; - PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART; + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN; if (! PL_utf8_perl_idstart) { invlist = _new_invlist_C_array(_Perl_IDStart_invlist); } - return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist); + return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart, + "_Perl_IDStart", invlist); } bool @@ -2433,16 +3028,17 @@ Perl__is_utf8_xidstart(pTHX_ const U8 *p) } bool -Perl__is_utf8_perl_idcont(pTHX_ const U8 *p) +Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e) { SV* invlist = NULL; - PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT; + PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN; if (! PL_utf8_perl_idcont) { invlist = _new_invlist_C_array(_Perl_IDCont_invlist); } - return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist); + return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont, + "_Perl_IDCont", invlist); } bool @@ -2469,50 +3065,6 @@ Perl__is_utf8_mark(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL); } -/* -=for apidoc to_utf8_case - -Instead use the appropriate one of L, -L, -L, -or L. - -C

contains the pointer to the UTF-8 string encoding -the character that is being converted. This routine assumes that the character -at C

is well-formed. - -C is a pointer to the character buffer to put the -conversion result to. C is a pointer to the length -of the result. - -C is a pointer to the swash to use. - -Both the special and normal mappings are stored in F, -and loaded by C, using F. C (usually, -but not always, a multicharacter mapping), is tried first. - -C is a string, normally C or C<"">. C means to not use -any special mappings; C<""> means to use the special mappings. Values other -than these two are treated as the name of the hash containing the special -mappings, like C<"utf8::ToSpecLower">. - -C is a string like C<"ToLower"> which means the swash -C<%utf8::ToLower>. - -Code points above the platform's C will raise a deprecation warning, -unless those are turned off. - -=cut */ - -UV -Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, - SV **swashp, const char *normal, const char *special) -{ - PERL_ARGS_ASSERT_TO_UTF8_CASE; - - return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special); -} - /* change namve uv1 to 'from' */ STATIC UV S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, @@ -2574,7 +3126,8 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, if (ckWARN_d(WARN_SURROGATE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; Perl_warner(aTHX_ packWARN(WARN_SURROGATE), - "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04" UVXf, desc, uv1); + "Operation \"%s\" returns its argument for" + " UTF-16 surrogate U+%04" UVXf, desc, uv1); } goto cases_to_self; } @@ -2587,16 +3140,15 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, } if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { - if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv1, + MAX_EXTERNALLY_LEGAL_CP); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), - "Operation \"%s\" returns its argument for non-Unicode code point 0x%04" UVXf, desc, uv1); + "Operation \"%s\" returns its argument for" + " non-Unicode code point 0x%04" UVXf, desc, uv1); } goto cases_to_self; } @@ -2605,8 +3157,8 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C)) { - /* As of this writing, this means we avoid swash creation - * for anything beyond low Plane 1 */ + /* As of Unicode 10.0, this means we avoid swash creation + * for anything beyond high Plane 1 (below emojis) */ goto cases_to_self; } #endif @@ -2619,7 +3171,8 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, } if (!*swashp) /* load on-demand */ - *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL); + *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, + 4, 0, NULL, NULL); if (special) { /* It might be "special" (sometimes, but not always, @@ -2687,7 +3240,8 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, } STATIC UV -S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) +S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, + U8* const ustrp, STRLEN *lenp) { /* This is called when changing the case of a UTF-8-encoded character above * the Latin1 range, and the operation is in a non-UTF-8 locale. If the @@ -2698,7 +3252,8 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c * p points to the original string whose case was changed; assumed * by this routine to be well-formed * result the code point of the first character in the changed-case string - * ustrp points to the changed-case string ( represents its first char) + * ustrp points to the changed-case string ( represents its + * first char) * lenp points to the length of */ UV original; /* To store the first code point of

*/ @@ -2734,8 +3289,8 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), - "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8 locale; " - "resolved to \"\\x{%" UVXf "}\".", + "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8" + " locale; resolved to \"\\x{%" UVXf "}\".", OP_DESC(PL_op), original, original); @@ -2743,10 +3298,177 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c return original; } +STATIC U32 +S_check_and_deprecate(pTHX_ const U8 *p, + const U8 **e, + const unsigned int type, /* See below */ + const bool use_locale, /* Is this a 'LC_' + macro call? */ + const char * const file, + const unsigned line) +{ + /* This is a temporary function to deprecate the unsafe calls to the case + * changing macros and functions. It keeps all the special stuff in just + * one place. + * + * It updates *e with the pointer to the end of the input string. If using + * the old-style macros, *e is NULL on input, and so this function assumes + * the input string is long enough to hold the entire UTF-8 sequence, and + * sets *e accordingly, but it then returns a flag to pass the + * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid + * using the full length if possible. + * + * It also does the assert that *e > p when *e is not NULL. This should be + * migrated to the callers when this function gets deleted. + * + * The 'type' parameter is used for the caller to specify which case + * changing function this is called from: */ + +# define DEPRECATE_TO_UPPER 0 +# define DEPRECATE_TO_TITLE 1 +# define DEPRECATE_TO_LOWER 2 +# define DEPRECATE_TO_FOLD 3 + + U32 utf8n_flags = 0; + const char * name; + const char * alternative; + + PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE; + + if (*e == NULL) { + utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN; + *e = p + UTF8SKIP(p); + + /* For mathoms.c calls, we use the function name we know is stored + * there. It could be part of a larger path */ + if (type == DEPRECATE_TO_UPPER) { + name = instr(file, "mathoms.c") + ? "to_utf8_upper" + : "toUPPER_utf8"; + alternative = "toUPPER_utf8_safe"; + } + else if (type == DEPRECATE_TO_TITLE) { + name = instr(file, "mathoms.c") + ? "to_utf8_title" + : "toTITLE_utf8"; + alternative = "toTITLE_utf8_safe"; + } + else if (type == DEPRECATE_TO_LOWER) { + name = instr(file, "mathoms.c") + ? "to_utf8_lower" + : "toLOWER_utf8"; + alternative = "toLOWER_utf8_safe"; + } + else if (type == DEPRECATE_TO_FOLD) { + name = instr(file, "mathoms.c") + ? "to_utf8_fold" + : "toFOLD_utf8"; + alternative = "toFOLD_utf8_safe"; + } + else Perl_croak(aTHX_ "panic: Unexpected case change type"); + + warn_on_first_deprecated_use(name, alternative, use_locale, file, line); + } + else { + assert (p < *e); + } + + return utf8n_flags; +} + +/* The process for changing the case is essentially the same for the four case + * change types, except there are complications for folding. Otherwise the + * difference is only which case to change to. To make sure that they all do + * the same thing, the bodies of the functions are extracted out into the + * following two macros. The functions are written with the same variable + * names, and these are known and used inside these macros. It would be + * better, of course, to have inline functions to do it, but since different + * macros are called, depending on which case is being changed to, this is not + * feasible in C (to khw's knowledge). Two macros are created so that the fold + * function can start with the common start macro, then finish with its special + * handling; while the other three cases can just use the common end macro. + * + * The algorithm is to use the proper (passed in) macro or function to change + * the case for code points that are below 256. The macro is used if using + * locale rules for the case change; the function if not. If the code point is + * above 255, it is computed from the input UTF-8, and another macro is called + * to do the conversion. If necessary, the output is converted to UTF-8. If + * using a locale, we have to check that the change did not cross the 255/256 + * boundary, see check_locale_boundary_crossing() for further details. + * + * The macros are split with the correct case change for the below-256 case + * stored into 'result', and in the middle of an else clause for the above-255 + * case. At that point in the 'else', 'result' is not the final result, but is + * the input code point calculated from the UTF-8. The fold code needs to + * realize all this and take it from there. + * + * If you read the two macros as sequential, it's easier to understand what's + * going on. */ +#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \ + L1_func_extra_param) \ + \ + if (flags & (locale_flags)) { \ + /* Treat a UTF-8 locale as not being in locale at all */ \ + if (IN_UTF8_CTYPE_LOCALE) { \ + flags &= ~(locale_flags); \ + } \ + else { \ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ + } \ + } \ + \ + if (UTF8_IS_INVARIANT(*p)) { \ + if (flags & (locale_flags)) { \ + result = LC_L1_change_macro(*p); \ + } \ + else { \ + return L1_func(*p, ustrp, lenp, L1_func_extra_param); \ + } \ + } \ + else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \ + if (flags & (locale_flags)) { \ + result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \ + *(p+1))); \ + } \ + else { \ + return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \ + ustrp, lenp, L1_func_extra_param); \ + } \ + } \ + else { /* malformed UTF-8 or ord above 255 */ \ + STRLEN len_result; \ + result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \ + if (len_result == (STRLEN) -1) { \ + _force_out_malformed_utf8_message(p, e, utf8n_flags, \ + 1 /* Die */ ); \ + } + +#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \ + result = change_macro(result, p, ustrp, lenp); \ + \ + if (flags & (locale_flags)) { \ + result = check_locale_boundary_crossing(p, result, ustrp, lenp); \ + } \ + return result; \ + } \ + \ + /* Here, used locale rules. Convert back to UTF-8 */ \ + if (UTF8_IS_INVARIANT(result)) { \ + *ustrp = (U8) result; \ + *lenp = 1; \ + } \ + else { \ + *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \ + *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \ + *lenp = 2; \ + } \ + \ + return result; + /* =for apidoc to_utf8_upper -Instead use L. +Instead use L. =cut */ @@ -2755,67 +3477,30 @@ Instead use L. * be used. */ UV -Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) +Perl__to_utf8_upper_flags(pTHX_ const U8 *p, + const U8 *e, + U8* ustrp, + STRLEN *lenp, + bool flags, + const char * const file, + const int line) { UV result; + const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER, + cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS; - if (flags) { - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; - } - else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - } - } - - if (UTF8_IS_INVARIANT(*p)) { - if (flags) { - result = toUPPER_LC(*p); - } - else { - return _to_upper_title_latin1(*p, ustrp, lenp, 'S'); - } - } - else if UTF8_IS_DOWNGRADEABLE_START(*p) { - if (flags) { - U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); - result = toUPPER_LC(c); - } - else { - return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), - ustrp, lenp, 'S'); - } - } - else { /* UTF-8, ord above 255 */ - result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); - - if (flags) { - result = check_locale_boundary_crossing(p, result, ustrp, lenp); - } - return result; - } - - /* Here, used locale rules. Convert back to UTF-8 */ - if (UTF8_IS_INVARIANT(result)) { - *ustrp = (U8) result; - *lenp = 1; - } - else { - *ustrp = UTF8_EIGHT_BIT_HI((U8) result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); - *lenp = 2; - } - - return result; + /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */ + /* 2nd char of uc(U+DF) is 'S' */ + CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S'); + CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE); } /* =for apidoc to_utf8_title -Instead use L. +Instead use L. =cut */ @@ -2826,67 +3511,29 @@ Instead use L. */ UV -Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) +Perl__to_utf8_title_flags(pTHX_ const U8 *p, + const U8 *e, + U8* ustrp, + STRLEN *lenp, + bool flags, + const char * const file, + const int line) { UV result; + const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE, + cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; - if (flags) { - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; - } - else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - } - } - - if (UTF8_IS_INVARIANT(*p)) { - if (flags) { - result = toUPPER_LC(*p); - } - else { - return _to_upper_title_latin1(*p, ustrp, lenp, 's'); - } - } - else if UTF8_IS_DOWNGRADEABLE_START(*p) { - if (flags) { - U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); - result = toUPPER_LC(c); - } - else { - return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), - ustrp, lenp, 's'); - } - } - else { /* UTF-8, ord above 255 */ - result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); - - if (flags) { - result = check_locale_boundary_crossing(p, result, ustrp, lenp); - } - return result; - } - - /* Here, used locale rules. Convert back to UTF-8 */ - if (UTF8_IS_INVARIANT(result)) { - *ustrp = (U8) result; - *lenp = 1; - } - else { - *ustrp = UTF8_EIGHT_BIT_HI((U8) result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); - *lenp = 2; - } - - return result; + /* 2nd char of ucfirst(U+DF) is 's' */ + CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's'); + CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE); } /* =for apidoc to_utf8_lower -Instead use L. +Instead use L. =cut */ @@ -2896,68 +3543,28 @@ Instead use L. */ UV -Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags) +Perl__to_utf8_lower_flags(pTHX_ const U8 *p, + const U8 *e, + U8* ustrp, + STRLEN *lenp, + bool flags, + const char * const file, + const int line) { UV result; + const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER, + cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; - if (flags) { - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags = FALSE; - } - else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - } - } - - if (UTF8_IS_INVARIANT(*p)) { - if (flags) { - result = toLOWER_LC(*p); - } - else { - return to_lower_latin1(*p, ustrp, lenp); - } - } - else if UTF8_IS_DOWNGRADEABLE_START(*p) { - if (flags) { - U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); - result = toLOWER_LC(c); - } - else { - return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), - ustrp, lenp); - } - } - else { /* UTF-8, ord above 255 */ - result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp); - - if (flags) { - result = check_locale_boundary_crossing(p, result, ustrp, lenp); - } - - return result; - } - - /* Here, used locale rules. Convert back to UTF-8 */ - if (UTF8_IS_INVARIANT(result)) { - *ustrp = (U8) result; - *lenp = 1; - } - else { - *ustrp = UTF8_EIGHT_BIT_HI((U8) result); - *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); - *lenp = 2; - } - - return result; + CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */) + CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE) } /* =for apidoc to_utf8_fold -Instead use L. +Instead use L. =cut */ @@ -2972,9 +3579,17 @@ Instead use L. */ UV -Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) +Perl__to_utf8_fold_flags(pTHX_ const U8 *p, + const U8 *e, + U8* ustrp, + STRLEN *lenp, + U8 flags, + const char * const file, + const int line) { UV result; + const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD, + cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS; @@ -2983,53 +3598,20 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) assert(p != ustrp); /* Otherwise overwrites */ - if (flags & FOLD_FLAGS_LOCALE) { - /* Treat a UTF-8 locale as not being in locale at all */ - if (IN_UTF8_CTYPE_LOCALE) { - flags &= ~FOLD_FLAGS_LOCALE; - } - else { - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - } - } + CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1, + ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII))); - if (UTF8_IS_INVARIANT(*p)) { - if (flags & FOLD_FLAGS_LOCALE) { - result = toFOLD_LC(*p); - } - else { - return _to_fold_latin1(*p, ustrp, lenp, - flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); - } - } - else if UTF8_IS_DOWNGRADEABLE_START(*p) { - if (flags & FOLD_FLAGS_LOCALE) { - U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); - result = toFOLD_LC(c); - } - else { - return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), - ustrp, lenp, - flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)); - } - } - else { /* UTF-8, ord above 255 */ - result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL); + result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL); if (flags & FOLD_FLAGS_LOCALE) { # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 - const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1; - # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 - const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1; - /* Special case these two characters, as what normally gets * returned under locale doesn't work */ - if (UTF8SKIP(p) == cap_sharp_s_len - && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len)) + if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -3039,8 +3621,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) } else #endif - if (UTF8SKIP(p) == long_s_t_len - && memEQ((char *) p, LONG_S_T, long_s_t_len)) + if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -3059,9 +3640,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) * 255/256 boundary which is forbidden under /l, and so the code * wouldn't catch that they are equivalent (which they are only in * this release) */ - else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1 - && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1)) - { + else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " @@ -3175,7 +3754,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags) */ SV* -Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none) +Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, + I32 minbits, I32 none) { PERL_ARGS_ASSERT_SWASH_INIT; @@ -3183,11 +3763,14 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits * public interface, and returning a copy prevents others from doing * mischief on the original */ - return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL)); + return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, + NULL, NULL)); } SV* -Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p) +Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, + I32 minbits, I32 none, SV* invlist, + U8* const flags_p) { /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST @@ -3254,9 +3837,9 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m assert(listsv != &PL_sv_undef || strNE(name, "") || invlist); assert(! invlist || minbits == 1); - PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex - that triggered the swash init and the swash init perl logic itself. - See perl #122747 */ + PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the + regex that triggered the swash init and the swash init + perl logic itself. See perl #122747 */ /* If data was passed in to go out to utf8_heavy to find the swash of, do * so */ @@ -4164,8 +4747,8 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* Must have at least 8 bits to get the mappings */ if (bits != 8 && bits != 16 && bits != 32) { - Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %" UVuf, - (UV)bits); + Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %" + UVuf, (UV)bits); } if (specials_p) { /* It might be "special" (sometimes, but not always, a @@ -4239,20 +4822,22 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) while ((from_list = (AV *) hv_iternextsv(specials_inverse, &char_to, &to_len))) { - if (av_tindex_nomg(from_list) > 0) { + if (av_tindex_skip_len_mg(from_list) > 0) { SSize_t i; /* We iterate over all combinations of i,j to place each code * point on each list */ - for (i = 0; i <= av_tindex_nomg(from_list); i++) { + for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) { SSize_t j; AV* i_list = newAV(); SV** entryp = av_fetch(from_list, i, FALSE); if (entryp == NULL) { - Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); + Perl_croak(aTHX_ "panic: av_fetch() unexpectedly" + " failed"); } if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) { - Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp)); + Perl_croak(aTHX_ "panic: unexpected entry for %s", + SvPVX(*entryp)); } if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp), (SV*) i_list, FALSE)) @@ -4261,7 +4846,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) } /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ - for (j = 0; j <= av_tindex_nomg(from_list); j++) { + for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) { entryp = av_fetch(from_list, j, FALSE); if (entryp == NULL) { Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); @@ -4337,7 +4922,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash) /* Look through list to see if this inverse mapping already is * listed, or if there is a mapping to itself already */ - for (i = 0; i <= av_tindex_nomg(list); i++) { + for (i = 0; i <= av_tindex_skip_len_mg(list); i++) { SV** entryp = av_fetch(list, i, FALSE); SV* entry; UV uv; @@ -4458,32 +5043,39 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) /* The first number is a count of the rest */ l++; if (!grok_atoUV((const char *)l, &elements, &after_atou)) { - Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list"); + Perl_croak(aTHX_ "panic: Expecting a valid count of elements" + " at start of inversion list"); } if (elements == 0) { invlist = _new_invlist(0); } else { - while (isSPACE(*l)) l++; l = (U8 *) after_atou; - /* Get the 0th element, which is needed to setup the inversion list */ + /* Get the 0th element, which is needed to setup the inversion list + * */ while (isSPACE(*l)) l++; if (!grok_atoUV((const char *)l, &element0, &after_atou)) { - Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list"); + Perl_croak(aTHX_ "panic: Expecting a valid 0th element for" + " inversion list"); } l = (U8 *) after_atou; - invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr); + invlist = _setup_canned_invlist(elements, element0, + &other_elements_ptr); elements--; /* Then just populate the rest of the input */ while (elements-- > 0) { if (l > lend) { - Perl_croak(aTHX_ "panic: Expecting %" UVuf " more elements than available", elements); + Perl_croak(aTHX_ "panic: Expecting %" UVuf " more" + " elements than available", elements); } while (isSPACE(*l)) l++; - if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) { - Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list"); + if (!grok_atoUV((const char *)l, other_elements_ptr++, + &after_atou)) + { + Perl_croak(aTHX_ "panic: Expecting a valid element" + " in inversion list"); } l = (U8 *) after_atou; } @@ -4495,7 +5087,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) * size based on worst possible case, which is each line in the input * creates 2 elements in the inversion list: 1) the beginning of a * range in the list; 2) the beginning of a range not in the list. */ - while ((loc = (strchr(loc, '\n'))) != NULL) { + while ((loc = (char *) memchr(loc, '\n', lend - (U8 *) loc)) != NULL) { elements += 2; loc++; } @@ -4638,12 +5230,10 @@ bool Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) { /* May change: warns if surrogates, non-character code points, or - * non-Unicode code points are in s which has length len bytes. Returns - * TRUE if none found; FALSE otherwise. The only other validity check is - * to make sure that this won't exceed the string's length. - * - * Code points above the platform's C will raise a deprecation - * warning, unless those are turned off. */ + * non-Unicode code points are in 's' which has length 'len' bytes. + * Returns TRUE if none found; FALSE otherwise. The only other validity + * check is to make sure that this won't exceed the string's length nor + * overflow */ const U8* const e = s + len; bool ok = TRUE; @@ -4659,22 +5249,10 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) { if (UNLIKELY(UTF8_IS_SUPER(s, e))) { if ( ckWARN_d(WARN_NON_UNICODE) - || ( ckWARN_d(WARN_DEPRECATED) -#ifndef UV_IS_QUAD - && UNLIKELY(is_utf8_cp_above_31_bits(s, e)) -#else /* Below is 64-bit words */ - /* 2**63 and up meet these conditions provided we have - * a 64-bit word. */ -# ifdef EBCDIC - && *s == 0xFE - && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8 -# else - && *s == 0xFF - /* s[1] being above 0x80 overflows */ - && s[2] >= 0x88 -# endif -#endif - )) { + || UNLIKELY(0 < does_utf8_overflow(s, s + len, + 0 /* Don't consider overlongs */ + ))) + { /* A side effect of this function will be to warn */ (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER); ok = FALSE; @@ -4687,11 +5265,14 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) * do for the non-chars and above-unicodes */ UV uv = utf8_to_uvchr_buf(s, e, NULL); Perl_warner(aTHX_ packWARN(WARN_SURROGATE), - "Unicode surrogate U+%04" UVXf " is illegal in UTF-8", uv); + "Unicode surrogate U+%04" UVXf " is illegal in UTF-8", + uv); ok = FALSE; } } - else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) { + else if ( UNLIKELY(UTF8_IS_NONCHAR(s, e)) + && (ckWARN_d(WARN_NONCHAR))) + { /* A side effect of this function will be to warn */ (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR); ok = FALSE; @@ -4723,7 +5304,8 @@ See also L. =cut */ char * -Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags) +Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, + UV flags) { int truncated = 0; const char *s, *e; @@ -4812,28 +5394,28 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) /* =for apidoc foldEQ_utf8 -Returns true if the leading portions of the strings C and C (either or both -of which may be in UTF-8) are the same case-insensitively; false otherwise. -How far into the strings to compare is determined by other input parameters. +Returns true if the leading portions of the strings C and C (either or +both of which may be in UTF-8) are the same case-insensitively; false +otherwise. How far into the strings to compare is determined by other input +parameters. If C is true, the string C is assumed to be in UTF-8-encoded Unicode; -otherwise it is assumed to be in native 8-bit encoding. Correspondingly for C -with respect to C. - -If the byte length C is non-zero, it says how far into C to check for fold -equality. In other words, C+C will be used as a goal to reach. The -scan will not be considered to be a match unless the goal is reached, and -scanning won't continue past that goal. Correspondingly for C with respect to -C. - -If C is non-C and the pointer it points to is not C, that pointer is -considered an end pointer to the position 1 byte past the maximum point -in C beyond which scanning will not continue under any circumstances. +otherwise it is assumed to be in native 8-bit encoding. Correspondingly for +C with respect to C. + +If the byte length C is non-zero, it says how far into C to check for +fold equality. In other words, C+C will be used as a goal to reach. +The scan will not be considered to be a match unless the goal is reached, and +scanning won't continue past that goal. Correspondingly for C with respect +to C. + +If C is non-C and the pointer it points to is not C, that +pointer is considered an end pointer to the position 1 byte past the maximum +point in C beyond which scanning will not continue under any circumstances. (This routine assumes that UTF-8 encoded input strings are not malformed; -malformed input can cause it to read past C). -This means that if both C and C are specified, and C -is less than C+C, the match will never be successful because it can -never +malformed input can cause it to read past C). This means that if both +C and C are specified, and C is less than C+C, the match +will never be successful because it can never get as far as its goal (and in fact is asserted against). Correspondingly for C with respect to C. @@ -4875,7 +5457,9 @@ L (Case Mappings). * FOLDEQ_S2_FOLDS_SANE */ I32 -Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags) +Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, + const char *s2, char **pe2, UV l2, bool u2, + U32 flags) { const U8 *p1 = (const U8*)s1; /* Point to current char */ const U8 *p2 = (const U8*)s2; @@ -4989,7 +5573,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c *foldbuf1 = toFOLD(*p1); } else if (u1) { - _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder); + _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder); } else { /* Not UTF-8, get UTF-8 fold */ _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder); @@ -5013,7 +5597,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c *foldbuf2 = toFOLD(*p2); } else if (u2) { - _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder); + _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder); } else { _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);