X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/c285bbc4a6321e4e787d0fac9f34c354c7647256..91ca80c3dda3ce14add7bd63f815556667be2fd4:/utf8.c diff --git a/utf8.c b/utf8.c index b55c931..3123bd0 100644 --- a/utf8.c +++ b/utf8.c @@ -37,9 +37,10 @@ static const char malformed_text[] = "Malformed UTF-8 character"; static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; static const char cp_above_legal_max[] = - "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28"; + "Use of code point 0x%" UVXf " is not allowed; the" + " permissible max is 0x%" UVXf; -#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX)) +#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX)) /* =head1 Unicode Support @@ -207,11 +208,8 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags) * performance hit on these high EBCDIC code points. */ if (UNLIKELY(UNICODE_IS_SUPER(uv))) { - if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP); } if ( (flags & UNICODE_WARN_SUPER) || ( (flags & UNICODE_WARN_PERL_EXTENDED) @@ -295,9 +293,8 @@ is the recommended wide native character-aware way of saying *(d++) = uv; -This function accepts any UV as input, but very high code points (above -C on the platform) will raise a deprecation warning. This is -typically 0x7FFF_FFFF in a 32-bit word. +This function accepts any code point from 0..C as input. +C is typically 0x7FFF_FFFF in a 32-bit word. It is possible to forbid or warn on non-Unicode code points, or those that may be problematic by using L. @@ -332,9 +329,8 @@ This is the Unicode-aware way of saying *(d++) = uv; -If C is 0, this function accepts any UV as input, but very high code -points (above C for the platform) will raise a deprecation warning. -This is typically 0x7FFF_FFFF in a 32-bit word. +If C is 0, this function accepts any code point from 0..C as +input. C is typically 0x7FFF_FFFF in a 32-bit word. Specifying C can further restrict what is allowed and not warned on, as follows: @@ -399,8 +395,10 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) #ifndef UV_IS_QUAD -PERL_STATIC_INLINE bool -S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) +STATIC int +S_is_utf8_cp_above_31_bits(const U8 * const s, + const U8 * const e, + const bool consider_overlongs) { /* Returns TRUE if the first code point represented by the Perl-extended- * UTF-8-encoded string starting at 's', and looking no further than 'e - @@ -412,72 +410,147 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e) * the final ones necessary for the complete representation may be beyond * 'e - 1'. * - * The function assumes that the sequence is well-formed UTF-8 as far as it - * goes, and is for a UTF-8 variant code point. If the sequence is - * incomplete, the function returns FALSE if there is any well-formed - * UTF-8 byte sequence that can complete it in such a way that a code point - * < 2**31 is produced; otherwise it returns TRUE. - * - * Getting this exactly right is slightly tricky, and has to be done in - * several places in this file, so is centralized here. It is based on the - * following table: + * The function also can handle the case where the input is an overlong + * sequence. If 'consider_overlongs' is 0, the function assumes the + * input is not overlong, without checking, and will return based on that + * assumption. If this parameter is 1, the function will go to the trouble + * of figuring out if it actually evaluates to above or below 31 bits. * - * U+7FFFFFFF (2 ** 31 - 1) - * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF - * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73 - * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72 - * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75 - * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF - * U+80000000 (2 ** 31): - * ASCII: \xFE\x82\x80\x80\x80\x80\x80 - * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13 - * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 - * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0 + * The sequence is otherwise assumed to be well-formed, without checking. */ + const STRLEN len = e - s; + int is_overlong; + + PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS; + + assert(! UTF8_IS_INVARIANT(*s) && e > s); + #ifdef EBCDIC - /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */ - const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42"; - const STRLEN prefix_len = sizeof(prefix) - 1; - const STRLEN len = e - s; - const STRLEN cmp_len = MIN(prefix_len, len - 1); + PERL_UNUSED_ARG(consider_overlongs); + + /* On the EBCDIC code pages we handle, only the native start byte 0xFE can + * mean a 32-bit or larger code point (0xFF is an invariant). 0xFE can + * also be the start byte for a 31-bit code point; we need at least 2 + * bytes, and maybe up through 8 bytes, to determine that. (It can also be + * the start byte for an overlong sequence, but for 30-bit or smaller code + * points, so we don't have to worry about overlongs on EBCDIC.) */ + if (*s != 0xFE) { + return 0; + } + + if (len == 1) { + return -1; + } #else - PERL_UNUSED_ARG(e); + /* On ASCII, FE and FF are the only start bytes that can evaluate to + * needing more than 31 bits. */ + if (LIKELY(*s < 0xFE)) { + return 0; + } + + /* What we have left are FE and FF. Both of these require more than 31 + * bits unless they are for overlongs. */ + if (! consider_overlongs) { + return 1; + } + + /* Here, we have FE or FF. If the input isn't overlong, it evaluates to + * above 31 bits. But we need more than one byte to discern this, so if + * passed just the start byte, it could be an overlong evaluating to + * smaller */ + if (len == 1) { + return -1; + } + + /* Having excluded len==1, and knowing that FE and FF are both valid start + * bytes, we can call the function below to see if the sequence is + * overlong. (We don't need the full generality of the called function, + * but for these huge code points, speed shouldn't be a consideration, and + * the compiler does have enough information, since it's static to this + * file, to optimize to just the needed parts.) */ + is_overlong = is_utf8_overlong_given_start_byte_ok(s, len); + + /* If it isn't overlong, more than 31 bits are required. */ + if (is_overlong == 0) { + return 1; + } + + /* If it is indeterminate if it is overlong, return that */ + if (is_overlong < 0) { + return -1; + } + + /* Here is overlong. Such a sequence starting with FE is below 31 bits, as + * the max it can be is 2**31 - 1 */ + if (*s == 0xFE) { + return 0; + } #endif - PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS; + /* Here, ASCII and EBCDIC rejoin: + * On ASCII: We have an overlong sequence starting with FF + * On EBCDIC: We have a sequence starting with FE. */ - assert(! UTF8_IS_INVARIANT(*s)); + { /* For C89, use a block so the declaration can be close to its use */ -#ifndef EBCDIC +#ifdef EBCDIC - /* Technically, a start byte of FE can be for a code point that fits into - * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong - * malformation. */ - return (*s >= 0xFE); + /* U+7FFFFFFF (2 ** 31 - 1) + * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13 + * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73 + * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72 + * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75 + * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF + * U+80000000 (2 ** 31): + * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41 + * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0 + * + * and since we know that *s = \xfe, any continuation sequcence + * following it that is gt the below is above 31 bits + [0] [1] [2] [3] [4] [5] [6] */ + const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42"; #else - /* On the EBCDIC code pages we handle, only the native start byte 0xFE can - * mean a 32-bit or larger code point (0xFF is an invariant). For 0xFE, we - * need at least 2 bytes, and maybe up through 8 bytes, to be sure that the - * value is above 31 bits. */ - if (*s != 0xFE || len == 1) { - return FALSE; - } + /* FF overlong for U+7FFFFFFF (2 ** 31 - 1) + * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF + * FF overlong for U+80000000 (2 ** 31): + * ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80 + * and since we know that *s = \xff, any continuation sequcence + * following it that is gt the below is above 30 bits + [0] [1] [2] [3] [4] [5] [6] */ + const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81"; - /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are - * \x41 and \x42. */ - return cBOOL(memGT(s + 1, prefix, cmp_len)); #endif + const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1; + const STRLEN cmp_len = MIN(conts_len, len - 1); + + /* Now compare the continuation bytes in s with the ones we have + * compiled in that are for the largest 30 bit code point. If we have + * enough bytes available to determine the answer, or the bytes we do + * have differ from them, we can compare the two to get a definitive + * answer (Note that in UTF-EBCDIC, the two lowest possible + * continuation bytes are \x41 and \x42.) */ + if (cmp_len >= conts_len || memNE(s + 1, + conts_for_highest_30_bit, + cmp_len)) + { + return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len)); + } + /* Here, all the bytes we have are the same as the highest 30-bit code + * point, but we are missing so many bytes that we can't make the + * determination */ + return -1; + } } #endif @@ -579,35 +652,34 @@ S_isFF_OVERLONG(const U8 * const s, const STRLEN len) return -1; } -/* Anything larger than this will overflow the word if it were converted into a UV */ -#if defined(UV_IS_QUAD) +#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */ # ifdef EBCDIC /* Actually is I8 */ # define HIGHEST_REPRESENTABLE_UTF8 \ - "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" + "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" # else # define HIGHEST_REPRESENTABLE_UTF8 \ - "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" -# endif -#else /* 32-bit */ -# ifdef EBCDIC -# define HIGHEST_REPRESENTABLE_UTF8 \ - "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF" -# else -# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF" + "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" # endif #endif PERL_STATIC_INLINE int -S_does_utf8_overflow(const U8 * const s, const U8 * e) +S_does_utf8_overflow(const U8 * const s, + const U8 * e, + const bool consider_overlongs) { /* Returns an int indicating whether or not the UTF-8 sequence from 's' to - * 'e' - 1 would overflow a UV on this platform; that is if it represents a - * code point larger than the highest representable code point. It returns - * 1 if it does overflow; 0 if it doesn't, and -1 if there isn't enough - * information to tell. This last return value can happen if the sequence - * is incomplete, missing some trailing bytes that would form a complete - * character. If there are enough bytes to make a definitive decision, - * this function does so. + * 'e' - 1 would overflow an IV on this platform; that is if it represents + * a code point larger than the highest representable code point. It + * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't + * enough information to tell. This last return value can happen if the + * sequence is incomplete, missing some trailing bytes that would form a + * complete character. If there are enough bytes to make a definitive + * decision, this function does so. + * + * If 'consider_overlongs' is TRUE, the function checks for the possibility + * that the sequence is an overlong that doesn't overflow. Otherwise, it + * assumes the sequence is not an overlong. This can give different + * results only on ASCII 32-bit platforms. * * (For ASCII platforms, we could use memcmp() because we don't have to * convert each byte to I8, but it's very rare input indeed that would @@ -615,18 +687,82 @@ S_does_utf8_overflow(const U8 * const s, const U8 * e) * * 'e' - 1 must not be beyond a full character. */ - const STRLEN len = e - s; - const U8 *x; - const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW; assert(s <= e && s + UTF8SKIP(s) >= e); +#if ! defined(UV_IS_QUAD) + + return is_utf8_cp_above_31_bits(s, e, consider_overlongs); + +#else + + PERL_UNUSED_ARG(consider_overlongs); + + { + const STRLEN len = e - s; + const U8 *x; + const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8; + + for (x = s; x < e; x++, y++) { + + if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) { + continue; + } + + /* If this byte is larger than the corresponding highest UTF-8 + * byte, the sequence overflow; otherwise the byte is less than, + * and so the sequence doesn't overflow */ + return NATIVE_UTF8_TO_I8(*x) > *y; + + } + + /* Got to the end and all bytes are the same. If the input is a whole + * character, it doesn't overflow. And if it is a partial character, + * there's not enough information to tell */ + if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) { + return -1; + } + + return 0; + } + +#endif + +} + +#if 0 + +/* This is the portions of the above function that deal with UV_MAX instead of + * IV_MAX. They are left here in case we want to combine them so that internal + * uses can have larger code points. The only logic difference is that the + * 32-bit EBCDIC platform is treate like the 64-bit, and the 32-bit ASCII has + * different logic. + */ + +/* Anything larger than this will overflow the word if it were converted into a UV */ +#if defined(UV_IS_QUAD) +# ifdef EBCDIC /* Actually is I8 */ +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# else +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF" +# endif +#else /* 32-bit */ +# ifdef EBCDIC +# define HIGHEST_REPRESENTABLE_UTF8 \ + "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF" +# else +# define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF" +# endif +#endif + #if ! defined(UV_IS_QUAD) && ! defined(EBCDIC) /* On 32 bit ASCII machines, many overlongs that start with FF don't * overflow */ - if (isFF_OVERLONG(s, len) > 0) { + if (consider_overlongs && isFF_OVERLONG(s, len) > 0) { /* To be such an overlong, the first bytes of 's' must match * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80". If we @@ -641,30 +777,14 @@ S_does_utf8_overflow(const U8 * const s, const U8 * e) return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83; } +/* Starting with the #else, the rest of the function is identical except + * 1. we need to move the 'len' declaration to be global to the function + * 2. the endif move to just after the UNUSED_ARG. + * An empty endif is given just below to satisfy the preprocessor + */ #endif - for (x = s; x < e; x++, y++) { - - if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) { - continue; - } - - /* If this byte is larger than the corresponding highest UTF-8 byte, - * the sequence overflow; otherwise the byte is less than, and so the - * sequence doesn't overflow */ - return NATIVE_UTF8_TO_I8(*x) > *y; - - } - - /* Got to the end and all bytes are the same. If the input is a whole - * character, it doesn't overflow. And if it is a partial character, - * there's not enough information to tell */ - if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) { - return -1; - } - - return 0; -} +#endif #undef F0_ABOVE_OVERLONG #undef F8_ABOVE_OVERLONG @@ -821,7 +941,10 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) /* And finally, that the code point represented fits in a word on this * platform */ - if (does_utf8_overflow(s, e) > 0) { + if (0 < does_utf8_overflow(s, e, + 0 /* Don't consider overlongs */ + )) + { return 0; } @@ -829,10 +952,10 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags) } char * -Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) +Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format) { /* Returns a mortalized C string that is a displayable copy of the 'len' - * bytes starting at 's'. 'format' gives how to display each byte. + * bytes starting at 'start'. 'format' gives how to display each byte. * Currently, there are only two formats, so it is currently a bool: * 0 \xab * 1 ab (that is a space between two hex digit bytes) @@ -840,7 +963,8 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a trailing NUL */ - const U8 * const e = s + len; + const U8 * s = start; + const U8 * const e = start + len; char * output; char * d; @@ -850,12 +974,14 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format) SAVEFREEPV(output); d = output; - for (; s < e; s++) { + for (s = start; s < e; s++) { const unsigned high_nibble = (*s & 0xF0) >> 4; const unsigned low_nibble = (*s & 0x0F); if (format) { - *d++ = ' '; + if (s > start) { + *d++ = ' '; + } } else { *d++ = '\\'; @@ -1012,10 +1138,6 @@ EBCDIC platforms, and sometimes when the L> is also present. The new names accurately describe the situation in all cases. -It is now deprecated to have very high code points (above C on the -platforms) and this function will raise a deprecation warning for these (unless -such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1) -in a 32-bit word. All other code points corresponding to Unicode characters, including private use and those yet to be assigned, are never considered malformed and never @@ -1122,7 +1244,7 @@ in a position where only a continuation type one should be. =item C The input sequence was malformed in that it is for a code point that is not -representable in the number of bits available in a UV on the current platform. +representable in the number of bits available in an IV on the current platform. =item C @@ -1305,7 +1427,10 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, /* Check for overflow. The algorithm requires us to not look past the end * of the current character, even if partial, so the upper limit is 's' */ - if (UNLIKELY(does_utf8_overflow(s0, s) > 0)) { + if (UNLIKELY(0 < does_utf8_overflow(s0, s, + 1 /* Do consider overlongs */ + ))) + { possible_problems |= UTF8_GOT_OVERFLOW; uv = UNICODE_REPLACEMENT; } @@ -1380,14 +1505,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, |UTF8_WARN_NONCHAR |UTF8_WARN_SURROGATE |UTF8_WARN_SUPER - |UTF8_WARN_PERL_EXTENDED)) - /* In case of a malformation, 'uv' is not valid, and has - * been changed to something in the Unicode range. - * Currently we don't output a deprecation message if there - * is already a malformation, so we don't have to special - * case the test immediately below */ - || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)))) + |UTF8_WARN_PERL_EXTENDED)))) { /* If there were no malformations, or the only malformation is an * overlong, 'uv' is valid */ @@ -1493,11 +1611,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, disallowed = TRUE; } - /* Likewise, warn if any say to, plus if deprecation warnings - * are on, because this code point is above IV_MAX */ - if ( ckWARN_d(WARN_DEPRECATED) - || ! (flags & UTF8_ALLOW_OVERFLOW) - || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED))) + /* Likewise, warn if any say to */ + if ( ! (flags & UTF8_ALLOW_OVERFLOW) + || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED))) { /* The warnings code explicitly says it doesn't handle the @@ -1699,21 +1815,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, *errors |= UTF8_GOT_SUPER; disallowed = TRUE; } - - /* The deprecated warning overrides any non-deprecated one. If - * there are other problems, a deprecation message is not - * really helpful, so don't bother to raise it in that case. - * This also keeps the code from having to handle the case - * where 'uv' is not valid. */ - if ( ! (orig_problems - & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW)) - && UNLIKELY(uv > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - message = Perl_form(aTHX_ cp_above_legal_max, - uv, MAX_NON_DEPRECATED_CP); - pack_warn = packWARN(WARN_DEPRECATED); - } } else if (possible_problems & UTF8_GOT_NONCHAR) { possible_problems &= ~UTF8_GOT_NONCHAR; @@ -1779,9 +1880,15 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, U8 tmpbuf[UTF8_MAXBYTES+1]; const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf, uv, 0); - const char * preface = (uv <= PERL_UNICODE_MAX) - ? "U+" - : "0x"; + /* Don't use U+ for non-Unicode code points, which + * includes those in the Latin1 range */ + const char * preface = ( uv > PERL_UNICODE_MAX +#ifdef EBCDIC + || uv <= 0xFF +#endif + ) + ? "0x" + : "U+"; message = Perl_form(aTHX_ "%s: %s (overlong; instead use %s to represent" " %s%0*" UVXf ")", @@ -1791,7 +1898,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, preface, ((uv < 256) ? 2 : 4), /* Field width of 2 for small code points */ - uv); + UNI_TO_NATIVE(uv)); } } } @@ -1842,9 +1949,6 @@ the next possible position in C that could begin a non-malformed character. See L for details on when the REPLACEMENT CHARACTER is returned. -Code points above the platform's C will raise a deprecation warning, -unless those are turned off. - =cut Also implemented as a macro in utf8.h @@ -1884,9 +1988,6 @@ is set (if C isn't NULL) so that (S + C<*retlen>>) is the next possible position in C that could begin a non-malformed character. See L for details on when the REPLACEMENT CHARACTER is returned. -Code points above the platform's C will raise a deprecation warning, -unless those are turned off. - =cut */ @@ -1897,16 +1998,18 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) assert(send > s); - /* Call the low level routine, asking for checks */ return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen)); } /* =for apidoc utf8_length -Return the length of the UTF-8 char encoded string C in characters. -Stops at C (inclusive). If C s> or if the scan would end -up past C, croaks. +Returns the number of characters in the sequence of UTF-8-encoded bytes starting +at C and ending at the byte just before C. If and point to the +same place, it returns 0 with no warning raised. + +If C s> or if the scan would end up past C, it raises a UTF8 warning +and returns the number of valid characters. =cut */ @@ -2206,8 +2309,8 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f } finish_and_return: - *d = '\0'; - *lenp = d - converted_start; + *d = '\0'; + *lenp = d - converted_start; /* Trim unused space */ Renew(converted_start, *lenp + 1, U8); @@ -2253,16 +2356,30 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) append_utf8_from_native_byte(*s, &d); s++; } + *d = '\0'; *lenp = d-dst; + + /* Trim unused space */ + Renew(dst, *lenp + 1, U8); + return dst; } /* - * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. + * Convert native (big-endian) UTF-16 to UTF-8. For reversed (little-endian), + * use utf16_to_utf8_reversed(). + * + * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes. + * UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes. + * UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes. * - * Destination must be pre-extended to 3/2 source. Do not use in-place. - * We optimize for native, for obvious reasons. */ + * These functions don't check for overflow. The worst case is every code + * point in the input is 2 bytes, and requires 4 bytes on output. (If the code + * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.) Therefore the + * destination must be pre-extended to 2 times the source length. + * + * Do not use in-place. We optimize for native, for obvious reasons. */ U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) @@ -2290,10 +2407,12 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv)); continue; } + #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST #define LAST_HIGH_SURROGATE 0xDBFF #define FIRST_LOW_SURROGATE 0xDC00 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST +#define FIRST_IN_PLANE1 0x10000 /* This assumes that most uses will be in the first Unicode plane, not * needing surrogates */ @@ -2312,13 +2431,13 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) } p += 2; uv = ((uv - FIRST_HIGH_SURROGATE) << 10) - + (low - FIRST_LOW_SURROGATE) + 0x10000; + + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1; } } #ifdef EBCDIC d = uvoffuni_to_utf8_flags(d, uv, 0); #else - if (uv < 0x10000) { + if (uv < FIRST_IN_PLANE1) { *d++ = (U8)(( uv >> 12) | 0xe0); *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80); *d++ = (U8)(( uv & 0x3f) | 0x80); @@ -3024,11 +3143,9 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, } if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { - if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP) - && ckWARN_d(WARN_DEPRECATED)) - { - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP); + if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv1, + MAX_EXTERNALLY_LEGAL_CP); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; @@ -3492,17 +3609,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, if (flags & FOLD_FLAGS_LOCALE) { # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8 - const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1; - # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8 - const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1; - /* Special case these two characters, as what normally gets * returned under locale doesn't work */ - if (UTF8SKIP(p) == cap_sharp_s_len - && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len)) + if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -3512,8 +3624,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, } else #endif - if (UTF8SKIP(p) == long_s_t_len - && memEQ((char *) p, LONG_S_T, long_s_t_len)) + if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -3532,9 +3643,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, * 255/256 boundary which is forbidden under /l, and so the code * wouldn't catch that they are equivalent (which they are only in * this release) */ - else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1 - && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1)) - { + else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " @@ -4981,7 +5090,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) * size based on worst possible case, which is each line in the input * creates 2 elements in the inversion list: 1) the beginning of a * range in the list; 2) the beginning of a range not in the list. */ - while ((loc = (strchr(loc, '\n'))) != NULL) { + while ((loc = (char *) memchr(loc, '\n', lend - (U8 *) loc)) != NULL) { elements += 2; loc++; } @@ -5126,10 +5235,8 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) /* May change: warns if surrogates, non-character code points, or * non-Unicode code points are in 's' which has length 'len' bytes. * Returns TRUE if none found; FALSE otherwise. The only other validity - * check is to make sure that this won't exceed the string's length. - * - * Code points above the platform's C will raise a deprecation - * warning, unless those are turned off. */ + * check is to make sure that this won't exceed the string's length nor + * overflow */ const U8* const e = s + len; bool ok = TRUE; @@ -5145,22 +5252,10 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) { if (UNLIKELY(UTF8_IS_SUPER(s, e))) { if ( ckWARN_d(WARN_NON_UNICODE) - || ( ckWARN_d(WARN_DEPRECATED) -#ifndef UV_IS_QUAD - && UNLIKELY(is_utf8_cp_above_31_bits(s, e)) -#else /* Below is 64-bit words */ - /* 2**63 and up meet these conditions provided we have - * a 64-bit word. */ -# ifdef EBCDIC - && *s == 0xFE - && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8 -# else - && *s == 0xFF - /* s[1] being above 0x80 overflows */ - && s[2] >= 0x88 -# endif -#endif - )) { + || UNLIKELY(0 < does_utf8_overflow(s, s + len, + 0 /* Don't consider overlongs */ + ))) + { /* A side effect of this function will be to warn */ (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER); ok = FALSE;