X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/7723e0077ab2970477de35881407f68179022294..3d5b2488ead2c20cc64174fabd97bb7a32da097f:/utf8.c diff --git a/utf8.c b/utf8.c index e3e17ff..fc4a0c1 100644 --- a/utf8.c +++ b/utf8.c @@ -42,8 +42,6 @@ static const char cp_above_legal_max[] = "Use of code point 0x%" UVXf " is not allowed; the" " permissible max is 0x%" UVXf; -#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX)) - /* =head1 Unicode Support These are various utility functions for manipulating UTF8-encoded @@ -213,7 +211,7 @@ Most code should use C()> rather than call this directly This function is for code that wants any warning and/or error messages to be returned to the caller rather than be displayed. All messages that would have -been displayed if all lexcial warnings are enabled will be returned. +been displayed if all lexical warnings are enabled will be returned. It is just like C> but it takes an extra parameter placed after all the others, C. If this parameter is 0, this function @@ -309,8 +307,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) * performance hit on these high EBCDIC code points. */ if (UNLIKELY(UNICODE_IS_SUPER(uv))) { - if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) { - Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP); + if (UNLIKELY(uv > MAX_LEGAL_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP); } if ( (flags & UNICODE_WARN_SUPER) || ( (flags & UNICODE_WARN_PERL_EXTENDED) @@ -386,8 +384,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) STRLEN len = OFFUNISKIP(uv); U8 *p = d+len-1; while (p > d) { - *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); - uv >>= UTF_ACCUMULATION_SHIFT; + *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK); + uv >>= SHIFT; } *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); return d+len; @@ -1123,7 +1121,7 @@ Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const boo PERL_STATIC_INLINE char * S_unexpected_non_continuation_text(pTHX_ const U8 * const s, - /* How many bytes to print */ + /* Max number of bytes to print */ STRLEN print_len, /* Which one is the non-continuation */ @@ -1139,6 +1137,8 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, ? "immediately" : Perl_form(aTHX_ "%d bytes", (int) non_cont_byte_pos); + const U8 * x = s + non_cont_byte_pos; + const U8 * e = s + print_len; PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT; @@ -1146,10 +1146,20 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, * calculated, it's likely faster to pass it; verify under DEBUGGING */ assert(expect_len == UTF8SKIP(s)); + /* As a defensive coding measure, don't output anything past a NUL. Such + * bytes shouldn't be in the middle of a malformation, and could mark the + * end of the allocated string, and what comes after is undefined */ + for (; x < e; x++) { + if (*x == '\0') { + x++; /* Output this particular NUL */ + break; + } + } + return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," " %s after start byte 0x%02x; need %d bytes, got %d)", malformed_text, - _byte_dump_string(s, print_len, 0), + _byte_dump_string(s, x - s, 0), *(s + non_cont_byte_pos), where, *s, @@ -1262,112 +1272,16 @@ Also implemented as a macro in utf8.h */ UV -Perl_utf8n_to_uvchr(pTHX_ const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags) +Perl_utf8n_to_uvchr(const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags) { PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL); } -/* The tables below come from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, - * which requires this copyright notice */ - -/* Copyright (c) 2008-2009 Bjoern Hoehrmann - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#if 0 -static U8 utf8d_C9[] = { - /* The first part of the table maps bytes to character classes that - * to reduce the size of the transition table and create bitmasks. */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/ - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/ - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/ - 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/ - - /* The second part is a transition table that maps a combination - * of a state of the automaton and a character class to a state. */ - 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12 -}; - -#endif - -#ifndef EBCDIC - -/* This is a version of the above table customized for Perl that doesn't - * exclude surrogates and accepts start bytes up through F7 (representing - * 2**21 - 1). */ -static U8 dfa_tab_for_perl[] = { - /* The first part of the table maps bytes to character classes to reduce - * the size of the transition table and create bitmasks. */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/ - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/ - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/ - 10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,4,4,4,4,4,4,4,8,8,8,8,8,8,8,8, /*-FF*/ - - /* The second part is a transition table that maps a combination - * of a state of the automaton and a character class to a state. */ - 0,12,24,36,96,12,12,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,/*23*/ - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,/*47*/ - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,/*71*/ - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,/*95*/ - 12,36,12,12,12,12,12,36,12,36,12,12 /* 96- 107 */ - - /* The customization was to repurpose the surrogates type '4' to instead be - * for start bytes F1-F7. Types 5 and 6 are now unused, and their entries in - * the transition part of the table are set to 12, so are illegal. - * - * To do higher code points would require expansion and some rearrangement of - * the table. The type '1' entries for continuation bytes 80-8f would have to - * be split into several types, because they aren't treated uniformly for - * higher start bytes, since overlongs for F8 are 80-87; FC: 80-83; and FE: - * 80-81. We start needing to worry about overflow if FE is included. - * Ignoring, FE and FF, we could use type 5 for F9-FB, and 6 for FD (remember - * from the web site that these are used to right shift). FE would - * necessarily be type 7; and FF, type 8. And new states would have to be - * created for F8 and FC (and FE and FF if used), so quite a bit of work would - * be involved. - * - * XXX Better would be to customize the table so that the noncharacters are - * excluded. This again is non trivial, but doing so would simplify the code - * that uses this, and might make it small enough to make it inlinable */ -}; - -#endif - /* =for apidoc utf8n_to_uvchr_error @@ -1449,7 +1363,8 @@ C or the C flags. =item C The input sequence was malformed in that a non-continuation type byte was found -in a position where only a continuation type one should be. +in a position where only a continuation type one should be. See also +L>. =item C @@ -1462,6 +1377,34 @@ The input sequence was malformed in that C is smaller than required for a complete sequence. In other words, the input is for a partial character sequence. + +C and C both indicate a too short +sequence. The difference is that C indicates always +that there is an error, while C means that an incomplete +sequence was looked at. If no other flags are present, it means that the +sequence was valid as far as it went. Depending on the application, this could +mean one of three things: + +=over + +=item * + +The C length parameter passed in was too small, and the function was +prevented from examining all the necessary bytes. + +=item * + +The buffer being looked at is based on reading data, and the data received so +far stopped in the middle of a character, so that the next read will +read the remainder of this character. (It is up to the caller to deal with the +split bytes somehow.) + +=item * + +This is a real error, and the partial sequence is all we're going to get. + +=back + =item C The input sequence was malformed in that it is for a non-Unicode code point; @@ -1487,7 +1430,7 @@ Also implemented as a macro in utf8.h */ UV -Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, +Perl_utf8n_to_uvchr_error(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, @@ -1551,7 +1494,7 @@ The caller, of course, is responsible for freeing any returned AV. */ UV -Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, +Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, @@ -1560,27 +1503,70 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, { const U8 * const s0 = s; const U8 * send = s0 + curlen; - U32 possible_problems = 0; /* A bit is set here for each potential problem - found as we go along */ - UV uv = (UV) -1; - STRLEN expectlen = 0; /* How long should this sequence be? - (initialized to silence compilers' wrong - warning) */ - STRLEN avail_len = 0; /* When input is too short, gives what that is */ - U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL; - this gets set and discarded */ + U32 possible_problems; /* A bit is set here for each potential problem + found as we go along */ + UV uv; + STRLEN expectlen; /* How long should this sequence be? */ + STRLEN avail_len; /* When input is too short, gives what that is */ + U32 discard_errors; /* Used to save branches when 'errors' is NULL; this + gets set and discarded */ /* The below are used only if there is both an overlong malformation and a * too short one. Otherwise the first two are set to 's0' and 'send', and * the third not used at all */ - U8 * adjusted_s0 = (U8 *) s0; + U8 * adjusted_s0; U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this routine; see [perl #130921] */ - UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */ + UV uv_so_far; + dTHX; + + PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; + + /* Here, is one of: a) malformed; b) a problematic code point (surrogate, + * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul + * syllables that the dfa doesn't properly handle. Quickly dispose of the + * final case. */ + +#ifndef EBCDIC + + /* Each of the affected Hanguls starts with \xED */ + + if (is_HANGUL_ED_utf8_safe(s0, send)) { + if (retlen) { + *retlen = 3; + } + if (errors) { + *errors = 0; + } + if (msgs) { + *msgs = NULL; + } - UV state = 0; + return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) + | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) + | (s0[2] & UTF_CONTINUATION_MASK); + } - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; +#endif + + /* In conjunction with the exhaustive tests that can be enabled in + * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely + * what it is intended to do, and that no flaws in it are masked by + * dropping down and executing the code below + assert(! isUTF8_CHAR(s0, send) + || UTF8_IS_SURROGATE(s0, send) + || UTF8_IS_SUPER(s0, send) + || UTF8_IS_NONCHAR(s0,send)); + */ + + s = s0; + uv = *s0; + possible_problems = 0; + expectlen = 0; + avail_len = 0; + discard_errors = 0; + adjusted_s0 = (U8 *) s0; + uv_so_far = 0; if (errors) { *errors = 0; @@ -1633,55 +1619,6 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, *retlen = expectlen; } - /* An invariant is trivially well-formed */ - if (UTF8_IS_INVARIANT(*s0)) { - return *s0; - } - -#ifndef EBCDIC - - /* Measurements show that this dfa is somewhat faster than the regular code - * below, so use it first, dropping down for the non-normal cases. */ - -# define PERL_UTF8_DECODE_REJECT 12 - - while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) { - UV type = dfa_tab_for_perl[*s]; - - if (state != 0) { - uv = (*s & 0x3fu) | (uv << UTF_ACCUMULATION_SHIFT); - state = dfa_tab_for_perl[256 + state + type]; - } - else { - uv = (0xff >> type) & (*s); - state = dfa_tab_for_perl[256 + type]; - } - - if (state == 0) { - - /* If this could be a code point that the flags don't allow (the first - * surrogate is the first such possible one), delve further, but we already - * have calculated 'uv' */ - if ( (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE - |UTF8_WARN_ILLEGAL_INTERCHANGE)) - && uv >= UNICODE_SURROGATE_FIRST) - { - curlen = s + 1 - s0; - goto got_uv; - } - - return uv; - } - - s++; - } - - /* Here, is some sort of failure. Use the full mechanism */ - - uv = *s0; - -#endif - /* A continuation character can't start a valid sequence */ if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { possible_problems |= UTF8_GOT_CONTINUATION; @@ -1801,8 +1738,6 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, } } - got_uv: - /* Here, we have found all the possible problems, except for when the input * is for a problematic code point not allowed by the input parameters. */ @@ -2712,7 +2647,8 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) PERL_ARGS_ASSERT_BYTES_TO_UTF8; PERL_UNUSED_CONTEXT; - Newx(d, (*lenp) * 2 + 1, U8); + /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ + Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8); dst = d; while (s < send) { @@ -2723,9 +2659,6 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) *d = '\0'; *lenp = d-dst; - /* Trim unused space */ - Renew(dst, *lenp + 1, U8); - return dst; } @@ -2858,9 +2791,7 @@ Perl__is_utf8_idstart(pTHX_ const U8 *p) if (*p == '_') return TRUE; - return is_utf8_common(p, NULL, - "This is buggy if this gets used", - PL_utf8_idstart); + return is_utf8_common(p, PL_utf8_idstart); } bool @@ -3017,8 +2948,7 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) return _to_upper_title_latin1((U8) c, p, lenp, 'S'); } - uvchr_to_utf8(p, c); - return CALL_UPPER_CASE(c, p, p, lenp); + return CALL_UPPER_CASE(c, NULL, p, lenp); } UV @@ -3030,8 +2960,7 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) return _to_upper_title_latin1((U8) c, p, lenp, 's'); } - uvchr_to_utf8(p, c); - return CALL_TITLE_CASE(c, p, p, lenp); + return CALL_TITLE_CASE(c, NULL, p, lenp); } STATIC U8 @@ -3070,8 +2999,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ ); } - uvchr_to_utf8(p, c); - return CALL_LOWER_CASE(c, p, p, lenp); + return CALL_LOWER_CASE(c, NULL, p, lenp); } UV @@ -3167,8 +3095,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) /* Here, above 255. If no special needs, just use the macro */ if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) { - uvchr_to_utf8(p, c); - return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL); + return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL); } else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with the special flags. */ @@ -3182,21 +3109,17 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) } PERL_STATIC_INLINE bool -S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, - const char *const swashname, SV* const invlist) +S_is_utf8_common(pTHX_ const U8 *const p, SV* const invlist) { /* returns a boolean giving whether or not the UTF8-encoded character that - * starts at

is in the swash indicated by . - * contains a pointer to where the swash indicated by - * is to be stored; which this routine will do, so that future calls will - * look at <*swash> and only generate a swash if it is not null. - * is NULL or an inversion list that defines the swash. If not null, it - * saves time during initialization of the swash. + * starts at

is in the inversion list indicated by . * * Note that it is assumed that the buffer length of

is enough to * contain all the bytes that comprise the character. Thus, <*p> should * have been checked before this call for mal-formedness enough to assure - * that. */ + * that. This function, does make sure to not look past any NUL, so it is + * safe to use on C, NUL-terminated, strings */ + STRLEN len = my_strnlen((char *) p, UTF8SKIP(p)); PERL_ARGS_ASSERT_IS_UTF8_COMMON; @@ -3205,72 +3128,34 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, * as far as there being enough bytes available in it to accommodate the * character without reading beyond the end, and pass that number on to the * validating routine */ - if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) { - _force_out_malformed_utf8_message(p, p + UTF8SKIP(p), - _UTF8_NO_CONFIDENCE_IN_CURLEN, + if (! isUTF8_CHAR(p, p + len)) { + _force_out_malformed_utf8_message(p, p + len, _UTF8_NO_CONFIDENCE_IN_CURLEN, 1 /* Die */ ); NOT_REACHED; /* NOTREACHED */ } - if (invlist) { - return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL)); - } - - assert(swash); - - if (!*swash) { - U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - *swash = _core_swash_init("utf8", - - /* Only use the name if there is no inversion - * list; otherwise will go out to disk */ - (invlist) ? "" : swashname, - - &PL_sv_undef, 1, 0, invlist, &flags); - } - - return swash_fetch(*swash, p, TRUE) != 0; + return is_utf8_common_with_len(p, p + len, invlist); } PERL_STATIC_INLINE bool S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, - SV **swash, const char *const swashname, SV* const invlist) { /* returns a boolean giving whether or not the UTF8-encoded character that - * starts at

, and extending no further than is in the swash - * indicated by . contains a pointer to where the swash - * indicated by is to be stored; which this routine will do, so - * that future calls will look at <*swash> and only generate a swash if it - * is not null. is NULL or an inversion list that defines the - * swash. If not null, it saves time during initialization of the swash. - */ + * starts at

, and extending no further than is in the inversion + * list . */ + + UV cp = utf8n_to_uvchr(p, e - p, NULL, 0); PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN; - if (! isUTF8_CHAR(p, e)) { + if (cp == 0 && (p >= e || *p != '\0')) { _force_out_malformed_utf8_message(p, e, 0, 1); NOT_REACHED; /* NOTREACHED */ } - if (invlist) { - return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL)); - } - - assert(swash); - - if (!*swash) { - U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - *swash = _core_swash_init("utf8", - - /* Only use the name if there is no inversion - * list; otherwise will go out to disk */ - (invlist) ? "" : swashname, - - &PL_sv_undef, 1, 0, invlist, &flags); - } - - return swash_fetch(*swash, p, TRUE) != 0; + assert(invlist); + return _invlist_contains_cp(invlist, cp); } STATIC void @@ -3299,14 +3184,14 @@ S_warn_on_first_deprecated_use(pTHX_ const char * const name, if (instr(file, "mathoms.c")) { Perl_warner(aTHX_ WARN_DEPRECATED, - "In %s, line %d, starting in Perl v5.30, %s()" + "In %s, line %d, starting in Perl v5.32, %s()" " will be removed. Avoid this message by" " converting to use %s().\n", file, line, name, alternative); } else { Perl_warner(aTHX_ WARN_DEPRECATED, - "In %s, line %d, starting in Perl v5.30, %s() will" + "In %s, line %d, starting in Perl v5.32, %s() will" " require an additional parameter. Avoid this" " message by converting to use %s().\n", file, line, name, alternative); @@ -3343,10 +3228,7 @@ Perl__is_utf8_FOO(pTHX_ U8 classnum, case _CC_GRAPH: case _CC_CASED: - return is_utf8_common(p, - NULL, - "This is buggy if this gets used", - PL_XPosix_ptrs[classnum]); + return is_utf8_common(p, PL_XPosix_ptrs[classnum]); case _CC_SPACE: return is_XPERLSPACE_high(p); @@ -3361,13 +3243,9 @@ Perl__is_utf8_FOO(pTHX_ U8 classnum, case _CC_VERTSPACE: return is_VERTWS_high(p); case _CC_IDFIRST: - return is_utf8_common(p, NULL, - "This is buggy if this gets used", - PL_utf8_perl_idstart); + return is_utf8_common(p, PL_utf8_perl_idstart); case _CC_IDCONT: - return is_utf8_common(p, NULL, - "This is buggy if this gets used", - PL_utf8_perl_idcont); + return is_utf8_common(p, PL_utf8_perl_idcont); } } @@ -3406,9 +3284,7 @@ Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p, { PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN; - return is_utf8_common_with_len(p, e, NULL, - "This is buggy if this gets used", - PL_XPosix_ptrs[classnum]); + return is_utf8_common_with_len(p, e, PL_XPosix_ptrs[classnum]); } bool @@ -3416,9 +3292,7 @@ Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN; - return is_utf8_common_with_len(p, e, NULL, - "This is buggy if this gets used", - PL_utf8_perl_idstart); + return is_utf8_common_with_len(p, e, PL_utf8_perl_idstart); } bool @@ -3428,7 +3302,7 @@ Perl__is_utf8_xidstart(pTHX_ const U8 *p) if (*p == '_') return TRUE; - return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL); + return is_utf8_common(p, PL_utf8_xidstart); } bool @@ -3436,9 +3310,7 @@ Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e) { PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN; - return is_utf8_common_with_len(p, e, NULL, - "This is buggy if this gets used", - PL_utf8_perl_idcont); + return is_utf8_common_with_len(p, e, PL_utf8_perl_idcont); } bool @@ -3446,7 +3318,7 @@ Perl__is_utf8_idcont(pTHX_ const U8 *p) { PERL_ARGS_ASSERT__IS_UTF8_IDCONT; - return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL); + return is_utf8_common(p, PL_utf8_idcont); } bool @@ -3454,7 +3326,7 @@ Perl__is_utf8_xidcont(pTHX_ const U8 *p) { PERL_ARGS_ASSERT__IS_UTF8_XIDCONT; - return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL); + return is_utf8_common(p, PL_utf8_xidcont); } bool @@ -3462,7 +3334,7 @@ Perl__is_utf8_mark(pTHX_ const U8 *p) { PERL_ARGS_ASSERT__IS_UTF8_MARK; - return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL); + return is_utf8_common(p, PL_utf8_mark); } STATIC UV @@ -3552,9 +3424,9 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, } if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { - if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) { + if (UNLIKELY(uv1 > MAX_LEGAL_CP)) { Perl_croak(aTHX_ cp_above_legal_max, uv1, - MAX_EXTERNALLY_LEGAL_CP); + MAX_LEGAL_CP); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; @@ -3585,6 +3457,9 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, unsigned int i; const unsigned int * cp_list; U8 * d; + + /* 'index' is guaranteed to be non-negative, as this is an inversion + * map that covers all possible inputs. See [perl #133365] */ SSize_t index = _invlist_search(invlist, uv1); IV base = invmap[index]; @@ -3623,13 +3498,16 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, /* Here, there was no mapping defined, which means that the code point maps * to itself. Return the inputs */ cases_to_self: - len = UTF8SKIP(p); - if (p != ustrp) { /* Don't copy onto itself */ - Copy(p, ustrp, len, U8); + if (p) { + len = UTF8SKIP(p); + if (p != ustrp) { /* Don't copy onto itself */ + Copy(p, ustrp, len, U8); + } + *lenp = len; + } + else { + *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp; } - - if (lenp) - *lenp = len; return uv1; @@ -3657,6 +3535,8 @@ Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to, * the return can point to them, but single code points aren't, so would * need to be constructed if we didn't employ something like this API */ + /* 'index' is guaranteed to be non-negative, as this is an inversion map + * that covers all possible inputs. See [perl #133365] */ SSize_t index = _invlist_search(PL_utf8_foldclosures, cp); int base = _Perl_IVCF_invmap[index]; @@ -3794,7 +3674,10 @@ S_check_and_deprecate(pTHX_ const U8 *p, if (*e == NULL) { utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN; - *e = p + UTF8SKIP(p); + + /* strnlen() makes this function safe for the common case of + * NUL-terminated strings */ + *e = p + my_strnlen((char *) p, UTF8SKIP(p)); /* For mathoms.c calls, we use the function name we know is stored * there. It could be part of a larger path */ @@ -4065,7 +3948,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, /* Special case these two characters, as what normally gets * returned under locale doesn't work */ - if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S)) + if (memBEGINs((char *) p, e - p, CAP_SHARP_S)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -4075,7 +3958,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, } else #endif - if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T)) + if (memBEGINs((char *) p, e - p, LONG_S_T)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -4094,7 +3977,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, * 255/256 boundary which is forbidden under /l, and so the code * wouldn't catch that they are equivalent (which they are only in * this release) */ - else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) { + else if (memBEGINs((char *) p, e - p, DOTTED_I)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " @@ -4176,7 +4059,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, * works. */ *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; - Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, ustrp, *lenp, U8); return LATIN_SMALL_LETTER_LONG_S; @@ -5188,6 +5071,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) /* Get the 0th element, which is needed to setup the inversion list * */ while (isSPACE(*l)) l++; + after_atou = (char *) lend; if (!grok_atoUV((const char *)l, &element0, &after_atou)) { Perl_croak(aTHX_ "panic: Expecting a valid 0th element for" " inversion list"); @@ -5204,6 +5088,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash) " elements than available", elements); } while (isSPACE(*l)) l++; + after_atou = (char *) lend; if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) { @@ -5586,7 +5471,13 @@ L (Case Mappings). * that effect. However, if the caller knows what * it's doing, it can pass this flag to indicate that, * and the assertion is skipped. - * FOLDEQ_S2_ALREADY_FOLDED Similarly. + * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies + * to s2, and s2 doesn't have to be UTF-8 encoded. + * This introduces an asymmetry to save a few branches + * in a loop. Currently, this is not a problem, as + * never are both inputs pre-folded. Simply call this + * function with the pre-folded one as the second + * string. * FOLDEQ_S2_FOLDS_SANE */ I32 @@ -5609,11 +5500,11 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; - assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) - && (((flags & FOLDEQ_S1_ALREADY_FOLDED) - && !(flags & FOLDEQ_S1_FOLDS_SANE)) - || ((flags & FOLDEQ_S2_ALREADY_FOLDED) - && !(flags & FOLDEQ_S2_FOLDS_SANE))))); + assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) + && (( (flags & FOLDEQ_S1_ALREADY_FOLDED) + && !(flags & FOLDEQ_S1_FOLDS_SANE)) + || ( (flags & FOLDEQ_S2_ALREADY_FOLDED) + && !(flags & FOLDEQ_S2_FOLDS_SANE))))); /* The algorithm is to trial the folds without regard to the flags on * the first line of the above assert(), and then see if the result * violates them. This means that the inputs can't be pre-folded to a @@ -5633,6 +5524,9 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, flags_for_folder |= FOLD_FLAGS_LOCALE; } } + if (flags & FOLDEQ_UTF8_NOMIX_ASCII) { + flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII; + } if (pe1) { e1 = *(U8**)pe1; @@ -5717,9 +5611,23 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, if (n2 == 0) { /* Same for s2 */ if (flags & FOLDEQ_S2_ALREADY_FOLDED) { - f2 = (U8 *) p2; - assert(u2); - n2 = UTF8SKIP(f2); + + /* Point to the already-folded character. But for non-UTF-8 + * variants, convert to UTF-8 for the algorithm below */ + if (UTF8_IS_INVARIANT(*p2)) { + f2 = (U8 *) p2; + n2 = 1; + } + else if (u2) { + f2 = (U8 *) p2; + n2 = UTF8SKIP(f2); + } + else { + foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2); + foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2); + f2 = foldbuf2; + n2 = 2; + } } else { if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) { @@ -5877,7 +5785,26 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) { PERL_ARGS_ASSERT_UTF8_TO_UVCHR; - return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen); + /* This function is unsafe if malformed UTF-8 input is given it, which is + * why the function is deprecated. If the first byte of the input + * indicates that there are more bytes remaining in the sequence that forms + * the character than there are in the input buffer, it can read past the + * end. But we can make it safe if the input string happens to be + * NUL-terminated, as many strings in Perl are, by refusing to read past a + * NUL. A NUL indicates the start of the next character anyway. If the + * input isn't NUL-terminated, the function remains unsafe, as it always + * has been. + * + * An initial NUL has to be handled separately, but all ASCIIs can be + * handled the same way, speeding up this common case */ + + if (UTF8_IS_INVARIANT(*s)) { /* Assumes 's' contains at least 1 byte */ + return (UV) *s; + } + + return utf8_to_uvchr_buf(s, + s + my_strnlen((char *) s, UTF8SKIP(s)), + retlen); } /*