X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/9ff99fb340d32553ec2fa54534557cb168f284d1..b43c708ad6587be0945a81b1f04142f110ecb471:/utf8.c diff --git a/utf8.c b/utf8.c index af44620..07e4df7 100644 --- a/utf8.c +++ b/utf8.c @@ -42,8 +42,6 @@ static const char cp_above_legal_max[] = "Use of code point 0x%" UVXf " is not allowed; the" " permissible max is 0x%" UVXf; -#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX)) - /* =head1 Unicode Support These are various utility functions for manipulating UTF8-encoded @@ -55,6 +53,19 @@ within non-zero characters. =cut */ +/* helper for Perl__force_out_malformed_utf8_message(). Like + * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than + * PL_compiling */ + +static void +S_restore_cop_warnings(pTHX_ void *p) +{ + if (!specialWARN(PL_curcop->cop_warnings)) + PerlMemShared_free(PL_curcop->cop_warnings); + PL_curcop->cop_warnings = (STRLEN*)p; +} + + void Perl__force_out_malformed_utf8_message(pTHX_ const U8 *const p, /* First byte in UTF-8 sequence */ @@ -86,6 +97,10 @@ Perl__force_out_malformed_utf8_message(pTHX_ PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; if (PL_curcop) { + /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather + * than PL_compiling */ + SAVEDESTRUCTOR_X(S_restore_cop_warnings, + (void*)PL_curcop->cop_warnings); PL_curcop->cop_warnings = pWARN_ALL; } @@ -213,7 +228,7 @@ Most code should use C()> rather than call this directly This function is for code that wants any warning and/or error messages to be returned to the caller rather than be displayed. All messages that would have -been displayed if all lexcial warnings are enabled will be returned. +been displayed if all lexical warnings are enabled will be returned. It is just like C> but it takes an extra parameter placed after all the others, C. If this parameter is 0, this function @@ -309,8 +324,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) * performance hit on these high EBCDIC code points. */ if (UNLIKELY(UNICODE_IS_SUPER(uv))) { - if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) { - Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP); + if (UNLIKELY(uv > MAX_LEGAL_CP)) { + Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP); } if ( (flags & UNICODE_WARN_SUPER) || ( (flags & UNICODE_WARN_PERL_EXTENDED) @@ -386,8 +401,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs) STRLEN len = OFFUNISKIP(uv); U8 *p = d+len-1; while (p > d) { - *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK); - uv >>= UTF_ACCUMULATION_SHIFT; + *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK); + uv >>= SHIFT; } *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len)); return d+len; @@ -1123,7 +1138,7 @@ Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const boo PERL_STATIC_INLINE char * S_unexpected_non_continuation_text(pTHX_ const U8 * const s, - /* How many bytes to print */ + /* Max number of bytes to print */ STRLEN print_len, /* Which one is the non-continuation */ @@ -1139,6 +1154,8 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, ? "immediately" : Perl_form(aTHX_ "%d bytes", (int) non_cont_byte_pos); + const U8 * x = s + non_cont_byte_pos; + const U8 * e = s + print_len; PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT; @@ -1146,10 +1163,20 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, * calculated, it's likely faster to pass it; verify under DEBUGGING */ assert(expect_len == UTF8SKIP(s)); + /* As a defensive coding measure, don't output anything past a NUL. Such + * bytes shouldn't be in the middle of a malformation, and could mark the + * end of the allocated string, and what comes after is undefined */ + for (; x < e; x++) { + if (*x == '\0') { + x++; /* Output this particular NUL */ + break; + } + } + return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x," " %s after start byte 0x%02x; need %d bytes, got %d)", malformed_text, - _byte_dump_string(s, print_len, 0), + _byte_dump_string(s, x - s, 0), *(s + non_cont_byte_pos), where, *s, @@ -1262,112 +1289,16 @@ Also implemented as a macro in utf8.h */ UV -Perl_utf8n_to_uvchr(pTHX_ const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags) +Perl_utf8n_to_uvchr(const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags) { PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL); } -/* The tables below come from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, - * which requires this copyright notice */ - -/* Copyright (c) 2008-2009 Bjoern Hoehrmann - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#if 0 -static U8 utf8d_C9[] = { - /* The first part of the table maps bytes to character classes that - * to reduce the size of the transition table and create bitmasks. */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/ - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/ - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/ - 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/ - - /* The second part is a transition table that maps a combination - * of a state of the automaton and a character class to a state. */ - 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12 -}; - -#endif - -#ifndef EBCDIC - -/* This is a version of the above table customized for Perl that doesn't - * exclude surrogates and accepts start bytes up through F7 (representing - * 2**21 - 1). */ -static U8 dfa_tab_for_perl[] = { - /* The first part of the table maps bytes to character classes to reduce - * the size of the transition table and create bitmasks. */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/ - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/ - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/ - 10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,4,4,4,4,4,4,4,8,8,8,8,8,8,8,8, /*-FF*/ - - /* The second part is a transition table that maps a combination - * of a state of the automaton and a character class to a state. */ - 0,12,24,36,96,12,12,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,/*23*/ - 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,/*47*/ - 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,/*71*/ - 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,/*95*/ - 12,36,12,12,12,12,12,36,12,36,12,12 /* 96- 107 */ - - /* The customization was to repurpose the surrogates type '4' to instead be - * for start bytes F1-F7. Types 5 and 6 are now unused, and their entries in - * the transition part of the table are set to 12, so are illegal. - * - * To do higher code points would require expansion and some rearrangement of - * the table. The type '1' entries for continuation bytes 80-8f would have to - * be split into several types, because they aren't treated uniformly for - * higher start bytes, since overlongs for F8 are 80-87; FC: 80-83; and FE: - * 80-81. We start needing to worry about overflow if FE is included. - * Ignoring, FE and FF, we could use type 5 for F9-FB, and 6 for FD (remember - * from the web site that these are used to right shift). FE would - * necessarily be type 7; and FF, type 8. And new states would have to be - * created for F8 and FC (and FE and FF if used), so quite a bit of work would - * be involved. - * - * XXX Better would be to customize the table so that the noncharacters are - * excluded. This again is non trivial, but doing so would simplify the code - * that uses this, and might make it small enough to make it inlinable */ -}; - -#endif - /* =for apidoc utf8n_to_uvchr_error @@ -1449,7 +1380,8 @@ C or the C flags. =item C The input sequence was malformed in that a non-continuation type byte was found -in a position where only a continuation type one should be. +in a position where only a continuation type one should be. See also +L>. =item C @@ -1462,6 +1394,34 @@ The input sequence was malformed in that C is smaller than required for a complete sequence. In other words, the input is for a partial character sequence. + +C and C both indicate a too short +sequence. The difference is that C indicates always +that there is an error, while C means that an incomplete +sequence was looked at. If no other flags are present, it means that the +sequence was valid as far as it went. Depending on the application, this could +mean one of three things: + +=over + +=item * + +The C length parameter passed in was too small, and the function was +prevented from examining all the necessary bytes. + +=item * + +The buffer being looked at is based on reading data, and the data received so +far stopped in the middle of a character, so that the next read will +read the remainder of this character. (It is up to the caller to deal with the +split bytes somehow.) + +=item * + +This is a real error, and the partial sequence is all we're going to get. + +=back + =item C The input sequence was malformed in that it is for a non-Unicode code point; @@ -1487,7 +1447,7 @@ Also implemented as a macro in utf8.h */ UV -Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, +Perl_utf8n_to_uvchr_error(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, @@ -1551,7 +1511,7 @@ The caller, of course, is responsible for freeing any returned AV. */ UV -Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, +Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, @@ -1560,27 +1520,70 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, { const U8 * const s0 = s; const U8 * send = s0 + curlen; - U32 possible_problems = 0; /* A bit is set here for each potential problem - found as we go along */ - UV uv = (UV) -1; - STRLEN expectlen = 0; /* How long should this sequence be? - (initialized to silence compilers' wrong - warning) */ - STRLEN avail_len = 0; /* When input is too short, gives what that is */ - U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL; - this gets set and discarded */ + U32 possible_problems; /* A bit is set here for each potential problem + found as we go along */ + UV uv; + STRLEN expectlen; /* How long should this sequence be? */ + STRLEN avail_len; /* When input is too short, gives what that is */ + U32 discard_errors; /* Used to save branches when 'errors' is NULL; this + gets set and discarded */ /* The below are used only if there is both an overlong malformation and a * too short one. Otherwise the first two are set to 's0' and 'send', and * the third not used at all */ - U8 * adjusted_s0 = (U8 *) s0; + U8 * adjusted_s0; U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this routine; see [perl #130921] */ - UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */ + UV uv_so_far; + dTHX; + + PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; - UV state = 0; + /* Here, is one of: a) malformed; b) a problematic code point (surrogate, + * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul + * syllables that the dfa doesn't properly handle. Quickly dispose of the + * final case. */ - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; +#ifndef EBCDIC + + /* Each of the affected Hanguls starts with \xED */ + + if (is_HANGUL_ED_utf8_safe(s0, send)) { + if (retlen) { + *retlen = 3; + } + if (errors) { + *errors = 0; + } + if (msgs) { + *msgs = NULL; + } + + return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) + | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) + | (s0[2] & UTF_CONTINUATION_MASK); + } + +#endif + + /* In conjunction with the exhaustive tests that can be enabled in + * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely + * what it is intended to do, and that no flaws in it are masked by + * dropping down and executing the code below + assert(! isUTF8_CHAR(s0, send) + || UTF8_IS_SURROGATE(s0, send) + || UTF8_IS_SUPER(s0, send) + || UTF8_IS_NONCHAR(s0,send)); + */ + + s = s0; + uv = *s0; + possible_problems = 0; + expectlen = 0; + avail_len = 0; + discard_errors = 0; + adjusted_s0 = (U8 *) s0; + uv_so_far = 0; if (errors) { *errors = 0; @@ -1633,55 +1636,6 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, *retlen = expectlen; } - /* An invariant is trivially well-formed */ - if (UTF8_IS_INVARIANT(*s0)) { - return *s0; - } - -#ifndef EBCDIC - - /* Measurements show that this dfa is somewhat faster than the regular code - * below, so use it first, dropping down for the non-normal cases. */ - -# define PERL_UTF8_DECODE_REJECT 12 - - while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) { - UV type = dfa_tab_for_perl[*s]; - - if (state != 0) { - uv = (*s & 0x3fu) | (uv << UTF_ACCUMULATION_SHIFT); - state = dfa_tab_for_perl[256 + state + type]; - } - else { - uv = (0xff >> type) & (*s); - state = dfa_tab_for_perl[256 + type]; - } - - if (state == 0) { - - /* If this could be a code point that the flags don't allow (the first - * surrogate is the first such possible one), delve further, but we already - * have calculated 'uv' */ - if ( (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE - |UTF8_WARN_ILLEGAL_INTERCHANGE)) - && uv >= UNICODE_SURROGATE_FIRST) - { - curlen = s + 1 - s0; - goto got_uv; - } - - return uv; - } - - s++; - } - - /* Here, is some sort of failure. Use the full mechanism */ - - uv = *s0; - -#endif - /* A continuation character can't start a valid sequence */ if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { possible_problems |= UTF8_GOT_CONTINUATION; @@ -1801,8 +1755,6 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, } } - got_uv: - /* Here, we have found all the possible problems, except for when the input * is for a problematic code point not allowed by the input parameters. */ @@ -2319,10 +2271,7 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) { PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; - assert(s < send); - - return utf8n_to_uvchr(s, send - s, retlen, - ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + return _utf8_to_uvchr_buf(s, send, retlen); } /* This is marked as deprecated @@ -2385,14 +2334,14 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) * the bitops (especially ~) can create illegal UTF-8. * In other words: in Perl UTF-8 is not just for Unicode. */ - if (e < s) + if (UNLIKELY(e < s)) goto warn_and_return; while (s < e) { s += UTF8SKIP(s); len++; } - if (e != s) { + if (UNLIKELY(e != s)) { len--; warn_and_return: if (PL_op) @@ -2712,7 +2661,8 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) PERL_ARGS_ASSERT_BYTES_TO_UTF8; PERL_UNUSED_CONTEXT; - Newx(d, (*lenp) * 2 + 1, U8); + /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ + Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8); dst = d; while (s < send) { @@ -2723,9 +2673,6 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp) *d = '\0'; *lenp = d-dst; - /* Trim unused space */ - Renew(dst, *lenp + 1, U8); - return dst; } @@ -2845,9 +2792,8 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) { - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf)); + dVAR; + return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c); } /* Internal function so we can deprecate the external one, and call @@ -2856,27 +2802,27 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) bool Perl__is_utf8_idstart(pTHX_ const U8 *p) { + dVAR; + PERL_ARGS_ASSERT__IS_UTF8_IDSTART; if (*p == '_') return TRUE; - return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL); + return is_utf8_common(p, PL_utf8_idstart); } bool Perl__is_uni_perl_idcont(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); + dVAR; + return _invlist_contains_cp(PL_utf8_perl_idcont, c); } bool Perl__is_uni_perl_idstart(pTHX_ UV c) { - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf)); + dVAR; + return _invlist_contains_cp(PL_utf8_perl_idstart, c); } UV @@ -2937,27 +2883,72 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, return converted; } +/* If compiled on an early Unicode version, there may not be auxiliary tables + * */ +#ifndef HAS_UC_AUX_TABLES +# define UC_AUX_TABLE_ptrs NULL +# define UC_AUX_TABLE_lengths NULL +#endif +#ifndef HAS_TC_AUX_TABLES +# define TC_AUX_TABLE_ptrs NULL +# define TC_AUX_TABLE_lengths NULL +#endif +#ifndef HAS_LC_AUX_TABLES +# define LC_AUX_TABLE_ptrs NULL +# define LC_AUX_TABLE_lengths NULL +#endif +#ifndef HAS_CF_AUX_TABLES +# define CF_AUX_TABLE_ptrs NULL +# define CF_AUX_TABLE_lengths NULL +#endif +#ifndef HAS_UC_AUX_TABLES +# define UC_AUX_TABLE_ptrs NULL +# define UC_AUX_TABLE_lengths NULL +#endif + /* Call the function to convert a UTF-8 encoded character to the specified case. * Note that there may be more than one character in the result. - * INP is a pointer to the first byte of the input character - * OUTP will be set to the first byte of the string of changed characters. It + * 's' is a pointer to the first byte of the input character + * 'd' will be set to the first byte of the string of changed characters. It * needs to have space for UTF8_MAXBYTES_CASE+1 bytes - * LENP will be set to the length in bytes of the string of changed characters + * 'lenp' will be set to the length in bytes of the string of changed characters * * The functions return the ordinal of the first character in the string of - * OUTP */ + * 'd' */ #define CALL_UPPER_CASE(uv, s, d, lenp) \ - _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "") + _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper, \ + Uppercase_Mapping_invmap, \ + UC_AUX_TABLE_ptrs, \ + UC_AUX_TABLE_lengths, \ + "uppercase") #define CALL_TITLE_CASE(uv, s, d, lenp) \ - _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "") + _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle, \ + Titlecase_Mapping_invmap, \ + TC_AUX_TABLE_ptrs, \ + TC_AUX_TABLE_lengths, \ + "titlecase") #define CALL_LOWER_CASE(uv, s, d, lenp) \ - _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "") + _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower, \ + Lowercase_Mapping_invmap, \ + LC_AUX_TABLE_ptrs, \ + LC_AUX_TABLE_lengths, \ + "lowercase") + /* This additionally has the input parameter 'specials', which if non-zero will * cause this to use the specials hash for folding (meaning get full case * folding); otherwise, when zero, this implies a simple case fold */ #define CALL_FOLD_CASE(uv, s, d, lenp, specials) \ -_to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL) + (specials) \ + ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold, \ + Case_Folding_invmap, \ + CF_AUX_TABLE_ptrs, \ + CF_AUX_TABLE_lengths, \ + "foldcase") \ + : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold, \ + Simple_Case_Folding_invmap, \ + NULL, NULL, \ + "foldcase") UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) @@ -2970,27 +2961,27 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) * The ordinal of the first character of the changed version is returned * (but note, as explained above, that there may be more.) */ + dVAR; PERL_ARGS_ASSERT_TO_UNI_UPPER; if (c < 256) { return _to_upper_title_latin1((U8) c, p, lenp, 'S'); } - uvchr_to_utf8(p, c); - return CALL_UPPER_CASE(c, p, p, lenp); + return CALL_UPPER_CASE(c, NULL, p, lenp); } UV Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) { + dVAR; PERL_ARGS_ASSERT_TO_UNI_TITLE; if (c < 256) { return _to_upper_title_latin1((U8) c, p, lenp, 's'); } - uvchr_to_utf8(p, c); - return CALL_TITLE_CASE(c, p, p, lenp); + return CALL_TITLE_CASE(c, NULL, p, lenp); } STATIC U8 @@ -3023,19 +3014,18 @@ S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy) UV Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) { + dVAR; PERL_ARGS_ASSERT_TO_UNI_LOWER; if (c < 256) { return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ ); } - uvchr_to_utf8(p, c); - return CALL_LOWER_CASE(c, p, p, lenp); + return CALL_LOWER_CASE(c, NULL, p, lenp); } UV -Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, - const unsigned int flags) +Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags) { /* Corresponds to to_lower_latin1(); bits meanings: * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited @@ -3047,7 +3037,6 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, UV converted; PERL_ARGS_ASSERT__TO_FOLD_LATIN1; - PERL_UNUSED_CONTEXT; assert (! (flags & FOLD_FLAGS_LOCALE)); @@ -3107,13 +3096,14 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited */ + dVAR; PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS; if (flags & FOLD_FLAGS_LOCALE) { - /* Treat a UTF-8 locale as not being in locale at all, except for - * potentially warning */ + /* Treat a non-Turkic UTF-8 locale as not being in locale at all, + * except for potentially warning */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if (IN_UTF8_CTYPE_LOCALE) { + if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) { flags &= ~FOLD_FLAGS_LOCALE; } else { @@ -3128,8 +3118,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) /* Here, above 255. If no special needs, just use the macro */ if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) { - uvchr_to_utf8(p, c); - return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL); + return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL); } else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with the special flags. */ @@ -3143,21 +3132,17 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags) } PERL_STATIC_INLINE bool -S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, - const char *const swashname, SV* const invlist) +S_is_utf8_common(pTHX_ const U8 *const p, SV* const invlist) { /* returns a boolean giving whether or not the UTF8-encoded character that - * starts at

is in the swash indicated by . - * contains a pointer to where the swash indicated by - * is to be stored; which this routine will do, so that future calls will - * look at <*swash> and only generate a swash if it is not null. - * is NULL or an inversion list that defines the swash. If not null, it - * saves time during initialization of the swash. + * starts at

is in the inversion list indicated by . * * Note that it is assumed that the buffer length of

is enough to * contain all the bytes that comprise the character. Thus, <*p> should * have been checked before this call for mal-formedness enough to assure - * that. */ + * that. This function, does make sure to not look past any NUL, so it is + * safe to use on C, NUL-terminated, strings */ + STRLEN len = my_strnlen((char *) p, UTF8SKIP(p)); PERL_ARGS_ASSERT_IS_UTF8_COMMON; @@ -3166,60 +3151,34 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, * as far as there being enough bytes available in it to accommodate the * character without reading beyond the end, and pass that number on to the * validating routine */ - if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) { - _force_out_malformed_utf8_message(p, p + UTF8SKIP(p), - _UTF8_NO_CONFIDENCE_IN_CURLEN, + if (! isUTF8_CHAR(p, p + len)) { + _force_out_malformed_utf8_message(p, p + len, _UTF8_NO_CONFIDENCE_IN_CURLEN, 1 /* Die */ ); NOT_REACHED; /* NOTREACHED */ } - if (!*swash) { - U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - *swash = _core_swash_init("utf8", - - /* Only use the name if there is no inversion - * list; otherwise will go out to disk */ - (invlist) ? "" : swashname, - - &PL_sv_undef, 1, 0, invlist, &flags); - } - - return swash_fetch(*swash, p, TRUE) != 0; + return is_utf8_common_with_len(p, p + len, invlist); } PERL_STATIC_INLINE bool S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, - SV **swash, const char *const swashname, SV* const invlist) { /* returns a boolean giving whether or not the UTF8-encoded character that - * starts at

, and extending no further than is in the swash - * indicated by . contains a pointer to where the swash - * indicated by is to be stored; which this routine will do, so - * that future calls will look at <*swash> and only generate a swash if it - * is not null. is NULL or an inversion list that defines the - * swash. If not null, it saves time during initialization of the swash. - */ + * starts at

, and extending no further than is in the inversion + * list . */ + + UV cp = utf8n_to_uvchr(p, e - p, NULL, 0); PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN; - if (! isUTF8_CHAR(p, e)) { + if (cp == 0 && (p >= e || *p != '\0')) { _force_out_malformed_utf8_message(p, e, 0, 1); NOT_REACHED; /* NOTREACHED */ } - if (!*swash) { - U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - *swash = _core_swash_init("utf8", - - /* Only use the name if there is no inversion - * list; otherwise will go out to disk */ - (invlist) ? "" : swashname, - - &PL_sv_undef, 1, 0, invlist, &flags); - } - - return swash_fetch(*swash, p, TRUE) != 0; + assert(invlist); + return _invlist_contains_cp(invlist, cp); } STATIC void @@ -3248,14 +3207,14 @@ S_warn_on_first_deprecated_use(pTHX_ const char * const name, if (instr(file, "mathoms.c")) { Perl_warner(aTHX_ WARN_DEPRECATED, - "In %s, line %d, starting in Perl v5.30, %s()" + "In %s, line %d, starting in Perl v5.32, %s()" " will be removed. Avoid this message by" " converting to use %s().\n", file, line, name, alternative); } else { Perl_warner(aTHX_ WARN_DEPRECATED, - "In %s, line %d, starting in Perl v5.30, %s() will" + "In %s, line %d, starting in Perl v5.32, %s() will" " require an additional parameter. Avoid this" " message by converting to use %s().\n", file, line, name, alternative); @@ -3274,6 +3233,7 @@ Perl__is_utf8_FOO(pTHX_ U8 classnum, const char * const file, const unsigned line) { + dVAR; PERL_ARGS_ASSERT__IS_UTF8_FOO; warn_on_first_deprecated_use(name, alternative, use_locale, file, line); @@ -3292,10 +3252,7 @@ Perl__is_utf8_FOO(pTHX_ U8 classnum, case _CC_GRAPH: case _CC_CASED: - return is_utf8_common(p, - &PL_utf8_swash_ptrs[classnum], - swash_property_names[classnum], - PL_XPosix_ptrs[classnum]); + return is_utf8_common(p, PL_XPosix_ptrs[classnum]); case _CC_SPACE: return is_XPERLSPACE_high(p); @@ -3310,19 +3267,9 @@ Perl__is_utf8_FOO(pTHX_ U8 classnum, case _CC_VERTSPACE: return is_VERTWS_high(p); case _CC_IDFIRST: - if (! PL_utf8_perl_idstart) { - PL_utf8_perl_idstart - = _new_invlist_C_array(_Perl_IDStart_invlist); - } - return is_utf8_common(p, &PL_utf8_perl_idstart, - "_Perl_IDStart", NULL); + return is_utf8_common(p, PL_utf8_perl_idstart); case _CC_IDCONT: - if (! PL_utf8_perl_idcont) { - PL_utf8_perl_idcont - = _new_invlist_C_array(_Perl_IDCont_invlist); - } - return is_utf8_common(p, &PL_utf8_perl_idcont, - "_Perl_IDCont", NULL); + return is_utf8_common(p, PL_utf8_perl_idcont); } } @@ -3359,86 +3306,88 @@ bool Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p, const U8 * const e) { + dVAR; PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN; - assert(classnum < _FIRST_NON_SWASH_CC); - - return is_utf8_common_with_len(p, - e, - &PL_utf8_swash_ptrs[classnum], - swash_property_names[classnum], - PL_XPosix_ptrs[classnum]); + return is_utf8_common_with_len(p, e, PL_XPosix_ptrs[classnum]); } bool Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e) { - SV* invlist = NULL; - + dVAR; PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN; - if (! PL_utf8_perl_idstart) { - invlist = _new_invlist_C_array(_Perl_IDStart_invlist); - } - return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart, - "_Perl_IDStart", invlist); + return is_utf8_common_with_len(p, e, PL_utf8_perl_idstart); } bool Perl__is_utf8_xidstart(pTHX_ const U8 *p) { + dVAR; PERL_ARGS_ASSERT__IS_UTF8_XIDSTART; if (*p == '_') return TRUE; - return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL); + return is_utf8_common(p, PL_utf8_xidstart); } bool Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e) { - SV* invlist = NULL; - + dVAR; PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN; - if (! PL_utf8_perl_idcont) { - invlist = _new_invlist_C_array(_Perl_IDCont_invlist); - } - return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont, - "_Perl_IDCont", invlist); + return is_utf8_common_with_len(p, e, PL_utf8_perl_idcont); } bool Perl__is_utf8_idcont(pTHX_ const U8 *p) { + dVAR; PERL_ARGS_ASSERT__IS_UTF8_IDCONT; - return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL); + return is_utf8_common(p, PL_utf8_idcont); } bool Perl__is_utf8_xidcont(pTHX_ const U8 *p) { + dVAR; PERL_ARGS_ASSERT__IS_UTF8_XIDCONT; - return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL); + return is_utf8_common(p, PL_utf8_xidcont); } bool Perl__is_utf8_mark(pTHX_ const U8 *p) { + dVAR; PERL_ARGS_ASSERT__IS_UTF8_MARK; - return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL); + return is_utf8_common(p, PL_utf8_mark); } - /* change namve uv1 to 'from' */ STATIC UV -S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, - SV **swashp, const char *normal, const char *special) +S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, + U8* ustrp, STRLEN *lenp, + SV *invlist, const int * const invmap, + const unsigned int * const * const aux_tables, + const U8 * const aux_table_lengths, + const char * const normal) { STRLEN len = 0; + /* Change the case of code point 'uv1' whose UTF-8 representation (assumed + * by this routine to be valid) begins at 'p'. 'normal' is a string to use + * to name the new case in any generated messages, as a fallback if the + * operation being used is not available. The new case is given by the + * data structures in the remaining arguments. + * + * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the + * entire changed case string, and the return value is the first code point + * in that string */ + PERL_ARGS_ASSERT__TO_UTF8_CASE; /* For code points that don't change case, we already know that the output @@ -3503,13 +3452,12 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, * some others */ if (uv1 < 0xFB00) { goto cases_to_self; - } if (UNLIKELY(UNICODE_IS_SUPER(uv1))) { - if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) { + if (UNLIKELY(uv1 > MAX_LEGAL_CP)) { Perl_croak(aTHX_ cp_above_legal_max, uv1, - MAX_EXTERNALLY_LEGAL_CP); + MAX_LEGAL_CP); } if (ckWARN_d(WARN_NON_UNICODE)) { const char* desc = (PL_op) ? OP_DESC(PL_op) : normal; @@ -3533,79 +3481,133 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp, } /* Note that non-characters are perfectly legal, so no warning should - * be given. There are so few of them, that it isn't worth the extra - * tests to avoid swash creation */ + * be given. */ } - if (!*swashp) /* load on-demand */ - *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, - 4, 0, NULL, NULL); + { + unsigned int i; + const unsigned int * cp_list; + U8 * d; - if (special) { - /* It might be "special" (sometimes, but not always, - * a multicharacter mapping) */ - HV *hv = NULL; - SV **svp; + /* 'index' is guaranteed to be non-negative, as this is an inversion + * map that covers all possible inputs. See [perl #133365] */ + SSize_t index = _invlist_search(invlist, uv1); + IV base = invmap[index]; - /* If passed in the specials name, use that; otherwise use any - * given in the swash */ - if (*special != '\0') { - hv = get_hv(special, 0); - } - else { - svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0); - if (svp) { - hv = MUTABLE_HV(SvRV(*svp)); - } - } + /* The data structures are set up so that if 'base' is non-negative, + * the case change is 1-to-1; and if 0, the change is to itself */ + if (base >= 0) { + IV lc; - if (hv - && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE)) - && (*svp)) - { - const char *s; - - s = SvPV_const(*svp, len); - if (len == 1) - /* EIGHTBIT */ - len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp; - else { - Copy(s, ustrp, len, U8); - } - } - } + if (base == 0) { + goto cases_to_self; + } - if (!len && *swashp) { - const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */); + /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */ + lc = base + uv1 - invlist_array(invlist)[index]; + *lenp = uvchr_to_utf8(ustrp, lc) - ustrp; + return lc; + } - if (uv2) { - /* It was "normal" (a single character mapping). */ - len = uvchr_to_utf8(ustrp, uv2) - ustrp; - } - } + /* Here 'base' is negative. That means the mapping is 1-to-many, and + * requires an auxiliary table look up. abs(base) gives the index into + * a list of such tables which points to the proper aux table. And a + * parallel list gives the length of each corresponding aux table. */ + cp_list = aux_tables[-base]; - if (len) { - if (lenp) { - *lenp = len; + /* Create the string of UTF-8 from the mapped-to code points */ + d = ustrp; + for (i = 0; i < aux_table_lengths[-base]; i++) { + d = uvchr_to_utf8(d, cp_list[i]); } - return valid_utf8_to_uvchr(ustrp, 0); + *d = '\0'; + *lenp = d - ustrp; + + return cp_list[0]; } /* Here, there was no mapping defined, which means that the code point maps * to itself. Return the inputs */ cases_to_self: - len = UTF8SKIP(p); - if (p != ustrp) { /* Don't copy onto itself */ - Copy(p, ustrp, len, U8); + if (p) { + len = UTF8SKIP(p); + if (p != ustrp) { /* Don't copy onto itself */ + Copy(p, ustrp, len, U8); + } + *lenp = len; + } + else { + *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp; } - - if (lenp) - *lenp = len; return uv1; } +Size_t +Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to, + const unsigned int ** remaining_folds_to) +{ + /* Returns the count of the number of code points that fold to the input + * 'cp' (besides itself). + * + * If the return is 0, there is nothing else that folds to it, and + * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL. + * + * If the return is 1, '*first_folds_to' is set to the single code point, + * and '*remaining_folds_to' is set to NULL. + * + * Otherwise, '*first_folds_to' is set to a code point, and + * '*remaining_fold_to' is set to an array that contains the others. The + * length of this array is the returned count minus 1. + * + * The reason for this convolution is to avoid having to deal with + * allocating and freeing memory. The lists are already constructed, so + * the return can point to them, but single code points aren't, so would + * need to be constructed if we didn't employ something like this API */ + + dVAR; + /* 'index' is guaranteed to be non-negative, as this is an inversion map + * that covers all possible inputs. See [perl #133365] */ + SSize_t index = _invlist_search(PL_utf8_foldclosures, cp); + int base = _Perl_IVCF_invmap[index]; + + PERL_ARGS_ASSERT__INVERSE_FOLDS; + + if (base == 0) { /* No fold */ + *first_folds_to = 0; + *remaining_folds_to = NULL; + return 0; + } + +#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */ + + assert(base > 0); + +#else + + if (UNLIKELY(base < 0)) { /* Folds to more than one character */ + + /* The data structure is set up so that the absolute value of 'base' is + * an index into a table of pointers to arrays, with the array + * corresponding to the index being the list of code points that fold + * to 'cp', and the parallel array containing the length of the list + * array */ + *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0]; + *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes + *first_folds_to + */ + return IVCF_AUX_TABLE_lengths[-base]; + } + +#endif + + /* Only the single code point. This works like 'fc(G) = G - A + a' */ + *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index]; + *remaining_folds_to = NULL; + return 1; +} + STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) @@ -3630,7 +3632,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, assert(UTF8_IS_ABOVE_LATIN1(*p)); /* We know immediately if the first character in the string crosses the - * boundary, so can skip */ + * boundary, so can skip testing */ if (result > 255) { /* Look at every character in the result; if any cross the @@ -3704,7 +3706,10 @@ S_check_and_deprecate(pTHX_ const U8 *p, if (*e == NULL) { utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN; - *e = p + UTF8SKIP(p); + + /* strnlen() makes this function safe for the common case of + * NUL-terminated strings */ + *e = p + my_strnlen((char *) p, UTF8SKIP(p)); /* For mathoms.c calls, we use the function name we know is stored * there. It could be part of a larger path */ @@ -3743,6 +3748,120 @@ S_check_and_deprecate(pTHX_ const U8 *p, return utf8n_flags; } +STATIC UV +S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e, + U8 * ustrp, STRLEN *lenp) +{ + /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from + * p0..e-1 according to Turkic rules is the same as for non-Turkic. + * Otherwise, it returns the first code point of the Turkic foldcased + * sequence, and the entire sequence will be stored in *ustrp. ustrp will + * contain *lenp bytes + * + * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER + * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER + * DOTLESS I */ + + PERL_ARGS_ASSERT_TURKIC_FC; + assert(e > p); + + if (UNLIKELY(*p == 'I')) { + *lenp = 2; + ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I); + ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I); + return LATIN_SMALL_LETTER_DOTLESS_I; + } + + if (UNLIKELY(memBEGINs(p, e - p, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8))) + { + *lenp = 1; + *ustrp = 'i'; + return 'i'; + } + + return 0; +} + +STATIC UV +S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e, + U8 * ustrp, STRLEN *lenp) +{ + /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from + * p0..e-1 according to Turkic rules is the same as for non-Turkic. + * Otherwise, it returns the first code point of the Turkic lowercased + * sequence, and the entire sequence will be stored in *ustrp. ustrp will + * contain *lenp bytes */ + + dVAR; + PERL_ARGS_ASSERT_TURKIC_LC; + assert(e > p0); + + /* A 'I' requires context as to what to do */ + if (UNLIKELY(*p0 == 'I')) { + const U8 * p = p0 + 1; + + /* According to the Unicode SpecialCasing.txt file, a capital 'I' + * modified by a dot above lowercases to 'i' even in turkic locales. */ + while (p < e) { + UV cp; + + if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) { + ustrp[0] = 'i'; + *lenp = 1; + return 'i'; + } + + /* For the dot above to modify the 'I', it must be part of a + * combining sequence immediately following the 'I', and no other + * modifier with a ccc of 230 may intervene */ + cp = utf8_to_uvchr_buf(p, e, NULL); + if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) { + break; + } + + /* Here the combining sequence continues */ + p += UTF8SKIP(p); + } + } + + /* In all other cases the lc is the same as the fold */ + return turkic_fc(p0, e, ustrp, lenp); +} + +STATIC UV +S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e, + U8 * ustrp, STRLEN *lenp) +{ + /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence + * from p0..e-1 according to Turkic rules is the same as for non-Turkic. + * Otherwise, it returns the first code point of the Turkic upper or + * title-cased sequence, and the entire sequence will be stored in *ustrp. + * ustrp will contain *lenp bytes + * + * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER + * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER + * DOTLESS I */ + + PERL_ARGS_ASSERT_TURKIC_UC; + assert(e > p); + + if (*p == 'i') { + *lenp = 2; + ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; + } + + if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) { + *lenp = 1; + *ustrp = 'I'; + return 'I'; + } + + return 0; +} + /* The process for changing the case is essentially the same for the four case * change types, except there are complications for folding. Otherwise the * difference is only which case to change to. To make sure that they all do @@ -3769,15 +3888,24 @@ S_check_and_deprecate(pTHX_ const U8 *p, * the input code point calculated from the UTF-8. The fold code needs to * realize all this and take it from there. * + * To deal with Turkic locales, the function specified by the parameter + * 'turkic' is called when appropriate. + * * If you read the two macros as sequential, it's easier to understand what's * going on. */ #define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \ - L1_func_extra_param) \ + L1_func_extra_param, turkic) \ \ if (flags & (locale_flags)) { \ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ - /* Treat a UTF-8 locale as not being in locale at all */ \ if (IN_UTF8_CTYPE_LOCALE) { \ + if (UNLIKELY(PL_in_utf8_turkic_locale)) { \ + UV ret = turkic(p, e, ustrp, lenp); \ + if (ret) return ret; \ + } \ + \ + /* Otherwise, treat a UTF-8 locale as not being in locale at \ + * all */ \ flags &= ~(locale_flags); \ } \ } \ @@ -3791,13 +3919,12 @@ S_check_and_deprecate(pTHX_ const U8 *p, } \ } \ else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \ + U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \ if (flags & (locale_flags)) { \ - result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \ - *(p+1))); \ + result = LC_L1_change_macro(c); \ } \ else { \ - return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \ - ustrp, lenp, L1_func_extra_param); \ + return L1_func(c, ustrp, lenp, L1_func_extra_param); \ } \ } \ else { /* malformed UTF-8 or ord above 255 */ \ @@ -3850,6 +3977,7 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, const char * const file, const int line) { + dVAR; UV result; const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER, cBOOL(flags), file, line); @@ -3858,7 +3986,8 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */ /* 2nd char of uc(U+DF) is 'S' */ - CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S'); + CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S', + turkic_uc); CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE); } @@ -3884,6 +4013,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, const char * const file, const int line) { + dVAR; UV result; const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE, cBOOL(flags), file, line); @@ -3891,7 +4021,8 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS; /* 2nd char of ucfirst(U+DF) is 's' */ - CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's'); + CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's', + turkic_uc); CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE); } @@ -3916,13 +4047,15 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, const char * const file, const int line) { + dVAR; UV result; const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER, cBOOL(flags), file, line); PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS; - CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */) + CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */, + turkic_lc); CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE) } @@ -3952,6 +4085,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, const char * const file, const int line) { + dVAR; UV result; const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD, cBOOL(flags), file, line); @@ -3964,7 +4098,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, assert(p != ustrp); /* Otherwise overwrites */ CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1, - ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII))); + ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)), + turkic_fc); result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL); @@ -3976,7 +4111,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, /* Special case these two characters, as what normally gets * returned under locale doesn't work */ - if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S)) + if (memBEGINs((char *) p, e - p, CAP_SHARP_S)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -3986,7 +4121,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, } else #endif - if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T)) + if (memBEGINs((char *) p, e - p, LONG_S_T)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), @@ -4005,7 +4140,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, * 255/256 boundary which is forbidden under /l, and so the code * wouldn't catch that they are equivalent (which they are only in * this release) */ - else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) { + else if (memBEGINs((char *) p, e - p, DOTTED_I)) { /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; " @@ -4087,7 +4222,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, * works. */ *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2; - Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8, ustrp, *lenp, U8); return LATIN_SMALL_LETTER_LONG_S; @@ -4122,85 +4257,43 @@ SV* Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none) { - PERL_ARGS_ASSERT_SWASH_INIT; - /* Returns a copy of a swash initiated by the called function. This is the * public interface, and returning a copy prevents others from doing - * mischief on the original */ - - return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, - NULL, NULL)); -} - -SV* -Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, - I32 minbits, I32 none, SV* invlist, - U8* const flags_p) -{ + * mischief on the original. The only remaining use of this is in tr/// */ /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST * use the following define */ -#define CORE_SWASH_INIT_RETURN(x) \ +#define SWASH_INIT_RETURN(x) \ PL_curpm= old_PL_curpm; \ - return x + return newSVsv(x) /* Initialize and return a swash, creating it if necessary. It does this - * by calling utf8_heavy.pl in the general case. The returned value may be - * the swash's inversion list instead if the input parameters allow it. - * Which is returned should be immaterial to callers, as the only - * operations permitted on a swash, swash_fetch(), _get_swash_invlist(), - * and swash_to_invlist() handle both these transparently. - * - * This interface should only be used by functions that won't destroy or - * adversely change the swash, as doing so affects all other uses of the - * swash in the program; the general public should use 'Perl_swash_init' - * instead. + * by calling utf8_heavy.pl in the general case. * * pkg is the name of the package that should be in. - * name is the name of the swash to find. Typically it is a Unicode - * property name, including user-defined ones + * name is the name of the swash to find. * listsv is a string to initialize the swash with. It must be of the form * documented as the subroutine return value in * L * minbits is the number of bits required to represent each data element. - * It is '1' for binary properties. * none I (khw) do not understand this one, but it is used only in tr///. - * invlist is an inversion list to initialize the swash with (or NULL) - * flags_p if non-NULL is the address of various input and output flag bits - * to the routine, as follows: ('I' means is input to the routine; - * 'O' means output from the routine. Only flags marked O are - * meaningful on return.) - * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash - * came from a user-defined property. (I O) - * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking - * when the swash cannot be located, to simply return NULL. (I) - * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a - * return of an inversion list instead of a swash hash if this routine - * thinks that would result in faster execution of swash_fetch() later - * on. (I) * - * Thus there are three possible inputs to find the swash: , - * , and . At least one must be specified. The result + * Thus there are two possible inputs to find the swash: and + * . At least one must be specified. The result * will be the union of the specified ones, although 's various * actions can intersect, etc. what gives. To avoid going out to * disk at all, should specify completely what the swash should * have, and should be &PL_sv_undef and should be "". - * - * is only valid for binary properties */ + */ PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */ SV* retval = &PL_sv_undef; - HV* swash_hv = NULL; - const int invlist_swash_boundary = - (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST) - ? 512 /* Based on some benchmarking, but not extensive, see commit - message */ - : -1; /* Never return just an inversion list */ - assert(listsv != &PL_sv_undef || strNE(name, "") || invlist); - assert(! invlist || minbits == 1); + PERL_ARGS_ASSERT_SWASH_INIT; + + assert(listsv != &PL_sv_undef || strNE(name, "")); PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex that triggered the swash init and the swash init @@ -4216,7 +4309,6 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, SV* errsv_save; GV *method; - PERL_ARGS_ASSERT__CORE_SWASH_INIT; PUSHSTACKi(PERLSI_MAGIC); ENTER; @@ -4243,8 +4335,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, SAVEBOOL(TAINT_get); TAINT_NOT; #endif - Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), - NULL); + require_pv("utf8_heavy.pl"); { /* Not ERRSV, as there is no need to vivify a scalar we are about to discard. */ @@ -4289,118 +4380,10 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, if (IN_PERL_COMPILETIME) { CopHINTS_set(PL_curcop, PL_hints); } - if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { - if (SvPOK(retval)) { - - /* If caller wants to handle missing properties, let them */ - if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) { - CORE_SWASH_INIT_RETURN(NULL); - } - Perl_croak(aTHX_ - "Can't find Unicode property definition \"%" SVf "\"", - SVfARG(retval)); - NOT_REACHED; /* NOTREACHED */ - } - } } /* End of calling the module to find the swash */ - /* If this operation fetched a swash, and we will need it later, get it */ - if (retval != &PL_sv_undef - && (minbits == 1 || (flags_p - && ! (*flags_p - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)))) - { - swash_hv = MUTABLE_HV(SvRV(retval)); - - /* If we don't already know that there is a user-defined component to - * this swash, and the user has indicated they wish to know if there is - * one (by passing ), find out */ - if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) { - SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE); - if (user_defined && SvUV(*user_defined)) { - *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; - } - } - } - - /* Make sure there is an inversion list for binary properties */ - if (minbits == 1) { - SV** swash_invlistsvp = NULL; - SV* swash_invlist = NULL; - bool invlist_in_swash_is_valid = FALSE; - bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has - an unclaimed reference count */ - - /* If this operation fetched a swash, get its already existing - * inversion list, or create one for it */ - - if (swash_hv) { - swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE); - if (swash_invlistsvp) { - swash_invlist = *swash_invlistsvp; - invlist_in_swash_is_valid = TRUE; - } - else { - swash_invlist = _swash_to_invlist(retval); - swash_invlist_unclaimed = TRUE; - } - } - - /* If an inversion list was passed in, have to include it */ - if (invlist) { - - /* Any fetched swash will by now have an inversion list in it; - * otherwise will be NULL, indicating that we - * didn't fetch a swash */ - if (swash_invlist) { - - /* Add the passed-in inversion list, which invalidates the one - * already stored in the swash */ - invlist_in_swash_is_valid = FALSE; - SvREADONLY_off(swash_invlist); /* Turned on again below */ - _invlist_union(invlist, swash_invlist, &swash_invlist); - } - else { - - /* Here, there is no swash already. Set up a minimal one, if - * we are going to return a swash */ - if ((int) _invlist_len(invlist) > invlist_swash_boundary) { - swash_hv = newHV(); - retval = newRV_noinc(MUTABLE_SV(swash_hv)); - } - swash_invlist = invlist; - } - } - - /* Here, we have computed the union of all the passed-in data. It may - * be that there was an inversion list in the swash which didn't get - * touched; otherwise save the computed one */ - if (! invlist_in_swash_is_valid - && (int) _invlist_len(swash_invlist) > invlist_swash_boundary) - { - if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist)) - { - Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); - } - /* We just stole a reference count. */ - if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE; - else SvREFCNT_inc_simple_void_NN(swash_invlist); - } - - /* The result is immutable. Forbid attempts to change it. */ - SvREADONLY_on(swash_invlist); - - /* Use the inversion list stand-alone if small enough */ - if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) { - SvREFCNT_dec(retval); - if (!swash_invlist_unclaimed) - SvREFCNT_inc_simple_void_NN(swash_invlist); - retval = newRV_noinc(swash_invlist); - } - } - - CORE_SWASH_INIT_RETURN(retval); -#undef CORE_SWASH_INIT_RETURN + SWASH_INIT_RETURN(retval); +#undef SWASH_INIT_RETURN } @@ -4723,41 +4706,32 @@ STATIC SV* S_swatch_get(pTHX_ SV* swash, UV start, UV span) { SV *swatch; - U8 *l, *lend, *x, *xend, *s, *send; + U8 *l, *lend, *x, *xend, *s; STRLEN lcur, xcur, scur; HV *const hv = MUTABLE_HV(SvRV(swash)); - SV** const invlistsvp = hv_fetchs(hv, "V", FALSE); SV** listsvp = NULL; /* The string containing the main body of the table */ SV** extssvp = NULL; - SV** invert_it_svp = NULL; U8* typestr = NULL; - STRLEN bits; + STRLEN bits = 0; STRLEN octets; /* if bits == 1, then octets == 0 */ UV none; UV end = start + span; - if (invlistsvp == NULL) { SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); extssvp = hv_fetchs(hv, "EXTRAS", FALSE); listsvp = hv_fetchs(hv, "LIST", FALSE); - invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE); bits = SvUV(*bitssvp); none = SvUV(*nonesvp); typestr = (U8*)SvPV_nolen(*typesvp); - } - else { - bits = 1; - none = 0; - } octets = bits >> 3; /* if bits == 1, then octets == 0 */ PERL_ARGS_ASSERT_SWATCH_GET; - if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { + if (bits != 8 && bits != 16 && bits != 32) { Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf, (UV)bits); } @@ -4797,16 +4771,11 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) SvCUR_set(swatch, scur); s = (U8*)SvPVX(swatch); - if (invlistsvp) { /* If has an inversion list set up use that */ - _invlist_populate_swatch(*invlistsvp, start, end, s); - return swatch; - } - /* read $swash->{LIST} */ l = (U8*)SvPV(*listsvp, lcur); lend = l + lcur; while (l < lend) { - UV min, max, val, upper; + UV min = 0, max = 0, val = 0, upper; l = swash_scan_list_line(l, lend, &min, &max, &val, cBOOL(octets), typestr); if (l > lend) { @@ -4855,43 +4824,9 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) ++val; } } - else { /* bits == 1, then val should be ignored */ - UV key; - if (min < start) - min = start; - - for (key = min; key <= upper; key++) { - const STRLEN offset = (STRLEN)(key - start); - s[offset >> 3] |= 1 << (offset & 7); - } - } } /* while */ - /* Invert if the data says it should be. Assumes that bits == 1 */ - if (invert_it_svp && SvUV(*invert_it_svp)) { - - /* Unicode properties should come with all bits above PERL_UNICODE_MAX - * be 0, and their inversion should also be 0, as we don't succeed any - * Unicode property matches for non-Unicode code points */ - if (start <= PERL_UNICODE_MAX) { - - /* The code below assumes that we never cross the - * Unicode/above-Unicode boundary in a range, as otherwise we would - * have to figure out where to stop flipping the bits. Since this - * boundary is divisible by a large power of 2, and swatches comes - * in small powers of 2, this should be a valid assumption */ - assert(start + span - 1 <= PERL_UNICODE_MAX); - - send = s + scur; - while (s < send) { - *s = ~(*s); - s++; - } - } - } - - /* read $swash->{EXTRAS} - * This code also copied to swash_to_invlist() below */ + /* read $swash->{EXTRAS} */ x = (U8*)SvPV(*extssvp, xcur); xend = x + xcur; while (x < xend) { @@ -4947,34 +4882,7 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) Perl_croak(aTHX_ "panic: swatch_get got improper swatch"); s = (U8*)SvPV(swatch, slen); - if (bits == 1 && otherbits == 1) { - if (slen != olen) - Perl_croak(aTHX_ "panic: swatch_get found swatch length " - "mismatch, slen=%" UVuf ", olen=%" UVuf, - (UV)slen, (UV)olen); - - switch (opc) { - case '+': - while (slen--) - *s++ |= *o++; - break; - case '!': - while (slen--) - *s++ |= ~*o++; - break; - case '-': - while (slen--) - *s++ &= ~*o++; - break; - case '&': - while (slen--) - *s++ &= *o++; - break; - default: - break; - } - } - else { + { STRLEN otheroctets = otherbits >> 3; STRLEN offset = 0; U8* const send = s + slen; @@ -5020,577 +4928,13 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span) *s++ = (U8)((otherval >> 8) & 0xff); *s++ = (U8)( otherval & 0xff); } - } + } } sv_free(other); /* through with it! */ } /* while */ return swatch; } -HV* -Perl__swash_inversion_hash(pTHX_ SV* const swash) -{ - - /* Subject to change or removal. For use only in regcomp.c and regexec.c - * Can't be used on a property that is subject to user override, as it - * relies on the value of SPECIALS in the swash which would be set by - * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set - * for overridden properties - * - * Returns a hash which is the inversion and closure of a swash mapping. - * For example, consider the input lines: - * 004B 006B - * 004C 006C - * 212A 006B - * - * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for - * 006C. The value for each key is an array. For 006C, the array would - * have two elements, the UTF-8 for itself, and for 004C. For 006B, there - * would be three elements in its array, the UTF-8 for 006B, 004B and 212A. - * - * Note that there are no elements in the hash for 004B, 004C, 212A. The - * keys are only code points that are folded-to, so it isn't a full closure. - * - * Essentially, for any code point, it gives all the code points that map to - * it, or the list of 'froms' for that point. - * - * Currently it ignores any additions or deletions from other swashes, - * looking at just the main body of the swash, and if there are SPECIALS - * in the swash, at that hash - * - * The specials hash can be extra code points, and most likely consists of - * maps from single code points to multiple ones (each expressed as a string - * of UTF-8 characters). This function currently returns only 1-1 mappings. - * However consider this possible input in the specials hash: - * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074 - * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074 - * - * Both FB05 and FB06 map to the same multi-char sequence, which we don't - * currently handle. But it also means that FB05 and FB06 are equivalent in - * a 1-1 mapping which we should handle, and this relationship may not be in - * the main table. Therefore this function examines all the multi-char - * sequences and adds the 1-1 mappings that come out of that. - * - * XXX This function was originally intended to be multipurpose, but its - * only use is quite likely to remain for constructing the inversion of - * the CaseFolding (//i) property. If it were more general purpose for - * regex patterns, it would have to do the FB05/FB06 game for simple folds, - * because certain folds are prohibited under /iaa and /il. As an example, - * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both - * equivalent under /i. But under /iaa and /il, the folds to 'i' are - * prohibited, so we would not figure out that they fold to each other. - * Code could be written to automatically figure this out, similar to the - * code that does this for multi-character folds, but this is the only case - * where something like this is ever likely to happen, as all the single - * char folds to the 0-255 range are now quite settled. Instead there is a - * little special code that is compiled only for this Unicode version. This - * is smaller and didn't require much coding time to do. But this makes - * this routine strongly tied to being used just for CaseFolding. If ever - * it should be generalized, this would have to be fixed */ - - U8 *l, *lend; - STRLEN lcur; - HV *const hv = MUTABLE_HV(SvRV(swash)); - - /* The string containing the main body of the table. This will have its - * assertion fail if the swash has been converted to its inversion list */ - SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); - - SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); - SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); - SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); - /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/ - const U8* const typestr = (U8*)SvPV_nolen(*typesvp); - const STRLEN bits = SvUV(*bitssvp); - const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */ - const UV none = SvUV(*nonesvp); - SV **specials_p = hv_fetchs(hv, "SPECIALS", 0); - - HV* ret = newHV(); - - PERL_ARGS_ASSERT__SWASH_INVERSION_HASH; - - /* Must have at least 8 bits to get the mappings */ - if (bits != 8 && bits != 16 && bits != 32) { - Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %" - UVuf, (UV)bits); - } - - if (specials_p) { /* It might be "special" (sometimes, but not always, a - mapping to more than one character */ - - /* Construct an inverse mapping hash for the specials */ - HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p)); - HV * specials_inverse = newHV(); - char *char_from; /* the lhs of the map */ - I32 from_len; /* its byte length */ - char *char_to; /* the rhs of the map */ - I32 to_len; /* its byte length */ - SV *sv_to; /* and in a sv */ - AV* from_list; /* list of things that map to each 'to' */ - - hv_iterinit(specials_hv); - - /* The keys are the characters (in UTF-8) that map to the corresponding - * UTF-8 string value. Iterate through the list creating the inverse - * list. */ - while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) { - SV** listp; - if (! SvPOK(sv_to)) { - Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() " - "unexpectedly is not a string, flags=%lu", - (unsigned long)SvFLAGS(sv_to)); - } - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/ - - /* Each key in the inverse list is a mapped-to value, and the key's - * hash value is a list of the strings (each in UTF-8) that map to - * it. Those strings are all one character long */ - if ((listp = hv_fetch(specials_inverse, - SvPVX(sv_to), - SvCUR(sv_to), 0))) - { - from_list = (AV*) *listp; - } - else { /* No entry yet for it: create one */ - from_list = newAV(); - if (! hv_store(specials_inverse, - SvPVX(sv_to), - SvCUR(sv_to), - (SV*) from_list, 0)) - { - Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); - } - } - - /* Here have the list associated with this 'to' (perhaps newly - * created and empty). Just add to it. Note that we ASSUME that - * the input is guaranteed to not have duplications, so we don't - * check for that. Duplications just slow down execution time. */ - av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE)); - } - - /* Here, 'specials_inverse' contains the inverse mapping. Go through - * it looking for cases like the FB05/FB06 examples above. There would - * be an entry in the hash like - * 'st' => [ FB05, FB06 ] - * In this example we will create two lists that get stored in the - * returned hash, 'ret': - * FB05 => [ FB05, FB06 ] - * FB06 => [ FB05, FB06 ] - * - * Note that there is nothing to do if the array only has one element. - * (In the normal 1-1 case handled below, we don't have to worry about - * two lists, as everything gets tied to the single list that is - * generated for the single character 'to'. But here, we are omitting - * that list, ('st' in the example), so must have multiple lists.) */ - while ((from_list = (AV *) hv_iternextsv(specials_inverse, - &char_to, &to_len))) - { - if (av_tindex_skip_len_mg(from_list) > 0) { - SSize_t i; - - /* We iterate over all combinations of i,j to place each code - * point on each list */ - for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) { - SSize_t j; - AV* i_list = newAV(); - SV** entryp = av_fetch(from_list, i, FALSE); - if (entryp == NULL) { - Perl_croak(aTHX_ "panic: av_fetch() unexpectedly" - " failed"); - } - if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) { - Perl_croak(aTHX_ "panic: unexpected entry for %s", - SvPVX(*entryp)); - } - if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp), - (SV*) i_list, FALSE)) - { - Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); - } - - /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/ - for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) { - entryp = av_fetch(from_list, j, FALSE); - if (entryp == NULL) { - Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); - } - - /* When i==j this adds itself to the list */ - av_push(i_list, newSVuv(utf8_to_uvchr_buf( - (U8*) SvPVX(*entryp), - (U8*) SvPVX(*entryp) + SvCUR(*entryp), - 0))); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/ - } - } - } - } - SvREFCNT_dec(specials_inverse); /* done with it */ - } /* End of specials */ - - /* read $swash->{LIST} */ - -#if UNICODE_MAJOR_VERSION == 3 \ - && UNICODE_DOT_VERSION == 0 \ - && UNICODE_DOT_DOT_VERSION == 1 - - /* For this version only U+130 and U+131 are equivalent under qr//i. Add a - * rule so that things work under /iaa and /il */ - - SV * mod_listsv = sv_mortalcopy(*listsvp); - sv_catpv(mod_listsv, "130\t130\t131\n"); - l = (U8*)SvPV(mod_listsv, lcur); - -#else - - l = (U8*)SvPV(*listsvp, lcur); - -#endif - - lend = l + lcur; - - /* Go through each input line */ - while (l < lend) { - UV min, max, val; - UV inverse; - l = swash_scan_list_line(l, lend, &min, &max, &val, - cBOOL(octets), typestr); - if (l > lend) { - break; - } - - /* Each element in the range is to be inverted */ - for (inverse = min; inverse <= max; inverse++) { - AV* list; - SV** listp; - IV i; - bool found_key = FALSE; - bool found_inverse = FALSE; - - /* The key is the inverse mapping */ - char key[UTF8_MAXBYTES+1]; - char* key_end = (char *) uvchr_to_utf8((U8*) key, val); - STRLEN key_len = key_end - key; - - /* Get the list for the map */ - if ((listp = hv_fetch(ret, key, key_len, FALSE))) { - list = (AV*) *listp; - } - else { /* No entry yet for it: create one */ - list = newAV(); - if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) { - Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); - } - } - - /* Look through list to see if this inverse mapping already is - * listed, or if there is a mapping to itself already */ - for (i = 0; i <= av_tindex_skip_len_mg(list); i++) { - SV** entryp = av_fetch(list, i, FALSE); - SV* entry; - UV uv; - if (entryp == NULL) { - Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed"); - } - entry = *entryp; - uv = SvUV(entry); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));*/ - if (uv == val) { - found_key = TRUE; - } - if (uv == inverse) { - found_inverse = TRUE; - } - - /* No need to continue searching if found everything we are - * looking for */ - if (found_key && found_inverse) { - break; - } - } - - /* Make sure there is a mapping to itself on the list */ - if (! found_key) { - av_push(list, newSVuv(val)); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));*/ - } - - - /* Simply add the value to the list */ - if (! found_inverse) { - av_push(list, newSVuv(inverse)); - /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));*/ - } - - /* swatch_get() increments the value of val for each element in the - * range. That makes more compact tables possible. You can - * express the capitalization, for example, of all consecutive - * letters with a single line: 0061\t007A\t0041 This maps 0061 to - * 0041, 0062 to 0042, etc. I (khw) have never understood 'none', - * and it's not documented; it appears to be used only in - * implementing tr//; I copied the semantics from swatch_get(), just - * in case */ - if (!none || val < none) { - ++val; - } - } - } - - return ret; -} - -SV* -Perl__swash_to_invlist(pTHX_ SV* const swash) -{ - - /* Subject to change or removal. For use only in one place in regcomp.c. - * Ownership is given to one reference count in the returned SV* */ - - U8 *l, *lend; - char *loc; - STRLEN lcur; - HV *const hv = MUTABLE_HV(SvRV(swash)); - UV elements = 0; /* Number of elements in the inversion list */ - U8 empty[] = ""; - SV** listsvp; - SV** typesvp; - SV** bitssvp; - SV** extssvp; - SV** invert_it_svp; - - U8* typestr; - STRLEN bits; - STRLEN octets; /* if bits == 1, then octets == 0 */ - U8 *x, *xend; - STRLEN xcur; - - SV* invlist; - - PERL_ARGS_ASSERT__SWASH_TO_INVLIST; - - /* If not a hash, it must be the swash's inversion list instead */ - if (SvTYPE(hv) != SVt_PVHV) { - return SvREFCNT_inc_simple_NN((SV*) hv); - } - - /* The string containing the main body of the table */ - listsvp = hv_fetchs(hv, "LIST", FALSE); - typesvp = hv_fetchs(hv, "TYPE", FALSE); - bitssvp = hv_fetchs(hv, "BITS", FALSE); - extssvp = hv_fetchs(hv, "EXTRAS", FALSE); - invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE); - - typestr = (U8*)SvPV_nolen(*typesvp); - bits = SvUV(*bitssvp); - octets = bits >> 3; /* if bits == 1, then octets == 0 */ - - /* read $swash->{LIST} */ - if (SvPOK(*listsvp)) { - l = (U8*)SvPV(*listsvp, lcur); - } - else { - /* LIST legitimately doesn't contain a string during compilation phases - * of Perl itself, before the Unicode tables are generated. In this - * case, just fake things up by creating an empty list */ - l = empty; - lcur = 0; - } - loc = (char *) l; - lend = l + lcur; - - if (*l == 'V') { /* Inversion list format */ - const char *after_atou = (char *) lend; - UV element0; - UV* other_elements_ptr; - - /* The first number is a count of the rest */ - l++; - if (!grok_atoUV((const char *)l, &elements, &after_atou)) { - Perl_croak(aTHX_ "panic: Expecting a valid count of elements" - " at start of inversion list"); - } - if (elements == 0) { - invlist = _new_invlist(0); - } - else { - l = (U8 *) after_atou; - - /* Get the 0th element, which is needed to setup the inversion list - * */ - while (isSPACE(*l)) l++; - if (!grok_atoUV((const char *)l, &element0, &after_atou)) { - Perl_croak(aTHX_ "panic: Expecting a valid 0th element for" - " inversion list"); - } - l = (U8 *) after_atou; - invlist = _setup_canned_invlist(elements, element0, - &other_elements_ptr); - elements--; - - /* Then just populate the rest of the input */ - while (elements-- > 0) { - if (l > lend) { - Perl_croak(aTHX_ "panic: Expecting %" UVuf " more" - " elements than available", elements); - } - while (isSPACE(*l)) l++; - if (!grok_atoUV((const char *)l, other_elements_ptr++, - &after_atou)) - { - Perl_croak(aTHX_ "panic: Expecting a valid element" - " in inversion list"); - } - l = (U8 *) after_atou; - } - } - } - else { - - /* Scan the input to count the number of lines to preallocate array - * size based on worst possible case, which is each line in the input - * creates 2 elements in the inversion list: 1) the beginning of a - * range in the list; 2) the beginning of a range not in the list. */ - while ((loc = (char *) memchr(loc, '\n', lend - (U8 *) loc)) != NULL) { - elements += 2; - loc++; - } - - /* If the ending is somehow corrupt and isn't a new line, add another - * element for the final range that isn't in the inversion list */ - if (! (*lend == '\n' - || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n')))) - { - elements++; - } - - invlist = _new_invlist(elements); - - /* Now go through the input again, adding each range to the list */ - while (l < lend) { - UV start, end; - UV val; /* Not used by this function */ - - l = swash_scan_list_line(l, lend, &start, &end, &val, - cBOOL(octets), typestr); - - if (l > lend) { - break; - } - - invlist = _add_range_to_invlist(invlist, start, end); - } - } - - /* Invert if the data says it should be */ - if (invert_it_svp && SvUV(*invert_it_svp)) { - _invlist_invert(invlist); - } - - /* This code is copied from swatch_get() - * read $swash->{EXTRAS} */ - x = (U8*)SvPV(*extssvp, xcur); - xend = x + xcur; - while (x < xend) { - STRLEN namelen; - U8 *namestr; - SV** othersvp; - HV* otherhv; - STRLEN otherbits; - SV **otherbitssvp, *other; - U8 *nl; - - const U8 opc = *x++; - if (opc == '\n') - continue; - - nl = (U8*)memchr(x, '\n', xend - x); - - if (opc != '-' && opc != '+' && opc != '!' && opc != '&') { - if (nl) { - x = nl + 1; /* 1 is length of "\n" */ - continue; - } - else { - x = xend; /* to EXTRAS' end at which \n is not found */ - break; - } - } - - namestr = x; - if (nl) { - namelen = nl - namestr; - x = nl + 1; - } - else { - namelen = xend - namestr; - x = xend; - } - - othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); - otherhv = MUTABLE_HV(SvRV(*othersvp)); - otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); - otherbits = (STRLEN)SvUV(*otherbitssvp); - - if (bits != otherbits || bits != 1) { - Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean " - "properties, bits=%" UVuf ", otherbits=%" UVuf, - (UV)bits, (UV)otherbits); - } - - /* The "other" swatch must be destroyed after. */ - other = _swash_to_invlist((SV *)*othersvp); - - /* End of code copied from swatch_get() */ - switch (opc) { - case '+': - _invlist_union(invlist, other, &invlist); - break; - case '!': - _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist); - break; - case '-': - _invlist_subtract(invlist, other, &invlist); - break; - case '&': - _invlist_intersection(invlist, other, &invlist); - break; - default: - break; - } - sv_free(other); /* through with it! */ - } - - SvREADONLY_on(invlist); - return invlist; -} - -SV* -Perl__get_swash_invlist(pTHX_ SV* const swash) -{ - SV** ptr; - - PERL_ARGS_ASSERT__GET_SWASH_INVLIST; - - if (! SvROK(swash)) { - return NULL; - } - - /* If it really isn't a hash, it isn't really swash; must be an inversion - * list */ - if (SvTYPE(SvRV(swash)) != SVt_PVHV) { - return SvRV(swash); - } - - ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE); - if (! ptr) { - return NULL; - } - - return *ptr; -} - bool Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len) { @@ -5818,7 +5162,13 @@ L (Case Mappings). * that effect. However, if the caller knows what * it's doing, it can pass this flag to indicate that, * and the assertion is skipped. - * FOLDEQ_S2_ALREADY_FOLDED Similarly. + * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies + * to s2, and s2 doesn't have to be UTF-8 encoded. + * This introduces an asymmetry to save a few branches + * in a loop. Currently, this is not a problem, as + * never are both inputs pre-folded. Simply call this + * function with the pre-folded one as the second + * string. * FOLDEQ_S2_FOLDS_SANE */ I32 @@ -5841,11 +5191,11 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS; - assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) - && (((flags & FOLDEQ_S1_ALREADY_FOLDED) - && !(flags & FOLDEQ_S1_FOLDS_SANE)) - || ((flags & FOLDEQ_S2_ALREADY_FOLDED) - && !(flags & FOLDEQ_S2_FOLDS_SANE))))); + assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE)) + && (( (flags & FOLDEQ_S1_ALREADY_FOLDED) + && !(flags & FOLDEQ_S1_FOLDS_SANE)) + || ( (flags & FOLDEQ_S2_ALREADY_FOLDED) + && !(flags & FOLDEQ_S2_FOLDS_SANE))))); /* The algorithm is to trial the folds without regard to the flags on * the first line of the above assert(), and then see if the result * violates them. This means that the inputs can't be pre-folded to a @@ -5859,12 +5209,20 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, if (flags & FOLDEQ_LOCALE) { if (IN_UTF8_CTYPE_LOCALE) { - flags &= ~FOLDEQ_LOCALE; + if (UNLIKELY(PL_in_utf8_turkic_locale)) { + flags_for_folder |= FOLD_FLAGS_LOCALE; + } + else { + flags &= ~FOLDEQ_LOCALE; + } } else { flags_for_folder |= FOLD_FLAGS_LOCALE; } } + if (flags & FOLDEQ_UTF8_NOMIX_ASCII) { + flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII; + } if (pe1) { e1 = *(U8**)pe1; @@ -5949,9 +5307,23 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, if (n2 == 0) { /* Same for s2 */ if (flags & FOLDEQ_S2_ALREADY_FOLDED) { - f2 = (U8 *) p2; - assert(u2); - n2 = UTF8SKIP(f2); + + /* Point to the already-folded character. But for non-UTF-8 + * variants, convert to UTF-8 for the algorithm below */ + if (UTF8_IS_INVARIANT(*p2)) { + f2 = (U8 *) p2; + n2 = 1; + } + else if (u2) { + f2 = (U8 *) p2; + n2 = UTF8SKIP(f2); + } + else { + foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2); + foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2); + f2 = foldbuf2; + n2 = 2; + } } else { if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) { @@ -6083,5 +5455,57 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) } /* +=for apidoc utf8_to_uvchr + +Returns the native code point of the first character in the string C +which is assumed to be in UTF-8 encoding; C will be set to the +length, in bytes, of that character. + +Some, but not all, UTF-8 malformations are detected, and in fact, some +malformed input could cause reading beyond the end of the input buffer, which +is why this function is deprecated. Use L instead. + +If C points to one of the detected malformations, and UTF8 warnings are +enabled, zero is returned and C<*retlen> is set (if C isn't +C) to -1. If those warnings are off, the computed value if well-defined (or +the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> +is set (if C isn't NULL) so that (S + C<*retlen>>) is the +next possible position in C that could begin a non-malformed character. +See L for details on when the REPLACEMENT CHARACTER is returned. + +=cut +*/ + +UV +Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_UTF8_TO_UVCHR; + + /* This function is unsafe if malformed UTF-8 input is given it, which is + * why the function is deprecated. If the first byte of the input + * indicates that there are more bytes remaining in the sequence that forms + * the character than there are in the input buffer, it can read past the + * end. But we can make it safe if the input string happens to be + * NUL-terminated, as many strings in Perl are, by refusing to read past a + * NUL. A NUL indicates the start of the next character anyway. If the + * input isn't NUL-terminated, the function remains unsafe, as it always + * has been. + * + * An initial NUL has to be handled separately, but all ASCIIs can be + * handled the same way, speeding up this common case */ + + if (UTF8_IS_INVARIANT(*s)) { /* Assumes 's' contains at least 1 byte */ + if (retlen) { + *retlen = 1; + } + return (UV) *s; + } + + return utf8_to_uvchr_buf(s, + s + my_strnlen((char *) s, UTF8SKIP(s)), + retlen); +} + +/* * ex: set ts=8 sts=4 sw=4 et: */