X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/33a4312b882ab5f5396974a8fe8f5235559c2d82..973ae08839a3c974ff88014594ecf63cdb20d5e8:/inline.h diff --git a/inline.h b/inline.h index 4cc6a74..7d81da6 100644 --- a/inline.h +++ b/inline.h @@ -5,8 +5,33 @@ * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. * + * This file contains tables and code adapted from + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this + * copyright notice: + +Copyright (c) 2008-2009 Bjoern Hoehrmann + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + * * This file is a home for static inline functions that cannot go in other - * headers files, because they depend on proto.h (included after most other + * header files, because they depend on proto.h (included after most other * headers) or struct definitions. * * Each section names the header file that the functions "belong" to. @@ -131,7 +156,7 @@ PERL_STATIC_INLINE I32 S_TOPMARK(pTHX) { DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, - "MARK top %p %"IVdf"\n", + "MARK top %p %" IVdf "\n", PL_markstack_ptr, (IV)*PL_markstack_ptr))); return *PL_markstack_ptr; @@ -141,7 +166,7 @@ PERL_STATIC_INLINE I32 S_POPMARK(pTHX) { DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, - "MARK pop %p %"IVdf"\n", + "MARK pop %p %" IVdf "\n", (PL_markstack_ptr-1), (IV)*(PL_markstack_ptr-1)))); assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow"); @@ -153,8 +178,10 @@ S_POPMARK(pTHX) PERL_STATIC_INLINE struct regexp * S_ReANY(const REGEXP * const re) { + XPV* const p = (XPV*)SvANY(re); assert(isREGEXP(re)); - return re->sv_u.svu_rx; + return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx + : (struct regexp *)p; } /* ------------------------------- sv.h ------------------------------- */ @@ -190,116 +217,1650 @@ S_SvREFCNT_dec(pTHX_ SV *sv) } } -PERL_STATIC_INLINE void -S_SvREFCNT_dec_NN(pTHX_ SV *sv) -{ - U32 rc = SvREFCNT(sv); - if (LIKELY(rc > 1)) - SvREFCNT(sv) = rc - 1; - else - Perl_sv_free2(aTHX_ sv, rc); -} +PERL_STATIC_INLINE void +S_SvREFCNT_dec_NN(pTHX_ SV *sv) +{ + U32 rc = SvREFCNT(sv); + if (LIKELY(rc > 1)) + SvREFCNT(sv) = rc - 1; + else + Perl_sv_free2(aTHX_ sv, rc); +} + +PERL_STATIC_INLINE void +SvAMAGIC_on(SV *sv) +{ + assert(SvROK(sv)); + if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv))); +} +PERL_STATIC_INLINE void +SvAMAGIC_off(SV *sv) +{ + if (SvROK(sv) && SvOBJECT(SvRV(sv))) + HvAMAGIC_off(SvSTASH(SvRV(sv))); +} + +PERL_STATIC_INLINE U32 +S_SvPADSTALE_on(SV *sv) +{ + assert(!(SvFLAGS(sv) & SVs_PADTMP)); + return SvFLAGS(sv) |= SVs_PADSTALE; +} +PERL_STATIC_INLINE U32 +S_SvPADSTALE_off(SV *sv) +{ + assert(!(SvFLAGS(sv) & SVs_PADTMP)); + return SvFLAGS(sv) &= ~SVs_PADSTALE; +} +#if defined(PERL_CORE) || defined (PERL_EXT) +PERL_STATIC_INLINE STRLEN +S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp) +{ + PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B; + if (SvGAMAGIC(sv)) { + U8 *hopped = utf8_hop((U8 *)pv, pos); + if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped); + return (STRLEN)(hopped - (U8 *)pv); + } + return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN); +} +#endif + +/* ------------------------------- handy.h ------------------------------- */ + +/* saves machine code for a common noreturn idiom typically used in Newx*() */ +GCC_DIAG_IGNORE_DECL(-Wunused-function); +static void +S_croak_memory_wrap(void) +{ + Perl_croak_nocontext("%s",PL_memory_wrap); +} +GCC_DIAG_RESTORE_DECL; + +/* ------------------------------- utf8.h ------------------------------- */ + +/* +=head1 Unicode Support +*/ + +PERL_STATIC_INLINE void +S_append_utf8_from_native_byte(const U8 byte, U8** dest) +{ + /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 + * encoded string at '*dest', updating '*dest' to include it */ + + PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; + + if (NATIVE_BYTE_IS_INVARIANT(byte)) + *((*dest)++) = byte; + else { + *((*dest)++) = UTF8_EIGHT_BIT_HI(byte); + *((*dest)++) = UTF8_EIGHT_BIT_LO(byte); + } +} + +/* +=for apidoc valid_utf8_to_uvchr +Like C>, but should only be called when it is known that +the next character in the input UTF-8 string C is well-formed (I, +it passes C>. Surrogates, non-character code points, and +non-Unicode code points are allowed. + +=cut + + */ + +PERL_STATIC_INLINE UV +Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) +{ + const UV expectlen = UTF8SKIP(s); + const U8* send = s + expectlen; + UV uv = *s; + + PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; + + if (retlen) { + *retlen = expectlen; + } + + /* An invariant is trivially returned */ + if (expectlen == 1) { + return uv; + } + + /* Remove the leading bits that indicate the number of bytes, leaving just + * the bits that are part of the value */ + uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); + + /* Now, loop through the remaining bytes, accumulating each into the + * working total as we go. (I khw tried unrolling the loop for up to 4 + * bytes, but there was no performance improvement) */ + for (++s; s < send; s++) { + uv = UTF8_ACCUMULATE(uv, *s); + } + + return UNI_TO_NATIVE(uv); + +} + +/* +=for apidoc is_utf8_invariant_string + +Returns TRUE if the first C bytes of the string C are the same +regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on +EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they +are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only +the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range +characters are invariant, but so also are the C1 controls. + +If C is 0, it will be calculated using C, (which means if you +use this option, that C can't have embedded C characters and has to +have a terminating C byte). + +See also +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +and +C>. + +=cut + +*/ + +#define is_utf8_invariant_string(s, len) \ + is_utf8_invariant_string_loc(s, len, NULL) + +/* +=for apidoc is_utf8_invariant_string_loc + +Like C> but upon failure, stores the location of +the first UTF-8 variant character in the C pointer; if all characters are +UTF-8 invariant, this function does not change the contents of C<*ep>. + +=cut + +*/ + +PERL_STATIC_INLINE bool +S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) +{ + const U8* send; + const U8* x = s; + + PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; + + if (len == 0) { + len = strlen((const char *)s); + } + + send = s + len; + +/* This looks like 0x010101... */ +# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) + +/* This looks like 0x808080... */ +# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) +# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T) +# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) + +/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by + * or'ing together the lowest bits of 'x'. Hopefully the final term gets + * optimized out completely on a 32-bit system, and its mask gets optimized out + * on a 64-bit system */ +# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ + | ( PTR2nat(x) >> 1) \ + | ( ( (PTR2nat(x) \ + & PERL_WORD_BOUNDARY_MASK) >> 2)))) + +#ifndef EBCDIC + + /* Do the word-at-a-time iff there is at least one usable full word. That + * means that after advancing to a word boundary, there still is at least a + * full word left. The number of bytes needed to advance is 'wordsize - + * offset' unless offset is 0. */ + if ((STRLEN) (send - x) >= PERL_WORDSIZE + + /* This term is wordsize if subword; 0 if not */ + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) + + /* 'offset' */ + - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) + { + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { + if (! UTF8_IS_INVARIANT(*x)) { + if (ep) { + *ep = x; + } + + return FALSE; + } + x++; + } + + /* Here, we know we have at least one full word to process. Process + * per-word as long as we have at least a full word left */ + do { + if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { + + /* Found a variant. Just return if caller doesn't want its + * exact position */ + if (! ep) { + return FALSE; + } + +# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \ + || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 + + *ep = x + _variant_byte_number(* (PERL_UINTMAX_T *) x); + assert(*ep >= s && *ep < send); + + return FALSE; + +# else /* If weird byte order, drop into next loop to do byte-at-a-time + checks. */ + + break; +# endif + } + + x += PERL_WORDSIZE; + + } while (x + PERL_WORDSIZE <= send); + } + +#endif /* End of ! EBCDIC */ + + /* Process per-byte */ + while (x < send) { + if (! UTF8_IS_INVARIANT(*x)) { + if (ep) { + *ep = x; + } + + return FALSE; + } + + x++; + } + + return TRUE; +} + +#ifndef EBCDIC + +PERL_STATIC_INLINE unsigned int +S__variant_byte_number(PERL_UINTMAX_T word) +{ + + /* This returns the position in a word (0..7) of the first variant byte in + * it. This is a helper function. Note that there are no branches */ + + assert(word); + + /* Get just the msb bits of each byte */ + word &= PERL_VARIANTS_WORD_MASK; + +# ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the + easiest thing is to hide that from the callers */ + { + unsigned int i; + const U8 * s = (U8 *) &word; + dTHX; + + for (i = 0; i < sizeof(word); i++ ) { + if (s[i]) { + return i; + } + } + + Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n", + __FILE__, __LINE__); + } + +# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 + + /* Bytes are stored like + * Byte8 ... Byte2 Byte1 + * 63..56...15...8 7...0 + * + * Isolate the lsb; + * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set + * + * The word will look this this, with a rightmost set bit in position 's': + * ('x's are don't cares) + * s + * x..x100..0 + * x..xx10..0 Right shift (rightmost 0 is shifted off) + * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and + * the 1 just to their left into a 0; the remainder is + * untouched + * 0..0011..1 The xor with the original, x..xx10..0, clears that + * remainder, sets the bottom to all 1 + * 0..0100..0 Add 1 to clear the word except for the bit in 's' + * + * Another method is to do 'word &= -word'; but it generates a compiler + * message on some platforms about taking the negative of an unsigned */ + + word >>= 1; + word = 1 + (word ^ (word - 1)); + +# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 + + /* Bytes are stored like + * Byte1 Byte2 ... Byte8 + * 63..56 55..47 ... 7...0 + * + * Isolate the msb; http://codeforces.com/blog/entry/10330 + * + * Only the most significant set bit matters. Or'ing word with its right + * shift of 1 makes that bit and the next one to its right both 1. Then + * right shifting by 2 makes for 4 1-bits in a row. ... We end with the + * msb and all to the right being 1. */ + word |= word >> 1; + word |= word >> 2; + word |= word >> 4; + word |= word >> 8; + word |= word >> 16; + word |= word >> 32; /* This should get optimized out on 32-bit systems. */ + + /* Then subtracting the right shift by 1 clears all but the left-most of + * the 1 bits, which is our desired result */ + word -= (word >> 1); + +# else +# error Unexpected byte order +# endif + + /* Here 'word' has a single bit set: the msb of the first byte in which it + * is set. Calculate that position in the word. We can use this + * specialized solution: https://stackoverflow.com/a/32339674/1626653, + * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should + * just get shifted off at compile time) */ + word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48) + | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32) + | (39 << 24) | (47 << 16) + | (55 << 8) | (63 << 0)); + word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */ + + /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */ + word = ((word + 1) >> 3) - 1; + +# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 + + /* And invert the result */ + word = CHARBITS - word - 1; + +# endif + + return (unsigned int) word; +} + +#endif +#if defined(PERL_CORE) || defined(PERL_EXT) + +/* +=for apidoc variant_under_utf8_count + +This function looks at the sequence of bytes between C and C, which are +assumed to be encoded in ASCII/Latin1, and returns how many of them would +change should the string be translated into UTF-8. Due to the nature of UTF-8, +each of these would occupy two bytes instead of the single one in the input +string. Thus, this function returns the precise number of bytes the string +would expand by when translated to UTF-8. + +Unlike most of the other functions that have C in their name, the input +to this function is NOT a UTF-8-encoded string. The function name is slightly +I to emphasize this. + +This function is internal to Perl because khw thinks that any XS code that +would want this is probably operating too close to the internals. Presenting a +valid use case could change that. + +See also +C> +and +C>, + +=cut + +*/ + +PERL_STATIC_INLINE Size_t +S_variant_under_utf8_count(const U8* const s, const U8* const e) +{ + const U8* x = s; + Size_t count = 0; + + PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; + +# ifndef EBCDIC + + /* Test if the string is long enough to use word-at-a-time. (Logic is the + * same as for is_utf8_invariant_string()) */ + if ((STRLEN) (e - x) >= PERL_WORDSIZE + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) + - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) + { + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { + count += ! UTF8_IS_INVARIANT(*x++); + } + + /* Process per-word as long as we have at least a full word left */ + do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an + explanation of how this works */ + PERL_UINTMAX_T increment + = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) + * PERL_COUNT_MULTIPLIER) + >> ((PERL_WORDSIZE - 1) * CHARBITS); + count += (Size_t) increment; + x += PERL_WORDSIZE; + } while (x + PERL_WORDSIZE <= e); + } + +# endif + + /* Process per-byte */ + while (x < e) { + if (! UTF8_IS_INVARIANT(*x)) { + count++; + } + + x++; + } + + return count; +} + +#endif + +#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */ +# undef PERL_WORDSIZE +# undef PERL_COUNT_MULTIPLIER +# undef PERL_WORD_BOUNDARY_MASK +# undef PERL_VARIANTS_WORD_MASK +#endif + +/* +=for apidoc is_utf8_string + +Returns TRUE if the first C bytes of string C form a valid +Perl-extended-UTF-8 string; returns FALSE otherwise. If C is 0, it will +be calculated using C (which means if you use this option, that C +can't have embedded C characters and has to have a terminating C +byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. + +This function considers Perl's extended UTF-8 to be valid. That means that +code points above Unicode, surrogates, and non-character code points are +considered valid by this function. Use C>, +C>, or C> to restrict what +code points are considered valid. + +See also +C>, +C>, +C>, +C>, +C>, +C>, +C>, + +=cut +*/ + +#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL) + +#if defined(PERL_CORE) || defined (PERL_EXT) + +/* +=for apidoc is_utf8_non_invariant_string + +Returns TRUE if L returns FALSE for the first +C bytes of the string C, but they are, nonetheless, legal Perl-extended +UTF-8; otherwise returns FALSE. + +A TRUE return means that at least one code point represented by the sequence +either is a wide character not representable as a single byte, or the +representation differs depending on whether the sequence is encoded in UTF-8 or +not. + +See also +C>, +C> + +=cut + +This is commonly used to determine if a SV's UTF-8 flag should be turned on. +It generally needn't be if its string is entirely UTF-8 invariant, and it +shouldn't be if it otherwise contains invalid UTF-8. + +It is an internal function because khw thinks that XS code shouldn't be working +at this low a level. A valid use case could change that. + +*/ + +PERL_STATIC_INLINE bool +S_is_utf8_non_invariant_string(const U8* const s, STRLEN len) +{ + const U8 * first_variant; + + PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING; + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + return FALSE; + } + + return is_utf8_string(first_variant, len - (first_variant - s)); +} + +#endif + +/* +=for apidoc is_strict_utf8_string + +Returns TRUE if the first C bytes of string C form a valid +UTF-8-encoded string that is fully interchangeable by any application using +Unicode rules; otherwise it returns FALSE. If C is 0, it will be +calculated using C (which means if you use this option, that C +can't have embedded C characters and has to have a terminating C +byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'. + +This function returns FALSE for strings containing any +code points above the Unicode max of 0x10FFFF, surrogate code points, or +non-character code points. + +See also +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +and +C>. + +=cut +*/ + +#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL) + +/* +=for apidoc is_c9strict_utf8_string + +Returns TRUE if the first C bytes of string C form a valid +UTF-8-encoded string that conforms to +L; +otherwise it returns FALSE. If C is 0, it will be calculated using +C (which means if you use this option, that C can't have embedded +C characters and has to have a terminating C byte). Note that all +characters being ASCII constitute 'a valid UTF-8 string'. + +This function returns FALSE for strings containing any code points above the +Unicode max of 0x10FFFF or surrogate code points, but accepts non-character +code points per +L. + +See also +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +and +C>. + +=cut +*/ + +#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0) + +/* +=for apidoc is_utf8_string_flags + +Returns TRUE if the first C bytes of string C form a valid +UTF-8 string, subject to the restrictions imposed by C; +returns FALSE otherwise. If C is 0, it will be calculated +using C (which means if you use this option, that C can't have +embedded C characters and has to have a terminating C byte). Note +that all characters being ASCII constitute 'a valid UTF-8 string'. + +If C is 0, this gives the same results as C>; if +C is C, this gives the same results +as C>; and if C is +C, this gives the same results as +C>. Otherwise C may be any +combination of the C> flags understood by +C>, with the same meanings. + +See also +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +C>, +and +C>. + +=cut +*/ + +PERL_STATIC_INLINE bool +S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) +{ + const U8 * first_variant; + + PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; + assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_DISALLOW_PERL_EXTENDED))); + + if (len == 0) { + len = strlen((const char *)s); + } + + if (flags == 0) { + return is_utf8_string(s, len); + } + + if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) + == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) + { + return is_strict_utf8_string(s, len); + } + + if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) + == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) + { + return is_c9strict_utf8_string(s, len); + } + + if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { + const U8* const send = s + len; + const U8* x = first_variant; + + while (x < send) { + STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); + if (UNLIKELY(! cur_len)) { + return FALSE; + } + x += cur_len; + } + } + + return TRUE; +} + +/* + +=for apidoc is_utf8_string_loc + +Like C> but stores the location of the failure (in the +case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer. + +See also C>. + +=cut +*/ + +#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) + +/* + +=for apidoc is_utf8_string_loclen + +Like C> but stores the location of the failure (in the +case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer, and the number of UTF-8 +encoded characters in the C pointer. + +See also C>. + +=cut +*/ + +PERL_STATIC_INLINE bool +Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +{ + const U8 * first_variant; + + PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; + + if (len == 0) { + len = strlen((const char *) s); + } + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isUTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } +} + +/* + +=for apidoc isUTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8, as extended by Perl, +that represents some code point; otherwise it evaluates to 0. If non-zero, the +value gives how many bytes starting at C comprise the code point's +representation. Any bytes remaining before C, but beyond the ones needed to +form the first code point in C, are not examined. + +The code point can be any that will fit in an IV on this machine, using Perl's +extension to official UTF-8 to represent those higher than the Unicode maximum +of 0x10FFFF. That means that this macro is used to efficiently decide if the +next few bytes in C is legal UTF-8 for a single character. + +Use C> to restrict the acceptable code points to those +defined by Unicode to be fully interchangeable across applications; +C> to use the L definition of allowable +code points; and C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC +machines) is a valid UTF-8 character. + +=cut + +This uses an adaptation of the table and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adapation is +documented at the definition of PL_extended_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isUTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISUTF8_CHAR; + + /* This dfa is fast. If it accepts the input, it was for a well-formed, + * code point, which can be returned immediately. Otherwise, it is either + * malformed, or for the start byte FF which the dfa doesn't handle (except + * on 32-bit ASCII platforms where it trivially is an error). Call a + * helper function for the other platforms. */ + + while (s < e && LIKELY(state != 1)) { + state = PL_extended_utf8_dfa_tab[256 + + state + + PL_extended_utf8_dfa_tab[*s]]; + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + +#if defined(UV_IS_QUAD) || defined(EBCDIC) + + if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) { + return _is_utf8_char_helper(s0, e, 0); + } + +#endif + + return 0; +} + +/* + +=for apidoc isSTRICT_UTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8 that represents some +Unicode code point completely acceptable for open interchange between all +applications; otherwise it evaluates to 0. If non-zero, the value gives how +many bytes starting at C comprise the code point's representation. Any +bytes remaining before C, but beyond the ones needed to form the first code +point in C, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not +be a surrogate nor a non-character code point. Thus this excludes any code +point from Perl's extended UTF-8. + +This is used to efficiently decide if the next few bytes in C is +legal Unicode-acceptable UTF-8 for a single character. + +Use C> to use the L definition of allowable +code points; C> to check for Perl's extended UTF-8; +and C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adapation is +documented at the definition of strict_extended_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + +#ifndef EBCDIC + + /* The dfa above drops out for certain Hanguls; handle them specially */ + if (is_HANGUL_ED_utf8_safe(s0, e)) { + return 3; + } + +#endif + + return 0; +} + +/* + +=for apidoc isC9_STRICT_UTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8 that represents some +Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, +the value gives how many bytes starting at C comprise the code point's +representation. Any bytes remaining before C, but beyond the ones needed to +form the first code point in C, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF. This +differs from C> only in that it accepts non-character +code points. This corresponds to +L. +which said that non-character code points are merely discouraged rather than +completely forbidden in open interchange. See +L. + +Use C> to check for Perl's extended UTF-8; and +C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adapation is +documented at the definition of PL_c9_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + + return 0; +} + +/* + +=for apidoc is_strict_utf8_string_loc + +Like C> but stores the location of the failure (in the +case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer. + +See also C>. + +=cut +*/ + +#define is_strict_utf8_string_loc(s, len, ep) \ + is_strict_utf8_string_loclen(s, len, ep, 0) + +/* + +=for apidoc is_strict_utf8_string_loclen + +Like C> but stores the location of the failure (in the +case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer, and the number of UTF-8 +encoded characters in the C pointer. + +See also C>. + +=cut +*/ + +PERL_STATIC_INLINE bool +S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +{ + const U8 * first_variant; + + PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; + + if (len == 0) { + len = strlen((const char *) s); + } + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } +} + +/* + +=for apidoc is_c9strict_utf8_string_loc + +Like C> but stores the location of the failure (in +the case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer. + +See also C>. + +=cut +*/ + +#define is_c9strict_utf8_string_loc(s, len, ep) \ + is_c9strict_utf8_string_loclen(s, len, ep, 0) + +/* + +=for apidoc is_c9strict_utf8_string_loclen + +Like C> but stores the location of the failure (in +the case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer, and the number of UTF-8 encoded +characters in the C pointer. + +See also C>. + +=cut +*/ + +PERL_STATIC_INLINE bool +S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +{ + const U8 * first_variant; + + PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; + + if (len == 0) { + len = strlen((const char *) s); + } + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } +} + +/* + +=for apidoc is_utf8_string_loc_flags + +Like C> but stores the location of the failure (in the +case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer. + +See also C>. + +=cut +*/ + +#define is_utf8_string_loc_flags(s, len, ep, flags) \ + is_utf8_string_loclen_flags(s, len, ep, 0, flags) + + +/* The above 3 actual functions could have been moved into the more general one + * just below, and made #defines that call it with the right 'flags'. They are + * currently kept separate to increase their chances of getting inlined */ + +/* + +=for apidoc is_utf8_string_loclen_flags + +Like C> but stores the location of the failure (in the +case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer, and the number of UTF-8 +encoded characters in the C pointer. + +See also C>. + +=cut +*/ + +PERL_STATIC_INLINE bool +S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) +{ + const U8 * first_variant; + + PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; + assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_DISALLOW_PERL_EXTENDED))); + + if (len == 0) { + len = strlen((const char *) s); + } + + if (flags == 0) { + return is_utf8_string_loclen(s, len, ep, el); + } + + if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) + == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) + { + return is_strict_utf8_string_loclen(s, len, ep, el); + } + + if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) + == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) + { + return is_c9strict_utf8_string_loclen(s, len, ep, el); + } + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } +} + +/* +=for apidoc utf8_distance -PERL_STATIC_INLINE void -SvAMAGIC_on(SV *sv) -{ - assert(SvROK(sv)); - if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv))); -} -PERL_STATIC_INLINE void -SvAMAGIC_off(SV *sv) +Returns the number of UTF-8 characters between the UTF-8 pointers C +and C. + +WARNING: use only if you *know* that the pointers point inside the +same UTF-8 buffer. + +=cut +*/ + +PERL_STATIC_INLINE IV +Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) { - if (SvROK(sv) && SvOBJECT(SvRV(sv))) - HvAMAGIC_off(SvSTASH(SvRV(sv))); + PERL_ARGS_ASSERT_UTF8_DISTANCE; + + return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); } -PERL_STATIC_INLINE U32 -S_SvPADSTALE_on(SV *sv) +/* +=for apidoc utf8_hop + +Return the UTF-8 pointer C displaced by C characters, either +forward or backward. + +WARNING: do not use the following unless you *know* C is within +the UTF-8 data pointed to by C *and* that on entry C is aligned +on the first byte of character or just after the last byte of a character. + +=cut +*/ + +PERL_STATIC_INLINE U8 * +Perl_utf8_hop(const U8 *s, SSize_t off) { - assert(!(SvFLAGS(sv) & SVs_PADTMP)); - return SvFLAGS(sv) |= SVs_PADSTALE; + PERL_ARGS_ASSERT_UTF8_HOP; + + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + + if (off >= 0) { + while (off--) + s += UTF8SKIP(s); + } + else { + while (off++) { + s--; + while (UTF8_IS_CONTINUATION(*s)) + s--; + } + } + GCC_DIAG_IGNORE(-Wcast-qual) + return (U8 *)s; + GCC_DIAG_RESTORE } -PERL_STATIC_INLINE U32 -S_SvPADSTALE_off(SV *sv) + +/* +=for apidoc utf8_hop_forward + +Return the UTF-8 pointer C displaced by up to C characters, +forward. + +C must be non-negative. + +C must be before or equal to C. + +When moving forward it will not move beyond C. + +Will not exceed this limit even if the string is not valid "UTF-8". + +=cut +*/ + +PERL_STATIC_INLINE U8 * +Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) { - assert(!(SvFLAGS(sv) & SVs_PADTMP)); - return SvFLAGS(sv) &= ~SVs_PADSTALE; + PERL_ARGS_ASSERT_UTF8_HOP_FORWARD; + + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + + assert(s <= end); + assert(off >= 0); + + while (off--) { + STRLEN skip = UTF8SKIP(s); + if ((STRLEN)(end - s) <= skip) { + GCC_DIAG_IGNORE(-Wcast-qual) + return (U8 *)end; + GCC_DIAG_RESTORE + } + s += skip; + } + + GCC_DIAG_IGNORE(-Wcast-qual) + return (U8 *)s; + GCC_DIAG_RESTORE } -#if defined(PERL_CORE) || defined (PERL_EXT) -PERL_STATIC_INLINE STRLEN -S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp) + +/* +=for apidoc utf8_hop_back + +Return the UTF-8 pointer C displaced by up to C characters, +backward. + +C must be non-positive. + +C must be after or equal to C. + +When moving backward it will not move before C. + +Will not exceed this limit even if the string is not valid "UTF-8". + +=cut +*/ + +PERL_STATIC_INLINE U8 * +Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) { - PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B; - if (SvGAMAGIC(sv)) { - U8 *hopped = utf8_hop((U8 *)pv, pos); - if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped); - return (STRLEN)(hopped - (U8 *)pv); + PERL_ARGS_ASSERT_UTF8_HOP_BACK; + + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + + assert(start <= s); + assert(off <= 0); + + while (off++ && s > start) { + do { + s--; + } while (UTF8_IS_CONTINUATION(*s) && s > start); } - return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN); + + GCC_DIAG_IGNORE(-Wcast-qual) + return (U8 *)s; + GCC_DIAG_RESTORE } -#endif -/* ------------------------------- handy.h ------------------------------- */ +/* +=for apidoc utf8_hop_safe -/* saves machine code for a common noreturn idiom typically used in Newx*() */ -#ifdef GCC_DIAG_PRAGMA -GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */ -#endif -static void -S_croak_memory_wrap(void) +Return the UTF-8 pointer C displaced by up to C characters, +either forward or backward. + +When moving backward it will not move before C. + +When moving forward it will not move beyond C. + +Will not exceed those limits even if the string is not valid "UTF-8". + +=cut +*/ + +PERL_STATIC_INLINE U8 * +Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) { - Perl_croak_nocontext("%s",PL_memory_wrap); + PERL_ARGS_ASSERT_UTF8_HOP_SAFE; + + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g + * the bitops (especially ~) can create illegal UTF-8. + * In other words: in Perl UTF-8 is not just for Unicode. */ + + assert(start <= s && s <= end); + + if (off >= 0) { + return utf8_hop_forward(s, off, end); + } + else { + return utf8_hop_back(s, off, start); + } } -#ifdef GCC_DIAG_PRAGMA -GCC_DIAG_RESTORE /* Intentionally left semicolonless. */ -#endif -/* ------------------------------- utf8.h ------------------------------- */ +/* -PERL_STATIC_INLINE void -S_append_utf8_from_native_byte(const U8 byte, U8** dest) +=for apidoc is_utf8_valid_partial_char + +Returns 0 if the sequence of bytes starting at C and looking no further than +S> is the UTF-8 encoding, as extended by Perl, for one or more code +points. Otherwise, it returns 1 if there exists at least one non-empty +sequence of bytes that when appended to sequence C, starting at position +C causes the entire sequence to be the well-formed UTF-8 of some code point; +otherwise returns 0. + +In other words this returns TRUE if C points to a partial UTF-8-encoded code +point. + +This is useful when a fixed-length buffer is being tested for being well-formed +UTF-8, but the final few bytes in it don't comprise a full character; that is, +it is split somewhere in the middle of the final code point's UTF-8 +representation. (Presumably when the buffer is refreshed with the next chunk +of data, the new first bytes will complete the partial code point.) This +function is used to verify that the final bytes in the current buffer are in +fact the legal beginning of some code point, so that if they aren't, the +failure can be signalled without having to wait for the next read. + +=cut +*/ +#define is_utf8_valid_partial_char(s, e) \ + is_utf8_valid_partial_char_flags(s, e, 0) + +/* + +=for apidoc is_utf8_valid_partial_char_flags + +Like C>, it returns a boolean giving whether +or not the input is a valid UTF-8 encoded partial character, but it takes an +extra parameter, C, which can further restrict which code points are +considered valid. + +If C is 0, this behaves identically to +C>. Otherwise C can be any combination +of the C> flags accepted by C>. If +there is any sequence of bytes that can complete the input partial character in +such a way that a non-prohibited character is formed, the function returns +TRUE; otherwise FALSE. Non character code points cannot be determined based on +partial character input. But many of the other possible excluded types can be +determined from just the first one or two bytes. + +=cut + */ + +PERL_STATIC_INLINE bool +S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags) { - /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 - * encoded string at '*dest', updating '*dest' to include it */ + PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; - PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; + assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_DISALLOW_PERL_EXTENDED))); - if (NATIVE_BYTE_IS_INVARIANT(byte)) - *(*dest)++ = byte; - else { - *(*dest)++ = UTF8_EIGHT_BIT_HI(byte); - *(*dest)++ = UTF8_EIGHT_BIT_LO(byte); + if (s >= e || s + UTF8SKIP(s) <= e) { + return FALSE; } + + return cBOOL(_is_utf8_char_helper(s, e, flags)); } /* -A helper function for the macro isUTF8_CHAR(), which should be used instead of -this function. The macro will handle smaller code points directly saving time, -using this function as a fall-back for higher code points. +=for apidoc is_utf8_fixed_width_buf_flags -Tests if the first bytes of string C form a valid UTF-8 character. 0 is -returned if the bytes starting at C up to but not including C do not form a -complete well-formed UTF-8 character; otherwise the number of bytes in the -character is returned. +Returns TRUE if the fixed-width buffer starting at C with length C +is entirely valid UTF-8, subject to the restrictions given by C; +otherwise it returns FALSE. -Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a valid UTF-8 -character. +If C is 0, any well-formed UTF-8, as extended by Perl, is accepted +without restriction. If the final few bytes of the buffer do not form a +complete code point, this will return TRUE anyway, provided that +C> returns TRUE for them. -=cut */ -PERL_STATIC_INLINE STRLEN -S__is_utf8_char_slow(const U8 *s, const U8 *e) +If C in non-zero, it can be any combination of the +C> flags accepted by C>, and with the +same meanings. + +This function differs from C> only in that the latter +returns FALSE if the final few bytes of the string don't form a complete code +point. + +=cut + */ +#define is_utf8_fixed_width_buf_flags(s, len, flags) \ + is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags) + +/* + +=for apidoc is_utf8_fixed_width_buf_loc_flags + +Like C> but stores the location of the +failure in the C pointer. If the function returns TRUE, C<*ep> will point +to the beginning of any partial character at the end of the buffer; if there is +no partial character C<*ep> will contain C+C. + +See also C>. + +=cut +*/ + +#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \ + is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags) + +/* + +=for apidoc is_utf8_fixed_width_buf_loclen_flags + +Like C> but stores the number of +complete, valid characters found in the C pointer. + +=cut +*/ + +PERL_STATIC_INLINE bool +S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, + STRLEN len, + const U8 **ep, + STRLEN *el, + const U32 flags) +{ + const U8 * maybe_partial; + + PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS; + + if (! ep) { + ep = &maybe_partial; + } + + /* If it's entirely valid, return that; otherwise see if the only error is + * that the final few bytes are for a partial character */ + return is_utf8_string_loclen_flags(s, len, ep, el, flags) + || is_utf8_valid_partial_char_flags(*ep, s + len, flags); +} + +PERL_STATIC_INLINE UV +S_utf8n_to_uvchr_msgs(const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors, + AV ** msgs) { - dTHX; /* The function called below requires thread context */ + /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the + * simple cases, and, if necessary calls a helper function to deal with the + * more complex ones. Almost all well-formed non-problematic code points + * are considered simple, so that it's unlikely that the helper function + * will need to be called. + * + * This is an adaptation of the tables and algorithm given in + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides + * comprehensive documentation of the original version. A copyright notice + * for the original version is given at the beginning of this file. The + * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[]. + */ + + const U8 * const s0 = s; + const U8 * send = s0 + curlen; + UV uv = 0; /* The 0 silences some stupid compilers */ + UV state = 0; + + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; + + /* This dfa is fast. If it accepts the input, it was for a well-formed, + * non-problematic code point, which can be returned immediately. + * Otherwise we call a helper function to figure out the more complicated + * cases. */ + + while (s < send && LIKELY(state != 1)) { + UV type = PL_strict_utf8_dfa_tab[*s]; + + uv = (state == 0) + ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s)) + : UTF8_ACCUMULATE(uv, *s); + state = PL_strict_utf8_dfa_tab[256 + state + type]; + + if (state != 0) { + s++; + continue; + } - STRLEN actual_len; + if (retlen) { + *retlen = s - s0 + 1; + } + if (errors) { + *errors = 0; + } + if (msgs) { + *msgs = NULL; + } + + return uv; + } + + /* Here is potentially problematic. Use the full mechanism */ + return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs); +} + +PERL_STATIC_INLINE UV +S__utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) +{ + PERL_ARGS_ASSERT__UTF8_TO_UVCHR_BUF; - PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW; + assert(s < send); - assert(e >= s); - utf8n_to_uvchr(s, e - s, &actual_len, UTF8_CHECK_ONLY); + if (! ckWARN_d(WARN_UTF8)) { + return utf8n_to_uvchr(s, send - s, retlen, + (UTF8_ALLOW_ANY & ~UTF8_ALLOW_EMPTY)); + } + else { + UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); + if (retlen && ret == 0 && *s != '\0') { + *retlen = (STRLEN) -1; + } - return (actual_len == (STRLEN) -1) ? 0 : actual_len; + return ret; + } } /* ------------------------------- perl.h ----------------------------- */ @@ -307,7 +1868,7 @@ S__is_utf8_char_slow(const U8 *s, const U8 *e) /* =head1 Miscellaneous Functions -=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name +=for apidoc is_safe_syscall Test that the given C doesn't contain any internal C characters. If it does, set C to C, optionally warn, and return FALSE. @@ -373,6 +1934,40 @@ S_should_warn_nl(const char *pv) { #endif +#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C) + +PERL_STATIC_INLINE bool +S_lossless_NV_to_IV(const NV nv, IV *ivp) +{ + /* This function determines if the input NV 'nv' may be converted without + * loss of data to an IV. If not, it returns FALSE taking no other action. + * But if it is possible, it does the conversion, returning TRUE, and + * storing the converted result in '*ivp' */ + + PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV; + +# if defined(Perl_isnan) + + if (UNLIKELY(Perl_isnan(nv))) { + return FALSE; + } + +# endif + + if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) { + return FALSE; + } + + if ((IV) nv != nv) { + return FALSE; + } + + *ivp = (IV) nv; + return TRUE; +} + +#endif + /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ #define MAX_CHARSET_NAME_LENGTH 2 @@ -624,9 +2219,9 @@ S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) cx->blk_eval.cv = NULL; /* later set by doeval_compile() */ cx->blk_eval.cur_top_env = PL_top_env; - assert(!(PL_in_eval & ~ 0x7F)); + assert(!(PL_in_eval & ~ 0x3F)); assert(!(PL_op->op_type & ~0x1FF)); - cx->blk_u16 = (PL_in_eval & 0x7F) | ((U16)PL_op->op_type << 7); + cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7); } @@ -639,9 +2234,10 @@ S_cx_popeval(pTHX_ PERL_CONTEXT *cx) assert(CxTYPE(cx) == CXt_EVAL); PL_in_eval = CxOLD_IN_EVAL(cx); + assert(!(PL_in_eval & 0xc0)); PL_eval_root = cx->blk_eval.old_eval_root; sv = cx->blk_eval.cur_text; - if (sv && SvSCREAM(sv)) { + if (sv && CxEVAL_TXT_REFCNTED(cx)) { cx->blk_eval.cur_text = NULL; SvREFCNT_dec_NN(sv); } @@ -769,8 +2365,114 @@ S_cx_popgiven(pTHX_ PERL_CONTEXT *cx) SvREFCNT_dec(sv); } +/* ------------------ util.h ------------------------------------------- */ + +/* +=head1 Miscellaneous Functions + +=for apidoc foldEQ + +Returns true if the leading C bytes of the strings C and C are the +same +case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes +match themselves and their opposite case counterparts. Non-cased and non-ASCII +range bytes match only themselves. + +=cut +*/ + +PERL_STATIC_INLINE I32 +Perl_foldEQ(const char *s1, const char *s2, I32 len) +{ + const U8 *a = (const U8 *)s1; + const U8 *b = (const U8 *)s2; + + PERL_ARGS_ASSERT_FOLDEQ; + + assert(len >= 0); + + while (len--) { + if (*a != *b && *a != PL_fold[*b]) + return 0; + a++,b++; + } + return 1; +} + +PERL_STATIC_INLINE I32 +Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) +{ + /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds + * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and + * does not check for this. Nor does it check that the strings each have + * at least 'len' characters. */ + + const U8 *a = (const U8 *)s1; + const U8 *b = (const U8 *)s2; + + PERL_ARGS_ASSERT_FOLDEQ_LATIN1; + + assert(len >= 0); + + while (len--) { + if (*a != *b && *a != PL_fold_latin1[*b]) { + return 0; + } + a++, b++; + } + return 1; +} + +/* +=for apidoc foldEQ_locale + +Returns true if the leading C bytes of the strings C and C are the +same case-insensitively in the current locale; false otherwise. + +=cut +*/ + +PERL_STATIC_INLINE I32 +Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) +{ + dVAR; + const U8 *a = (const U8 *)s1; + const U8 *b = (const U8 *)s2; + + PERL_ARGS_ASSERT_FOLDEQ_LOCALE; + + assert(len >= 0); + + while (len--) { + if (*a != *b && *a != PL_fold_locale[*b]) + return 0; + a++,b++; + } + return 1; +} + +#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) + +PERL_STATIC_INLINE void * +S_my_memrchr(const char * s, const char c, const STRLEN len) +{ + /* memrchr(), since many platforms lack it */ + + const char * t = s + len - 1; + + PERL_ARGS_ASSERT_MY_MEMRCHR; + + while (t >= s) { + if (*t == c) { + return (void *) t; + } + t--; + } + return NULL; +} +#endif /* * ex: set ts=8 sts=4 sw=4 et: