X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/15e4ac9a427692a356fe62b255db9e08982879f5..HEAD:/inline.h diff --git a/inline.h b/inline.h index e4a0458..4507b99 100644 --- a/inline.h +++ b/inline.h @@ -5,38 +5,267 @@ * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. * + * This file contains tables and code adapted from + * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this + * copyright notice: + +Copyright (c) 2008-2009 Bjoern Hoehrmann + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + * * This file is a home for static inline functions that cannot go in other - * headers files, because they depend on proto.h (included after most other + * header files, because they depend on proto.h (included after most other * headers) or struct definitions. * + * Note also perlstatic.h for functions that can't or shouldn't be inlined, but + * whose details should be exposed to the compiler, for such things as tail + * call optimization. + * * Each section names the header file that the functions "belong" to. */ /* ------------------------------- av.h ------------------------------- */ -PERL_STATIC_INLINE SSize_t -S_av_top_index(pTHX_ AV *av) +/* +=for apidoc_section $AV +=for apidoc av_count +Returns the number of elements in the array C. This is the true length of +the array, including any undefined elements. It is always the same as +S>. + +=cut +*/ +PERL_STATIC_INLINE Size_t +Perl_av_count(pTHX_ AV *av) +{ + PERL_ARGS_ASSERT_AV_COUNT; + assert(SvTYPE(av) == SVt_PVAV); + + return AvFILL(av) + 1; +} + +/* ------------------------------- av.c ------------------------------- */ + +/* +=for apidoc av_store_simple + +This is a cut-down version of av_store that assumes that the array is +very straightforward - no magic, not readonly, and AvREAL - and that +C is not negative. This function MUST NOT be used in situations +where any of those assumptions may not hold. + +Stores an SV in an array. The array index is specified as C. It +can be dereferenced to get the C that was stored there (= C)). + +Note that the caller is responsible for suitably incrementing the reference +count of C before the call. + +Approximate Perl equivalent: C. + +=cut +*/ + +PERL_STATIC_INLINE SV** +Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val) +{ + SV** ary; + + PERL_ARGS_ASSERT_AV_STORE_SIMPLE; + assert(SvTYPE(av) == SVt_PVAV); + assert(!SvMAGICAL(av)); + assert(!SvREADONLY(av)); + assert(AvREAL(av)); + assert(key > -1); + + ary = AvARRAY(av); + + if (AvFILLp(av) < key) { + if (key > AvMAX(av)) { + av_extend(av,key); + ary = AvARRAY(av); + } + AvFILLp(av) = key; + } else + SvREFCNT_dec(ary[key]); + + ary[key] = val; + return &ary[key]; +} + +/* +=for apidoc av_fetch_simple + +This is a cut-down version of av_fetch that assumes that the array is +very straightforward - no magic, not readonly, and AvREAL - and that +C is not negative. This function MUST NOT be used in situations +where any of those assumptions may not hold. + +Returns the SV at the specified index in the array. The C is the +index. If lval is true, you are guaranteed to get a real SV back (in case +it wasn't real before), which you can then modify. Check that the return +value is non-null before dereferencing it to a C. + +The rough perl equivalent is C<$myarray[$key]>. + +=cut +*/ + +PERL_STATIC_INLINE SV** +Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval) +{ + PERL_ARGS_ASSERT_AV_FETCH_SIMPLE; + assert(SvTYPE(av) == SVt_PVAV); + assert(!SvMAGICAL(av)); + assert(!SvREADONLY(av)); + assert(AvREAL(av)); + assert(key > -1); + + if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) { + return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL; + } else { + return &AvARRAY(av)[key]; + } +} + +/* +=for apidoc av_push_simple + +This is a cut-down version of av_push that assumes that the array is very +straightforward - no magic, not readonly, and AvREAL - and that C is +not less than -1. This function MUST NOT be used in situations where any +of those assumptions may not hold. + +Pushes an SV (transferring control of one reference count) onto the end of the +array. The array will grow automatically to accommodate the addition. + +Perl equivalent: C. + +=cut +*/ + +PERL_STATIC_INLINE void +Perl_av_push_simple(pTHX_ AV *av, SV *val) { - PERL_ARGS_ASSERT_AV_TOP_INDEX; + PERL_ARGS_ASSERT_AV_PUSH_SIMPLE; assert(SvTYPE(av) == SVt_PVAV); + assert(!SvMAGICAL(av)); + assert(!SvREADONLY(av)); + assert(AvREAL(av)); + assert(AvFILLp(av) > -2); + + (void)av_store_simple(av,AvFILLp(av)+1,val); +} + +/* +=for apidoc av_new_alloc + +This implements L> +and L>, which are the public API for this +functionality. + +Creates a new AV and allocates its SV* array. + +This is similar to, but more efficient than doing: + + AV *av = newAV(); + av_extend(av, key); + +The size parameter is used to pre-allocate a SV* array large enough to +hold at least elements C<0..(size-1)>. C must be at least 1. + +The C parameter controls whether or not the array is NULL +initialized. + +=cut +*/ + +PERL_STATIC_INLINE AV * +Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag) +{ + AV * const av = newAV(); + SV** ary; + PERL_ARGS_ASSERT_AV_NEW_ALLOC; + assert(size > 0); + + Newx(ary, size, SV*); /* Newx performs the memwrap check */ + AvALLOC(av) = ary; + AvARRAY(av) = ary; + AvMAX(av) = size - 1; + + if (zeroflag) + Zero(ary, size, SV*); + + return av; +} - return AvFILL(av); + +/* remove (AvARRAY(av) - AvALLOC(av)) offset from empty array */ + +PERL_STATIC_INLINE void +Perl_av_remove_offset(pTHX_ AV *av) +{ + PERL_ARGS_ASSERT_AV_REMOVE_OFFSET; + assert(AvFILLp(av) == -1); + SSize_t i = AvARRAY(av) - AvALLOC(av); + if (i) { + AvARRAY(av) = AvALLOC(av); + AvMAX(av) += i; +#ifdef PERL_RC_STACK + Zero(AvALLOC(av), i, SV*); +#endif + } } + /* ------------------------------- cv.h ------------------------------- */ +/* +=for apidoc_section $CV +=for apidoc CvGV +Returns the GV associated with the CV C, reifying it if necessary. + +=cut +*/ PERL_STATIC_INLINE GV * -S_CvGV(pTHX_ CV *sv) +Perl_CvGV(pTHX_ CV *sv) { + PERL_ARGS_ASSERT_CVGV; + return CvNAMED(sv) - ? Perl_cvgv_from_hek(aTHX_ sv) - : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv; + ? Perl_cvgv_from_hek(aTHX_ sv) + : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv; } +/* +=for apidoc CvDEPTH +Returns the recursion level of the CV C. Hence >= 2 indicates we are in a +recursive call. + +=cut +*/ PERL_STATIC_INLINE I32 * -S_CvDEPTHp(const CV * const sv) +Perl_CvDEPTH(const CV * const sv) { + PERL_ARGS_ASSERT_CVDEPTH; assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM); + return &((XPVCV*)SvANY(sv))->xcv_depth; } @@ -60,14 +289,33 @@ S_strip_spaces(pTHX_ const char * orig, STRLEN * const len) tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP); tmps = SvPVX(tmpsv); while ((*len)--) { - if (!isSPACE(*orig)) - *tmps++ = *orig; - orig++; + if (!isSPACE(*orig)) + *tmps++ = *orig; + orig++; } *tmps = '\0'; *len = tmps - SvPVX(tmpsv); - return SvPVX(tmpsv); + return SvPVX(tmpsv); +} +#endif + +/* ------------------------------- iperlsys.h ------------------------------- */ +#if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS) + +/* Otherwise this function is implemented as macros in iperlsys.h */ + +PERL_STATIC_INLINE bool +S_PerlEnv_putenv(pTHX_ char * str) +{ + PERL_ARGS_ASSERT_PERLENV_PUTENV; + + ENV_LOCK; + bool retval = putenv(str); + ENV_UNLOCK; + + return retval; } + #endif /* ------------------------------- mg.h ------------------------------- */ @@ -80,12 +328,12 @@ S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len) assert(mg->mg_type == PERL_MAGIC_regex_global); assert(mg->mg_len != -1); if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv)) - return (STRLEN)mg->mg_len; + return (STRLEN)mg->mg_len; else { - const STRLEN pos = (STRLEN)mg->mg_len; - /* Without this check, we may read past the end of the buffer: */ - if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1; - return sv_or_pv_pos_u2b(sv, s, pos, NULL); + const STRLEN pos = (STRLEN)mg->mg_len; + /* Without this check, we may read past the end of the buffer: */ + if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1; + return sv_or_pv_pos_u2b(sv, s, pos, NULL); } } #endif @@ -94,249 +342,970 @@ S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len) #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C) PERL_STATIC_INLINE bool -PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) +S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) { + PERL_ARGS_ASSERT_PADNAMEIN_SCOPE; + /* is seq within the range _LOW to _HIGH ? * This is complicated by the fact that PL_cop_seqmax * may have wrapped around at some point */ if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO) - return FALSE; /* not yet introduced */ + return FALSE; /* not yet introduced */ if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) { /* in compiling scope */ - if ( - (seq > COP_SEQ_RANGE_LOW(pn)) - ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1)) - : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1)) - ) - return TRUE; + if ( + (seq > COP_SEQ_RANGE_LOW(pn)) + ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1)) + : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1)) + ) + return TRUE; } else if ( - (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn)) - ? - ( seq > COP_SEQ_RANGE_LOW(pn) - || seq <= COP_SEQ_RANGE_HIGH(pn)) + (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn)) + ? + ( seq > COP_SEQ_RANGE_LOW(pn) + || seq <= COP_SEQ_RANGE_HIGH(pn)) - : ( seq > COP_SEQ_RANGE_LOW(pn) - && seq <= COP_SEQ_RANGE_HIGH(pn)) + : ( seq > COP_SEQ_RANGE_LOW(pn) + && seq <= COP_SEQ_RANGE_HIGH(pn)) ) - return TRUE; + return TRUE; return FALSE; } #endif /* ------------------------------- pp.h ------------------------------- */ -PERL_STATIC_INLINE I32 -S_TOPMARK(pTHX) +PERL_STATIC_INLINE Stack_off_t +Perl_TOPMARK(pTHX) { DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, - "MARK top %p %" IVdf "\n", - PL_markstack_ptr, - (IV)*PL_markstack_ptr))); + "MARK top %p %" IVdf "\n", + PL_markstack_ptr, + (IV)*PL_markstack_ptr))); return *PL_markstack_ptr; } -PERL_STATIC_INLINE I32 -S_POPMARK(pTHX) +PERL_STATIC_INLINE Stack_off_t +Perl_POPMARK(pTHX) { DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, - "MARK pop %p %" IVdf "\n", - (PL_markstack_ptr-1), - (IV)*(PL_markstack_ptr-1)))); + "MARK pop %p %" IVdf "\n", + (PL_markstack_ptr-1), + (IV)*(PL_markstack_ptr-1)))); assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow"); return *PL_markstack_ptr--; } -/* ----------------------------- regexp.h ----------------------------- */ +/* +=for apidoc_section $rpp -PERL_STATIC_INLINE struct regexp * -S_ReANY(const REGEXP * const re) -{ - XPV* const p = (XPV*)SvANY(re); - assert(isREGEXP(re)); - return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx - : (struct regexp *)p; -} +=for apidoc rpp_extend +Ensures that there is space on the stack to push C items, extending it +if necessary. -/* ------------------------------- sv.h ------------------------------- */ +=cut +*/ -PERL_STATIC_INLINE SV * -S_SvREFCNT_inc(SV *sv) -{ - if (LIKELY(sv != NULL)) - SvREFCNT(sv)++; - return sv; -} -PERL_STATIC_INLINE SV * -S_SvREFCNT_inc_NN(SV *sv) -{ - SvREFCNT(sv)++; - return sv; -} PERL_STATIC_INLINE void -S_SvREFCNT_inc_void(SV *sv) +Perl_rpp_extend(pTHX_ SSize_t n) { - if (LIKELY(sv != NULL)) - SvREFCNT(sv)++; + PERL_ARGS_ASSERT_RPP_EXTEND; + + EXTEND_HWM_SET(PL_stack_sp, n); +#ifndef STRESS_REALLOC + if (UNLIKELY(_EXTEND_NEEDS_GROW(PL_stack_sp, n))) +#endif + { + (void)stack_grow(PL_stack_sp, PL_stack_sp, n); + } } + + +/* +=for apidoc rpp_popfree_to + +Pop and free all items on the argument stack above C. On return, +C will be equal to C. + +=cut +*/ + PERL_STATIC_INLINE void -S_SvREFCNT_dec(pTHX_ SV *sv) +Perl_rpp_popfree_to(pTHX_ SV **sp) { - if (LIKELY(sv != NULL)) { - U32 rc = SvREFCNT(sv); - if (LIKELY(rc > 1)) - SvREFCNT(sv) = rc - 1; - else - Perl_sv_free2(aTHX_ sv, rc); + PERL_ARGS_ASSERT_RPP_POPFREE_TO; + + assert(sp <= PL_stack_sp); +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + while (PL_stack_sp > sp) { + SV *sv = *PL_stack_sp--; + SvREFCNT_dec(sv); } +#else + PL_stack_sp = sp; +#endif } + +/* +=for apidoc rpp_popfree_to_NN + +A variant of rpp_popfree_to() which assumes that all the pointers being +popped off the stack are non-NULL. + +=cut +*/ + PERL_STATIC_INLINE void -S_SvREFCNT_dec_NN(pTHX_ SV *sv) +Perl_rpp_popfree_to_NN(pTHX_ SV **sp) { - U32 rc = SvREFCNT(sv); - if (LIKELY(rc > 1)) - SvREFCNT(sv) = rc - 1; - else - Perl_sv_free2(aTHX_ sv, rc); + PERL_ARGS_ASSERT_RPP_POPFREE_TO_NN; + + assert(sp <= PL_stack_sp); +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + while (PL_stack_sp > sp) { + SV *sv = *PL_stack_sp--; + assert(sv); + SvREFCNT_dec_NN(sv); + } +#else + PL_stack_sp = sp; +#endif } + +/* +=for apidoc rpp_popfree_1 + +Pop and free the top item on the argument stack and update C. + +=cut +*/ + PERL_STATIC_INLINE void -SvAMAGIC_on(SV *sv) +Perl_rpp_popfree_1(pTHX) { - assert(SvROK(sv)); - if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv))); + PERL_ARGS_ASSERT_RPP_POPFREE_1; + +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *sv = *PL_stack_sp--; + SvREFCNT_dec(sv); +#else + PL_stack_sp--; +#endif } + + +/* +=for apidoc rpp_popfree_1_NN + +A variant of rpp_popfree_1() which assumes that the pointer being popped +off the stack is non-NULL. + +=cut +*/ + PERL_STATIC_INLINE void -SvAMAGIC_off(SV *sv) +Perl_rpp_popfree_1_NN(pTHX) { - if (SvROK(sv) && SvOBJECT(SvRV(sv))) - HvAMAGIC_off(SvSTASH(SvRV(sv))); + PERL_ARGS_ASSERT_RPP_POPFREE_1_NN; + + assert(*PL_stack_sp); +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *sv = *PL_stack_sp--; + SvREFCNT_dec_NN(sv); +#else + PL_stack_sp--; +#endif } -PERL_STATIC_INLINE U32 -S_SvPADSTALE_on(SV *sv) -{ - assert(!(SvFLAGS(sv) & SVs_PADTMP)); - return SvFLAGS(sv) |= SVs_PADSTALE; -} -PERL_STATIC_INLINE U32 -S_SvPADSTALE_off(SV *sv) -{ - assert(!(SvFLAGS(sv) & SVs_PADTMP)); - return SvFLAGS(sv) &= ~SVs_PADSTALE; -} -#if defined(PERL_CORE) || defined (PERL_EXT) -PERL_STATIC_INLINE STRLEN -S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp) -{ - PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B; - if (SvGAMAGIC(sv)) { - U8 *hopped = utf8_hop((U8 *)pv, pos); - if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped); - return (STRLEN)(hopped - (U8 *)pv); - } - return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN); -} -#endif -/* ------------------------------- handy.h ------------------------------- */ +/* +=for apidoc rpp_popfree_2 -/* saves machine code for a common noreturn idiom typically used in Newx*() */ -#ifdef GCC_DIAG_PRAGMA -GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */ -#endif -static void -S_croak_memory_wrap(void) +Pop and free the top two items on the argument stack and update +C. + +=cut +*/ + + +PERL_STATIC_INLINE void +Perl_rpp_popfree_2(pTHX) { - Perl_croak_nocontext("%s",PL_memory_wrap); -} -#ifdef GCC_DIAG_PRAGMA -GCC_DIAG_RESTORE /* Intentionally left semicolonless. */ + PERL_ARGS_ASSERT_RPP_POPFREE_2; + +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + for (int i = 0; i < 2; i++) { + SV *sv = *PL_stack_sp--; + SvREFCNT_dec(sv); + } +#else + PL_stack_sp -= 2; #endif +} -/* ------------------------------- utf8.h ------------------------------- */ /* -=head1 Unicode Support +=for apidoc rpp_popfree_2_NN + +A variant of rpp_popfree_2() which assumes that the two pointers being +popped off the stack are non-NULL. + +=cut */ + PERL_STATIC_INLINE void -S_append_utf8_from_native_byte(const U8 byte, U8** dest) +Perl_rpp_popfree_2_NN(pTHX) { - /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 - * encoded string at '*dest', updating '*dest' to include it */ - - PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; - - if (NATIVE_BYTE_IS_INVARIANT(byte)) - *((*dest)++) = byte; + PERL_ARGS_ASSERT_RPP_POPFREE_2_NN; +#ifdef PERL_RC_STACK + SV *sv2 = *PL_stack_sp--; + assert(sv2); + SV *sv1 = *PL_stack_sp; + assert(sv1); + + assert(rpp_stack_is_rc()); + U32 rc1 = SvREFCNT(sv1); + U32 rc2 = SvREFCNT(sv2); + /* This expression is intended to be true if either of rc1 or rc2 has + * the value 0 or 1, but using only a single branch test, rather + * than the two branches that a compiler would plant for a boolean + * expression. We are working on the assumption that, most of the + * time, neither of the args to a binary function will need to be + * freed - they're likely to lex vars, or PADTMPs or whatever. + * So give the CPU a single branch that is rarely taken. */ + if (UNLIKELY( !(rc1>>1) + !(rc2>>1) )) + /* at least one of the old SVs needs freeing. Do it the long way */ + Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2); else { - *((*dest)++) = UTF8_EIGHT_BIT_HI(byte); - *((*dest)++) = UTF8_EIGHT_BIT_LO(byte); + SvREFCNT(sv1) = rc1 - 1; + SvREFCNT(sv2) = rc2 - 1; } + PL_stack_sp--; +#else + PL_stack_sp -= 2; +#endif } + /* -=for apidoc valid_utf8_to_uvchr -Like C>, but should only be called when it is known that -the next character in the input UTF-8 string C is well-formed (I, -it passes C>. Surrogates, non-character code points, and -non-Unicode code points are allowed. +=for apidoc rpp_pop_1_norc -=cut +Pop and return the top item off the argument stack and update +C. It's similar to rpp_popfree_1(), except that it actually +returns a value, and it I decrement the SV's reference count. +On non-C builds it actually increments the SV's reference +count. - */ +This is useful in cases where the popped value is immediately embedded +somewhere e.g. via av_store(), allowing you skip decrementing and then +immediately incrementing the reference count again (and risk prematurely +freeing the SV if it had a RC of 1). On non-RC builds, the reference count +bookkeeping still works too, which is why it should be used rather than +a simple C<*PL_stack_sp-->. -PERL_STATIC_INLINE UV -Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) +=cut +*/ + +PERL_STATIC_INLINE SV* +Perl_rpp_pop_1_norc(pTHX) { - const UV expectlen = UTF8SKIP(s); - const U8* send = s + expectlen; - UV uv = *s; + PERL_ARGS_ASSERT_RPP_POP_1_NORC - PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; + SV *sv = *PL_stack_sp--; - if (retlen) { - *retlen = expectlen; - } +#ifndef PERL_RC_STACK + SvREFCNT_inc(sv); +#else + assert(rpp_stack_is_rc()); +#endif + return sv; +} - /* An invariant is trivially returned */ - if (expectlen == 1) { - return uv; - } - /* Remove the leading bits that indicate the number of bytes, leaving just - * the bits that are part of the value */ - uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); - /* Now, loop through the remaining bytes, accumulating each into the - * working total as we go. (I khw tried unrolling the loop for up to 4 - * bytes, but there was no performance improvement) */ - for (++s; s < send; s++) { - uv = UTF8_ACCUMULATE(uv, *s); - } +/* +=for apidoc rpp_push_1 +=for apidoc_item rpp_push_IMM +=for apidoc_item rpp_push_2 +=for apidoc_item rpp_xpush_1 +=for apidoc_item rpp_xpush_IMM +=for apidoc_item rpp_xpush_2 + +Push one or two SVs onto the stack, incrementing their reference counts +and updating C. With the C variants, it extends the stack +first. The C variants assume that the single argument is an immortal +such as <&PL_sv_undef> and, for efficiency, will skip incrementing its +reference count. - return UNI_TO_NATIVE(uv); +=cut +*/ +PERL_STATIC_INLINE void +Perl_rpp_push_1(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_PUSH_1; + + *++PL_stack_sp = sv; +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SvREFCNT_inc_simple_void_NN(sv); +#endif } -/* -=for apidoc is_utf8_invariant_string +PERL_STATIC_INLINE void +Perl_rpp_push_IMM(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_PUSH_IMM; -Returns TRUE if the first C bytes of the string C are the same -regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on -EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they -are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only -the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range -characters are invariant, but so also are the C1 controls. + assert(SvIMMORTAL(sv)); + *++PL_stack_sp = sv; +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); +#endif +} -If C is 0, it will be calculated using C, (which means if you -use this option, that C can't have embedded C characters and has to -have a terminating C byte). +PERL_STATIC_INLINE void +Perl_rpp_push_2(pTHX_ SV *sv1, SV *sv2) +{ + PERL_ARGS_ASSERT_RPP_PUSH_2; + + *++PL_stack_sp = sv1; + *++PL_stack_sp = sv2; +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SvREFCNT_inc_simple_void_NN(sv1); + SvREFCNT_inc_simple_void_NN(sv2); +#endif +} -See also +PERL_STATIC_INLINE void +Perl_rpp_xpush_1(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_XPUSH_1; + + rpp_extend(1); + rpp_push_1(sv); +} + +PERL_STATIC_INLINE void +Perl_rpp_xpush_IMM(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_XPUSH_IMM; + + rpp_extend(1); + rpp_push_IMM(sv); +} + +PERL_STATIC_INLINE void +Perl_rpp_xpush_2(pTHX_ SV *sv1, SV *sv2) +{ + PERL_ARGS_ASSERT_RPP_XPUSH_2; + + rpp_extend(2); + rpp_push_2(sv1, sv2); +} + + +/* +=for apidoc rpp_push_1_norc + +Push C onto the stack without incrementing its reference count, and +update C. On non-PERL_RC_STACK builds, mortalise too. + +This is most useful where an SV has just been created and already has a +reference count of 1, but has not yet been anchored anywhere. + +=cut +*/ + +PERL_STATIC_INLINE void +Perl_rpp_push_1_norc(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_PUSH_1; + + *++PL_stack_sp = sv; +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); +#else + sv_2mortal(sv); +#endif +} + + +/* +=for apidoc rpp_replace_1_1 +=for apidoc_item rpp_replace_1_1_NN +=for apidoc_item rpp_replace_1_IMM_NN + +Replace the current top stack item with C, while suitably adjusting +reference counts. Equivalent to rpp_popfree_1(); rpp_push_1(sv), but +is more efficient and handles both SVs being the same. + +The C<_NN> variant assumes that the pointer on the stack to the SV being +freed is non-NULL. + +The C variant is like the C<_NN> variant, but in addition, assumes +that the single argument is an immortal such as <&PL_sv_undef> and, for +efficiency, will skip incrementing its reference count. + +=cut +*/ + +PERL_STATIC_INLINE void +Perl_rpp_replace_1_1(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_1_1; + + assert(sv); +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *oldsv = *PL_stack_sp; + *PL_stack_sp = sv; + SvREFCNT_inc_simple_void_NN(sv); + SvREFCNT_dec(oldsv); +#else + *PL_stack_sp = sv; +#endif +} + + +PERL_STATIC_INLINE void +Perl_rpp_replace_1_1_NN(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_1_1_NN; + + assert(sv); + assert(*PL_stack_sp); +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *oldsv = *PL_stack_sp; + *PL_stack_sp = sv; + SvREFCNT_inc_simple_void_NN(sv); + SvREFCNT_dec_NN(oldsv); +#else + *PL_stack_sp = sv; +#endif +} + + +PERL_STATIC_INLINE void +Perl_rpp_replace_1_IMM_NN(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_1_IMM_NN; + + assert(sv); + assert(SvIMMORTAL(sv)); + assert(*PL_stack_sp); +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *oldsv = *PL_stack_sp; + *PL_stack_sp = sv; + SvREFCNT_dec_NN(oldsv); +#else + *PL_stack_sp = sv; +#endif +} + + +/* +=for apidoc rpp_replace_2_1 +=for apidoc_item rpp_replace_2_1_NN +=for apidoc_item rpp_replace_2_IMM_NN + +Replace the current top to stacks item with C, while suitably +adjusting reference counts. Equivalent to rpp_popfree_2(); rpp_push_1(sv), +but is more efficient and handles SVs being the same. + +The C<_NN> variant assumes that the pointers on the stack to the SVs being +freed are non-NULL. + +The C variant is like the C<_NN> variant, but in addition, assumes +that the single argument is an immortal such as <&PL_sv_undef> and, for +efficiency, will skip incrementing its reference count. +=cut +*/ + +PERL_STATIC_INLINE void +Perl_rpp_replace_2_1(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_2_1; + +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + /* replace PL_stack_sp[-1] first; leave PL_stack_sp[0] in place while + * we free [-1], so if an exception occurs, [0] will still be freed. + */ + SV *oldsv = PL_stack_sp[-1]; + PL_stack_sp[-1] = sv; + SvREFCNT_inc_simple_void_NN(sv); + SvREFCNT_dec(oldsv); + oldsv = *PL_stack_sp--; + SvREFCNT_dec(oldsv); +#else + *--PL_stack_sp = sv; +#endif +} + + +/* Private helper function for _NN and _IMM_NN variants. + * Assumes sv has already had its ref count incremented, + * ready for being put on the stack. + * Intended to be small and fast, since it's inlined into many hot parts of + * code. + */ + +PERL_STATIC_INLINE void +Perl_rpp_replace_2_1_COMMON(pTHX_ SV *sv) +{ + + assert(sv); +#ifdef PERL_RC_STACK + SV *sv2 = *PL_stack_sp--; + assert(sv2); + SV *sv1 = *PL_stack_sp; + assert(sv1); + + *PL_stack_sp = sv; + assert(rpp_stack_is_rc()); + U32 rc1 = SvREFCNT(sv1); + U32 rc2 = SvREFCNT(sv2); + /* This expression is intended to be true if either of rc1 or rc2 has + * the value 0 or 1, but using only a single branch test, rather + * than the two branches that a compiler would plant for a boolean + * expression. We are working on the assumption that, most of the + * time, neither of the args to a binary function will need to be + * freed - they're likely to lex vars, or PADTMPs or whatever. + * So give the CPU a single branch that is rarely taken. */ + if (UNLIKELY( !(rc1>>1) + !(rc2>>1) )) + /* at least one of the old SVs needs freeing. Do it the long way */ + Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2); + else { + SvREFCNT(sv1) = rc1 - 1; + SvREFCNT(sv2) = rc2 - 1; + } +#else + *--PL_stack_sp = sv; +#endif +} + + +PERL_STATIC_INLINE void +Perl_rpp_replace_2_1_NN(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_2_1_NN; + + assert(sv); +#ifdef PERL_RC_STACK + SvREFCNT_inc_simple_void_NN(sv); +#endif + rpp_replace_2_1_COMMON(sv); +} + + +PERL_STATIC_INLINE void +Perl_rpp_replace_2_IMM_NN(pTHX_ SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_2_IMM_NN; + + assert(sv); + assert(SvIMMORTAL(sv)); + rpp_replace_2_1_COMMON(sv); +} + + +/* +=for apidoc rpp_replace_at + +Replace the SV at address sp within the stack with C, while suitably +adjusting reference counts. Equivalent to C<*sp = sv>, except with proper +reference count handling. + +=cut +*/ + +PERL_STATIC_INLINE void +Perl_rpp_replace_at(pTHX_ SV **sp, SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_AT; + +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *oldsv = *sp; + *sp = sv; + SvREFCNT_inc_simple_void_NN(sv); + SvREFCNT_dec(oldsv); +#else + *sp = sv; +#endif +} + + +/* +=for apidoc rpp_replace_at_NN + +A variant of rpp_replace_at() which assumes that the SV pointer on the +stack is non-NULL. + +=cut +*/ + +PERL_STATIC_INLINE void +Perl_rpp_replace_at_NN(pTHX_ SV **sp, SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_AT_NN; + + assert(sv); + assert(*sp); +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *oldsv = *sp; + *sp = sv; + SvREFCNT_inc_simple_void_NN(sv); + SvREFCNT_dec_NN(oldsv); +#else + *sp = sv; +#endif +} + + +/* +=for apidoc rpp_replace_at_norc + +Replace the SV at address sp within the stack with C, while suitably +adjusting the reference count of the old SV. Equivalent to C<*sp = sv>, +except with proper reference count handling. + +C's reference count doesn't get incremented. On non-C +builds, it gets mortalised too. + +This is most useful where an SV has just been created and already has a +reference count of 1, but has not yet been anchored anywhere. + +=cut +*/ + +PERL_STATIC_INLINE void +Perl_rpp_replace_at_norc(pTHX_ SV **sp, SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC; + +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *oldsv = *sp; + *sp = sv; + SvREFCNT_dec(oldsv); +#else + *sp = sv; + sv_2mortal(sv); +#endif +} + + +/* +=for apidoc rpp_replace_at_norc_NN + +A variant of rpp_replace_at_norc() which assumes that the SV pointer on the +stack is non-NULL. + +=cut +*/ + +PERL_STATIC_INLINE void +Perl_rpp_replace_at_norc_NN(pTHX_ SV **sp, SV *sv) +{ + PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC_NN; + + assert(*sp); +#ifdef PERL_RC_STACK + assert(rpp_stack_is_rc()); + SV *oldsv = *sp; + *sp = sv; + SvREFCNT_dec_NN(oldsv); +#else + *sp = sv; + sv_2mortal(sv); +#endif +} + + +/* +=for apidoc rpp_context + +Impose void, scalar or list context on the stack. +First, pop C items off the stack, then when C is: +C: return as-is. +C: pop everything back to C +C: move the top stack item (or C<&PL_sv_undef> if none) to +C and free everything above it. + +=cut +*/ + +PERL_STATIC_INLINE void +Perl_rpp_context(pTHX_ SV **mark, U8 gimme, SSize_t extra) +{ + PERL_ARGS_ASSERT_RPP_CONTEXT; + assert(extra >= 0); + assert(mark <= PL_stack_sp - extra); + + if (gimme == G_LIST) + mark = PL_stack_sp - extra; + else if (gimme == G_SCALAR) { + SV **svp = PL_stack_sp - extra; + mark++; + if (mark > svp) { + /* empty list (plus extra) */ + rpp_popfree_to(svp); + rpp_extend(1); + *++PL_stack_sp = &PL_sv_undef; + return; + } + /* swap top and bottom list items */ + SV *top = *svp; + *svp = *mark; + *mark = top; + } + rpp_popfree_to(mark); +} + + + + +/* +=for apidoc rpp_try_AMAGIC_1 +=for apidoc_item rpp_try_AMAGIC_2 + +Check whether either of the one or two SVs at the top of the stack is +magical or a ref, and in either case handle it specially: invoke get +magic, call an overload method, or replace a ref with a temporary numeric +value, as appropriate. If this function returns true, it indicates that +the correct return value is already on the stack. Intended to be used at +the beginning of the PP function for unary or binary ops. + +=cut +*/ + +PERL_STATIC_INLINE bool +Perl_rpp_try_AMAGIC_1(pTHX_ int method, int flags) +{ + return UNLIKELY((SvFLAGS(*PL_stack_sp) & (SVf_ROK|SVs_GMG))) + && Perl_try_amagic_un(aTHX_ method, flags); +} + +PERL_STATIC_INLINE bool +Perl_rpp_try_AMAGIC_2(pTHX_ int method, int flags) +{ + return UNLIKELY(((SvFLAGS(PL_stack_sp[-1])|SvFLAGS(PL_stack_sp[0])) + & (SVf_ROK|SVs_GMG))) + && Perl_try_amagic_bin(aTHX_ method, flags); +} + + +/* +=for apidoc rpp_stack_is_rc + +Returns a boolean value indicating whether the stack is currently +reference-counted. Note that if the stack is split (bottom half RC, top +half non-RC), this function returns false, even if the top half currently +contains zero items. + +=cut +*/ + +PERL_STATIC_INLINE bool +Perl_rpp_stack_is_rc(pTHX) +{ +#ifdef PERL_RC_STACK + return AvREAL(PL_curstack) && !PL_curstackinfo->si_stack_nonrc_base; +#else + return 0; +#endif + +} + + +/* +=for apidoc rpp_is_lone + +Indicates whether the stacked SV C (assumed to be not yet popped off +the stack) is only kept alive due to a single reference from the argument +stack and/or and the temps stack. + +This can used for example to decide whether the copying of return values +in rvalue context can be skipped, or whether it shouldn't be assigned to +in lvalue context. + +=cut +*/ + + +PERL_STATIC_INLINE bool +Perl_rpp_is_lone(pTHX_ SV *sv) +{ +#ifdef PERL_RC_STACK + /* note that rpp_is_lone() can be used in wrapped pp functions, + * where technically the stack is no longer ref-counted; but because + * the args are non-RC copies of RC args further down the stack, we + * can't be in a *completely* non-ref stack. + */ + assert(AvREAL(PL_curstack)); +#endif + + return SvREFCNT(sv) <= cBOOL(SvTEMP(sv)) +#ifdef PERL_RC_STACK + + 1 + && !SvIMMORTAL(sv) /* PL_sv_undef etc are never stealable */ +#endif + ; +} + + +/* +=for apidoc rpp_invoke_xs + +Call the XS function associated with C. Wraps the call if necessary to +handle XS functions which are not aware of reference-counted stacks. + +=cut +*/ + + +PERL_STATIC_INLINE void +Perl_rpp_invoke_xs(pTHX_ CV *cv) +{ + PERL_ARGS_ASSERT_RPP_INVOKE_XS; + +#ifdef PERL_RC_STACK + if (!CvXS_RCSTACK(cv)) + Perl_xs_wrap(aTHX_ CvXSUB(cv), cv); + else +#endif + CvXSUB(cv)(aTHX_ cv); +} + + + + +/* ----------------------------- regexp.h ----------------------------- */ + +/* PVLVs need to act as a superset of all scalar types - they are basically + * PVMGs with a few extra fields. + * REGEXPs are first class scalars, but have many fields that can't be copied + * into a PVLV body. + * + * Hence we take a different approach - instead of a copy, PVLVs store a pointer + * back to the original body. To avoid increasing the size of PVLVs just for the + * rare case of REGEXP assignment, this pointer is stored in the memory usually + * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to + * read the pointer from the two possible locations. The macro SvLEN() wraps the + * access to the union's member xpvlenu_len, but there is no equivalent macro + * for wrapping the union's member xpvlenu_rx, hence the direct reference here. + * + * See commit df6b4bd56551f2d3 for more details. */ + +PERL_STATIC_INLINE struct regexp * +Perl_ReANY(const REGEXP * const re) +{ + XPV* const p = (XPV*)SvANY(re); + + PERL_ARGS_ASSERT_REANY; + assert(isREGEXP(re)); + + return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx + : (struct regexp *)p; +} + +/* ------------------------------- utf8.h ------------------------------- */ + +/* +=for apidoc_section $unicode +*/ + +PERL_STATIC_INLINE void +Perl_append_utf8_from_native_byte(const U8 byte, U8** dest) +{ + /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 + * encoded string at '*dest', updating '*dest' to include it */ + + PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE; + + if (NATIVE_BYTE_IS_INVARIANT(byte)) + *((*dest)++) = byte; + else { + *((*dest)++) = UTF8_EIGHT_BIT_HI(byte); + *((*dest)++) = UTF8_EIGHT_BIT_LO(byte); + } +} + +/* +=for apidoc valid_utf8_to_uvchr +Like C>, but should only be called when it is +known that the next character in the input UTF-8 string C is well-formed +(I, it passes C>. Surrogates, non-character code +points, and non-Unicode code points are allowed. + +=cut + + */ + +PERL_STATIC_INLINE UV +Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) +{ + const UV expectlen = UTF8SKIP(s); + const U8* send = s + expectlen; + UV uv = *s; + + PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR; + + if (retlen) { + *retlen = expectlen; + } + + /* An invariant is trivially returned */ + if (expectlen == 1) { + return uv; + } + + /* Remove the leading bits that indicate the number of bytes, leaving just + * the bits that are part of the value */ + uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); + + /* Now, loop through the remaining bytes, accumulating each into the + * working total as we go. (I khw tried unrolling the loop for up to 4 + * bytes, but there was no performance improvement) */ + for (++s; s < send; s++) { + uv = UTF8_ACCUMULATE(uv, *s); + } + + return UNI_TO_NATIVE(uv); + +} + +/* +=for apidoc is_utf8_invariant_string + +Returns TRUE if the first C bytes of the string C are the same +regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on +EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they +are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only +the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range +characters are invariant, but so also are the C1 controls. + +If C is 0, it will be calculated using C, (which means if you +use this option, that C can't have embedded C characters and has to +have a terminating C byte). + +See also C>, C>, C>, @@ -354,50 +1323,644 @@ C>, and C>. -=cut +=cut + +*/ + +#define is_utf8_invariant_string(s, len) \ + is_utf8_invariant_string_loc(s, len, NULL) + +/* +=for apidoc is_utf8_invariant_string_loc + +Like C> but upon failure, stores the location of +the first UTF-8 variant character in the C pointer; if all characters are +UTF-8 invariant, this function does not change the contents of C<*ep>. + +=cut + +*/ + +PERL_STATIC_INLINE bool +Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) +{ + const U8* send; + const U8* x = s; + + PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; + + if (len == 0) { + len = strlen((const char *)s); + } + + send = s + len; + +/* This looks like 0x010101... */ +# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) + +/* This looks like 0x808080... */ +# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) +# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T) +# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) + +/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by + * or'ing together the lowest bits of 'x'. Hopefully the final term gets + * optimized out completely on a 32-bit system, and its mask gets optimized out + * on a 64-bit system */ +# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ + | ( PTR2nat(x) >> 1) \ + | ( ( (PTR2nat(x) \ + & PERL_WORD_BOUNDARY_MASK) >> 2)))) + +#ifndef EBCDIC + + /* Do the word-at-a-time iff there is at least one usable full word. That + * means that after advancing to a word boundary, there still is at least a + * full word left. The number of bytes needed to advance is 'wordsize - + * offset' unless offset is 0. */ + if ((STRLEN) (send - x) >= PERL_WORDSIZE + + /* This term is wordsize if subword; 0 if not */ + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) + + /* 'offset' */ + - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) + { + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { + if (! UTF8_IS_INVARIANT(*x)) { + if (ep) { + *ep = x; + } + + return FALSE; + } + x++; + } + + /* Here, we know we have at least one full word to process. Process + * per-word as long as we have at least a full word left */ + do { + if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { + + /* Found a variant. Just return if caller doesn't want its + * exact position */ + if (! ep) { + return FALSE; + } + +# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \ + || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 + + *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x); + assert(*ep >= s && *ep < send); + + return FALSE; + +# else /* If weird byte order, drop into next loop to do byte-at-a-time + checks. */ + + break; +# endif + } + + x += PERL_WORDSIZE; + + } while (x + PERL_WORDSIZE <= send); + } + +#endif /* End of ! EBCDIC */ + + /* Process per-byte */ + while (x < send) { + if (! UTF8_IS_INVARIANT(*x)) { + if (ep) { + *ep = x; + } + + return FALSE; + } + + x++; + } + + return TRUE; +} + +/* See if the platform has builtins for finding the most/least significant bit, + * and which one is right for using on 32 and 64 bit operands */ +#if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0)) +# if U32SIZE == INTSIZE +# define PERL_CLZ_32 __builtin_clz +# endif +# if defined(U64TYPE) && U64SIZE == INTSIZE +# define PERL_CLZ_64 __builtin_clz +# endif +#endif +#if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0)) +# if U32SIZE == INTSIZE +# define PERL_CTZ_32 __builtin_ctz +# endif +# if defined(U64TYPE) && U64SIZE == INTSIZE +# define PERL_CTZ_64 __builtin_ctz +# endif +#endif + +#if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0)) +# if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32) +# define PERL_CLZ_32 __builtin_clzl +# endif +# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64) +# define PERL_CLZ_64 __builtin_clzl +# endif +#endif +#if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0)) +# if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32) +# define PERL_CTZ_32 __builtin_ctzl +# endif +# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64) +# define PERL_CTZ_64 __builtin_ctzl +# endif +#endif + +#if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0)) +# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32) +# define PERL_CLZ_32 __builtin_clzll +# endif +# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64) +# define PERL_CLZ_64 __builtin_clzll +# endif +#endif +#if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0)) +# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32) +# define PERL_CTZ_32 __builtin_ctzll +# endif +# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64) +# define PERL_CTZ_64 __builtin_ctzll +# endif +#endif + +#if defined(WIN32) +# include +# pragma intrinsic(_BitScanForward) +# pragma intrinsic(_BitScanReverse) +# ifdef _WIN64 +# pragma intrinsic(_BitScanForward64) +# pragma intrinsic(_BitScanReverse64) +# endif +#endif + +/* The reason there are not checks to see if ffs() and ffsl() are available for + * determining the lsb, is because these don't improve on the deBruijn method + * fallback, which is just a branchless integer multiply, array element + * retrieval, and shift. The others, even if the function call overhead is + * optimized out, have to cope with the possibility of the input being all + * zeroes, and almost certainly will have conditionals for this eventuality. + * khw, at the time of this commit, looked at the source for both gcc and clang + * to verify this. (gcc used a method inferior to deBruijn.) */ + +/* Below are functions to find the first, last, or only set bit in a word. On + * platforms with 64-bit capability, there is a pair for each operation; the + * first taking a 64 bit operand, and the second a 32 bit one. The logic is + * the same in each pair, so the second is stripped of most comments. */ + +#ifdef U64TYPE /* HAS_QUAD not usable outside the core */ + +PERL_STATIC_INLINE unsigned +Perl_lsbit_pos64(U64 word) +{ + /* Find the position (0..63) of the least significant set bit in the input + * word */ + + ASSUME(word != 0); + + /* If we can determine that the platform has a usable fast method to get + * this info, use that */ + +# if defined(PERL_CTZ_64) +# define PERL_HAS_FAST_GET_LSB_POS64 + + return (unsigned) PERL_CTZ_64(word); + +# elif U64SIZE == 8 && defined(_WIN64) +# define PERL_HAS_FAST_GET_LSB_POS64 + + { + unsigned long index; + _BitScanForward64(&index, word); + return (unsigned)index; + } + +# else + + /* Here, we didn't find a fast method for finding the lsb. Fall back to + * making the lsb the only set bit in the word, and use our function that + * works on words with a single bit set. + * + * Isolate the lsb; + * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set + * + * The word will look like this, with a rightmost set bit in position 's': + * ('x's are don't cares, and 'y's are their complements) + * s + * x..x100..00 + * y..y011..11 Complement + * y..y100..00 Add 1 + * 0..0100..00 And with the original + * + * (Yes, complementing and adding 1 is just taking the negative on 2's + * complement machines, but not on 1's complement ones, and some compilers + * complain about negating an unsigned.) + */ + return single_1bit_pos64(word & (~word + 1)); + +# endif + +} + +# define lsbit_pos_uintmax_(word) lsbit_pos64(word) +#else /* ! QUAD */ +# define lsbit_pos_uintmax_(word) lsbit_pos32(word) +#endif + +PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */ +Perl_lsbit_pos32(U32 word) +{ + /* Find the position (0..31) of the least significant set bit in the input + * word */ + + ASSUME(word != 0); + +#if defined(PERL_CTZ_32) +# define PERL_HAS_FAST_GET_LSB_POS32 + + return (unsigned) PERL_CTZ_32(word); + +#elif U32SIZE == 4 && defined(WIN32) +# define PERL_HAS_FAST_GET_LSB_POS32 + + { + unsigned long index; + _BitScanForward(&index, word); + return (unsigned)index; + } + +#elif defined(PERL_HAS_FAST_GET_LSB_POS64) +# define PERL_HAS_FAST_GET_LSB_POS32 + + /* Unlikely, but possible for the platform to have a wider fast operation + * but not a narrower one. But easy enough to handle the case by widening + * the parameter size. */ + return lsbit_pos64(word); + +#else + + return single_1bit_pos32(word & (~word + 1)); + +#endif + +} + + +/* Convert the leading zeros count to the bit position of the first set bit. + * This just subtracts from the highest position, 31 or 63. But some compilers + * don't optimize this optimally, and so a bit of bit twiddling encourages them + * to do the right thing. It turns out that subtracting a smaller non-negative + * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of + * the two numbers. To see why, first note that the sum of any number, x, and + * its complement, x', is all ones. So all ones minus x is x'. Then note that + * the xor of x and all ones is x'. */ +#define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc)) + +#ifdef U64TYPE /* HAS_QUAD not usable outside the core */ + +PERL_STATIC_INLINE unsigned +Perl_msbit_pos64(U64 word) +{ + /* Find the position (0..63) of the most significant set bit in the input + * word */ + + ASSUME(word != 0); + + /* If we can determine that the platform has a usable fast method to get + * this, use that */ + +# if defined(PERL_CLZ_64) +# define PERL_HAS_FAST_GET_MSB_POS64 + + return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word)); + +# elif U64SIZE == 8 && defined(_WIN64) +# define PERL_HAS_FAST_GET_MSB_POS64 + + { + unsigned long index; + _BitScanReverse64(&index, word); + return (unsigned)index; + } + +# else + + /* Here, we didn't find a fast method for finding the msb. Fall back to + * making the msb the only set bit in the word, and use our function that + * works on words with a single bit set. + * + * Isolate the msb; http://codeforces.com/blog/entry/10330 + * + * Only the most significant set bit matters. Or'ing word with its right + * shift of 1 makes that bit and the next one to its right both 1. + * Repeating that with the right shift of 2 makes for 4 1-bits in a row. + * ... We end with the msb and all to the right being 1. */ + word |= (word >> 1); + word |= (word >> 2); + word |= (word >> 4); + word |= (word >> 8); + word |= (word >> 16); + word |= (word >> 32); + + /* Then subtracting the right shift by 1 clears all but the left-most of + * the 1 bits, which is our desired result */ + word -= (word >> 1); + + /* Now we have a single bit set */ + return single_1bit_pos64(word); + +# endif + +} + +# define msbit_pos_uintmax_(word) msbit_pos64(word) +#else /* ! QUAD */ +# define msbit_pos_uintmax_(word) msbit_pos32(word) +#endif -*/ +PERL_STATIC_INLINE unsigned +Perl_msbit_pos32(U32 word) +{ + /* Find the position (0..31) of the most significant set bit in the input + * word */ -#define is_utf8_invariant_string(s, len) \ - is_utf8_invariant_string_loc(s, len, NULL) + ASSUME(word != 0); + +#if defined(PERL_CLZ_32) +# define PERL_HAS_FAST_GET_MSB_POS32 + + return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word)); +#elif U32SIZE == 4 && defined(WIN32) +# define PERL_HAS_FAST_GET_MSB_POS32 + + { + unsigned long index; + _BitScanReverse(&index, word); + return (unsigned)index; + } + +#elif defined(PERL_HAS_FAST_GET_MSB_POS64) +# define PERL_HAS_FAST_GET_MSB_POS32 + + return msbit_pos64(word); /* Let compiler widen parameter */ + +#else + + word |= (word >> 1); + word |= (word >> 2); + word |= (word >> 4); + word |= (word >> 8); + word |= (word >> 16); + word -= (word >> 1); + return single_1bit_pos32(word); + +#endif + +} + +/* Note that if you are working through all the 1 bits in a word, and don't + * care which order you process them in, it is better to use lsbit_pos. This + * is because some platforms have a fast way to find the msb but not the lsb, + * and others vice versa. The code above falls back to use the single + * available fast method when the desired one is missing, and it is cheaper to + * fall back from lsb to msb than the other way around */ + +#if UVSIZE == U64SIZE +# define msbit_pos(word) msbit_pos64(word) +# define lsbit_pos(word) lsbit_pos64(word) +#elif UVSIZE == U32SIZE +# define msbit_pos(word) msbit_pos32(word) +# define lsbit_pos(word) lsbit_pos32(word) +#endif + +#ifdef U64TYPE /* HAS_QUAD not usable outside the core */ + +PERL_STATIC_INLINE unsigned +Perl_single_1bit_pos64(U64 word) +{ + /* Given a 64-bit word known to contain all zero bits except one 1 bit, + * find and return the 1's position: 0..63 */ + +# ifdef PERL_CORE /* macro not exported */ + ASSUME(isPOWER_OF_2(word)); +# else + ASSUME(word && (word & (word-1)) == 0); +# endif + + /* The only set bit is both the most and least significant bit. If we have + * a fast way of finding either one, use that. + * + * It may appear at first glance that those functions call this one, but + * they don't if the corresponding #define is set */ + +# ifdef PERL_HAS_FAST_GET_MSB_POS64 + + return msbit_pos64(word); + +# elif defined(PERL_HAS_FAST_GET_LSB_POS64) + + return lsbit_pos64(word); + +# else + + /* The position of the only set bit in a word can be quickly calculated + * using deBruijn sequences. See for example + * https://en.wikipedia.org/wiki/De_Bruijn_sequence */ + return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_) + >> PERL_deBruijnShift64_]; +# endif + +} + +#endif + +PERL_STATIC_INLINE unsigned +Perl_single_1bit_pos32(U32 word) +{ + /* Given a 32-bit word known to contain all zero bits except one 1 bit, + * find and return the 1's position: 0..31 */ + +#ifdef PERL_CORE /* macro not exported */ + ASSUME(isPOWER_OF_2(word)); +#else + ASSUME(word && (word & (word-1)) == 0); +#endif +#ifdef PERL_HAS_FAST_GET_MSB_POS32 + + return msbit_pos32(word); + +#elif defined(PERL_HAS_FAST_GET_LSB_POS32) + + return lsbit_pos32(word); + +#else + + return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_) + >> PERL_deBruijnShift32_]; +#endif + +} + +#ifndef EBCDIC + +PERL_STATIC_INLINE unsigned int +Perl_variant_byte_number(PERL_UINTMAX_T word) +{ + /* This returns the position in a word (0..7) of the first variant byte in + * it. This is a helper function. Note that there are no branches */ + + /* Get just the msb bits of each byte */ + word &= PERL_VARIANTS_WORD_MASK; + + /* This should only be called if we know there is a variant byte in the + * word */ + assert(word); + +# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 + + /* Bytes are stored like + * Byte8 ... Byte2 Byte1 + * 63..56...15...8 7...0 + * so getting the lsb of the whole modified word is getting the msb of the + * first byte that has its msb set */ + word = lsbit_pos_uintmax_(word); + + /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert + * to 0..7 */ + return (unsigned int) ((word + 1) >> 3) - 1; + +# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 + + /* Bytes are stored like + * Byte1 Byte2 ... Byte8 + * 63..56 55..47 ... 7...0 + * so getting the msb of the whole modified word is getting the msb of the + * first byte that has its msb set */ + word = msbit_pos_uintmax_(word); + + /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert + * to 0..7 */ + word = ((word + 1) >> 3) - 1; + + /* And invert the result because of the reversed byte order on this + * platform */ + word = CHARBITS - word - 1; + + return (unsigned int) word; + +# else +# error Unexpected byte order +# endif + +} + +#endif +#if defined(PERL_CORE) || defined(PERL_EXT) /* -=for apidoc is_utf8_invariant_string_loc +=for apidoc variant_under_utf8_count -Like C> but upon failure, stores the location of -the first UTF-8 variant character in the C pointer; if all characters are -UTF-8 invariant, this function does not change the contents of C<*ep>. +This function looks at the sequence of bytes between C and C, which are +assumed to be encoded in ASCII/Latin1, and returns how many of them would +change should the string be translated into UTF-8. Due to the nature of UTF-8, +each of these would occupy two bytes instead of the single one in the input +string. Thus, this function returns the precise number of bytes the string +would expand by when translated to UTF-8. -=cut +Unlike most of the other functions that have C in their name, the input +to this function is NOT a UTF-8-encoded string. The function name is slightly +I to emphasize this. + +This function is internal to Perl because khw thinks that any XS code that +would want this is probably operating too close to the internals. Presenting a +valid use case could change that. + +See also +C> +and +C>, -XXX On ASCII machines this could be sped up by doing word-at-a-time operations +=cut */ -PERL_STATIC_INLINE bool -S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep) +PERL_STATIC_INLINE Size_t +S_variant_under_utf8_count(const U8* const s, const U8* const e) { - const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; + Size_t count = 0; - PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; + PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; - while (x < send) { - if (UTF8_IS_INVARIANT(*x)) { - x++; - continue; +# ifndef EBCDIC + + /* Test if the string is long enough to use word-at-a-time. (Logic is the + * same as for is_utf8_invariant_string()) */ + if ((STRLEN) (e - x) >= PERL_WORDSIZE + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) + - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) + { + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { + count += ! UTF8_IS_INVARIANT(*x++); } - if (ep) { - *ep = x; + /* Process per-word as long as we have at least a full word left */ + do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an + explanation of how this works */ + PERL_UINTMAX_T increment + = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) + * PERL_COUNT_MULTIPLIER) + >> ((PERL_WORDSIZE - 1) * CHARBITS); + count += (Size_t) increment; + x += PERL_WORDSIZE; + } while (x + PERL_WORDSIZE <= e); + } + +# endif + + /* Process per-byte */ + while (x < e) { + if (! UTF8_IS_INVARIANT(*x)) { + count++; } - return FALSE; + x++; } - return TRUE; + return count; } +#endif + + /* Keep these around for these files */ +#if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C) +# undef PERL_WORDSIZE +# undef PERL_COUNT_MULTIPLIER +# undef PERL_WORD_BOUNDARY_MASK +# undef PERL_VARIANTS_WORD_MASK +#endif + /* =for apidoc is_utf8_string @@ -425,28 +1988,53 @@ C>, =cut */ +#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL) + +#if defined(PERL_CORE) || defined (PERL_EXT) + +/* +=for apidoc is_utf8_non_invariant_string + +Returns TRUE if L returns FALSE for the first +C bytes of the string C, but they are, nonetheless, legal Perl-extended +UTF-8; otherwise returns FALSE. + +A TRUE return means that at least one code point represented by the sequence +either is a wide character not representable as a single byte, or the +representation differs depending on whether the sequence is encoded in UTF-8 or +not. + +See also +C>, +C> + +=cut + +This is commonly used to determine if a SV's UTF-8 flag should be turned on. +It generally needn't be if its string is entirely UTF-8 invariant, and it +shouldn't be if it otherwise contains invalid UTF-8. + +It is an internal function because khw thinks that XS code shouldn't be working +at this low a level. A valid use case could change that. + +*/ + PERL_STATIC_INLINE bool -Perl_is_utf8_string(const U8 *s, const STRLEN len) +Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len) { - /* This is now marked pure in embed.fnc, because isUTF8_CHAR now is pure. - * Be aware of possible changes to that */ - - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; + const U8 * first_variant; - PERL_ARGS_ASSERT_IS_UTF8_STRING; + PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING; - while (x < send) { - const STRLEN cur_len = isUTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - return FALSE; - } - x += cur_len; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + return FALSE; } - return TRUE; + return is_utf8_string(first_variant, len - (first_variant - s)); } +#endif + /* =for apidoc is_strict_utf8_string @@ -483,24 +2071,7 @@ C>. =cut */ -PERL_STATIC_INLINE bool -S_is_strict_utf8_string(const U8 *s, const STRLEN len) -{ - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - - PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING; - - while (x < send) { - const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - return FALSE; - } - x += cur_len; - } - - return TRUE; -} +#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL) /* =for apidoc is_c9strict_utf8_string @@ -540,28 +2111,7 @@ C>. =cut */ -PERL_STATIC_INLINE bool -S_is_c9strict_utf8_string(const U8 *s, const STRLEN len) -{ - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - - PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING; - - while (x < send) { - const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - return FALSE; - } - x += cur_len; - } - - return TRUE; -} - -/* The above 3 functions could have been moved into the more general one just - * below, and made #defines that call it with the right 'flags'. They are - * currently kept separate to increase their chances of getting inlined */ +#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0) /* =for apidoc is_utf8_string_flags @@ -604,97 +2154,399 @@ C>. */ PERL_STATIC_INLINE bool -S_is_utf8_string_flags(const U8 *s, const STRLEN len, const U32 flags) +Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE |UTF8_DISALLOW_PERL_EXTENDED))); - if (flags == 0) { - return is_utf8_string(s, len); - } + if (len == 0) { + len = strlen((const char *)s); + } + + if (flags == 0) { + return is_utf8_string(s, len); + } + + if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) + == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) + { + return is_strict_utf8_string(s, len); + } + + if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) + == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) + { + return is_c9strict_utf8_string(s, len); + } + + if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { + const U8* const send = s + len; + const U8* x = first_variant; + + while (x < send) { + STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); + if (UNLIKELY(! cur_len)) { + return FALSE; + } + x += cur_len; + } + } + + return TRUE; +} + +/* + +=for apidoc is_utf8_string_loc + +Like C> but stores the location of the failure (in the +case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer. + +See also C>. + +=cut +*/ + +#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) + +/* + +=for apidoc is_utf8_string_loclen + +Like C> but stores the location of the failure (in the +case of "utf8ness failure") or the location C+C (in the case of +"utf8ness success") in the C pointer, and the number of UTF-8 +encoded characters in the C pointer. + +See also C>. + +=cut +*/ + +PERL_STATIC_INLINE bool +Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +{ + const U8 * first_variant; + + PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; + + if (len == 0) { + len = strlen((const char *) s); + } + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isUTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } +} + +/* The perl core arranges to never call the DFA below without there being at + * least one byte available to look at. This allows the DFA to use a do {} + * while loop which means that calling it with a UTF-8 invariant has a single + * conditional, same as the calling code checking for invariance ahead of time. + * And having the calling code remove that conditional speeds up by that + * conditional, the case where it wasn't invariant. So there's no reason to + * check before caling this. + * + * But we don't know this for non-core calls, so have to retain the check for + * them. */ +#ifdef PERL_CORE +# define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s)) +#else +# define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE +#endif + +/* + * DFA for checking input is valid UTF-8 syntax. + * + * This uses adaptations of the table and algorithm given in + * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive + * documentation of the original version. A copyright notice for the original + * version is given at the beginning of this file. The Perl adaptations are + * documented at the definition of PL_extended_utf8_dfa_tab[]. + * + * This dfa is fast. There are three exit conditions: + * 1) a well-formed code point, acceptable to the table + * 2) the beginning bytes of an incomplete character, whose completion might + * or might not be acceptable + * 3) unacceptable to the table. Some of the adaptations have certain, + * hopefully less likely to occur, legal inputs be unacceptable to the + * table, so these must be sorted out afterwards. + * + * This macro is a complete implementation of the code executing the DFA. It + * is passed the input sequence bounds and the table to use, and what to do + * for each of the exit conditions. There are three canned actions, likely to + * be the ones you want: + * DFA_RETURN_SUCCESS_ + * DFA_RETURN_FAILURE_ + * DFA_GOTO_TEASE_APART_FF_ + * + * You pass a parameter giving the action to take for each of the three + * possible exit conditions: + * + * 'accept_action' This is executed when the DFA accepts the input. + * DFA_RETURN_SUCCESS_ is the most likely candidate. + * 'reject_action' This is executed when the DFA rejects the input. + * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where + * you have written code to distinguish the rejecting state + * results. Because it happens in several places, and + * involves #ifdefs, the special action + * DFA_GOTO_TEASE_APART_FF_ is what you want with + * PL_extended_utf8_dfa_tab. On platforms without + * EXTRA_LONG_UTF8, there is no need to tease anything apart, + * so this evaluates to DFA_RETURN_FAILURE_; otherwise you + * need to have a label 'tease_apart_FF' that it will transfer + * to. + * 'incomplete_char_action' This is executed when the DFA ran off the end + * before accepting or rejecting the input. + * DFA_RETURN_FAILURE_ is the likely action, but you could + * have a 'goto', or NOOP. In the latter case the DFA drops + * off the end, and you place your code to handle this case + * immediately after it. + */ + +#define DFA_RETURN_SUCCESS_ return s - s0 +#define DFA_RETURN_FAILURE_ return 0 +#ifdef HAS_EXTRA_LONG_UTF8 +# define DFA_TEASE_APART_FF_ goto tease_apart_FF +#else +# define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_ +#endif + +#define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \ + accept_action, \ + reject_action, \ + incomplete_char_action) \ + STMT_START { \ + const U8 * s = s0; \ + const U8 * e_ = e; \ + UV state = 0; \ + \ + PERL_NON_CORE_CHECK_EMPTY(s, e_); \ + \ + do { \ + state = dfa_tab[256 + state + dfa_tab[*s]]; \ + s++; \ + \ + if (state == 0) { /* Accepting state */ \ + accept_action; \ + } \ + \ + if (UNLIKELY(state == 1)) { /* Rejecting state */ \ + reject_action; \ + } \ + } while (s < e_); \ + \ + /* Here, dropped out of loop before end-of-char */ \ + incomplete_char_action; \ + } STMT_END + + +/* + +=for apidoc isUTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8, as extended by Perl, +that represents some code point; otherwise it evaluates to 0. If non-zero, the +value gives how many bytes starting at C comprise the code point's +representation. Any bytes remaining before C, but beyond the ones needed to +form the first code point in C, are not examined. + +The code point can be any that will fit in an IV on this machine, using Perl's +extension to official UTF-8 to represent those higher than the Unicode maximum +of 0x10FFFF. That means that this macro is used to efficiently decide if the +next few bytes in C is legal UTF-8 for a single character. + +Use C> to restrict the acceptable code points to those +defined by Unicode to be fully interchangeable across applications; +C> to use the L definition of allowable +code points; and C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC +machines) is a valid UTF-8 character. + +=cut + +This uses an adaptation of the table and algorithm given in +https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adaptation is +documented at the definition of PL_extended_utf8_dfa_tab[]. +*/ + +PERL_STATIC_INLINE Size_t +Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + PERL_ARGS_ASSERT_ISUTF8_CHAR; + + PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, + DFA_RETURN_SUCCESS_, + DFA_TEASE_APART_FF_, + DFA_RETURN_FAILURE_); - if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) - == UTF8_DISALLOW_ILLEGAL_INTERCHANGE) - { - return is_strict_utf8_string(s, len); - } + /* Here, we didn't return success, but dropped out of the loop. In the + * case of PL_extended_utf8_dfa_tab, this means the input is either + * malformed, or the start byte was FF on a platform that the dfa doesn't + * handle FF's. Call a helper function. */ - if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED) - == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE) - { - return is_c9strict_utf8_string(s, len); - } +#ifdef HAS_EXTRA_LONG_UTF8 - while (x < send) { - STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); - if (UNLIKELY(! cur_len)) { - return FALSE; - } - x += cur_len; + tease_apart_FF: + + /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is + * either malformed, or was for the largest possible start byte, which we + * now check, not inline */ + if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) { + return 0; } - return TRUE; + return is_utf8_FF_helper_(s0, e, + FALSE /* require full, not partial char */ + ); +#endif + } /* -=for apidoc is_utf8_string_loc +=for apidoc isSTRICT_UTF8_CHAR -Like C> but stores the location of the failure (in the -case of "utf8ness failure") or the location C+C (in the case of -"utf8ness success") in the C pointer. +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8 that represents some +Unicode code point completely acceptable for open interchange between all +applications; otherwise it evaluates to 0. If non-zero, the value gives how +many bytes starting at C comprise the code point's representation. Any +bytes remaining before C, but beyond the ones needed to form the first code +point in C, are not examined. -See also C>. +The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not +be a surrogate nor a non-character code point. Thus this excludes any code +point from Perl's extended UTF-8. + +This is used to efficiently decide if the next few bytes in C is +legal Unicode-acceptable UTF-8 for a single character. + +Use C> to use the L definition of allowable +code points; C> to check for Perl's extended UTF-8; +and C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. =cut + +This uses an adaptation of the tables and algorithm given in +https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adaptation is +documented at the definition of strict_extended_utf8_dfa_tab[]. + */ -#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0) +PERL_STATIC_INLINE Size_t +Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; + + PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, + DFA_RETURN_SUCCESS_, + goto check_hanguls, + DFA_RETURN_FAILURE_); + check_hanguls: + + /* Here, we didn't return success, but dropped out of the loop. In the + * case of PL_strict_utf8_dfa_tab, this means the input is either + * malformed, or was for certain Hanguls; handle them specially */ + + /* The dfa above drops out for incomplete or illegal inputs, and certain + * legal Hanguls; check and return accordingly */ + return is_HANGUL_ED_utf8_safe(s0, e); +} /* -=for apidoc is_utf8_string_loclen +=for apidoc isC9_STRICT_UTF8_CHAR -Like C> but stores the location of the failure (in the -case of "utf8ness failure") or the location C+C (in the case of -"utf8ness success") in the C pointer, and the number of UTF-8 -encoded characters in the C pointer. +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8 that represents some +Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, +the value gives how many bytes starting at C comprise the code point's +representation. Any bytes remaining before C, but beyond the ones needed to +form the first code point in C, are not examined. -See also C>. +The largest acceptable code point is the Unicode maximum 0x10FFFF. This +differs from C> only in that it accepts non-character +code points. This corresponds to +L. +which said that non-character code points are merely discouraged rather than +completely forbidden in open interchange. See +L. -=cut -*/ +Use C> to check for Perl's extended UTF-8; and +C> for a more customized definition. -PERL_STATIC_INLINE bool -Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) -{ - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; +Use C>, C>, and +C> to check entire strings. - PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; +=cut - while (x < send) { - const STRLEN cur_len = isUTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; - } +This uses an adaptation of the tables and algorithm given in +https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adaptation is +documented at the definition of PL_c9_utf8_dfa_tab[]. - if (el) - *el = outlen; +*/ - if (ep) { - *ep = x; - } +PERL_STATIC_INLINE Size_t +Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; - return (x == send); + PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab, + DFA_RETURN_SUCCESS_, + DFA_RETURN_FAILURE_, + DFA_RETURN_FAILURE_); } /* @@ -728,31 +2580,50 @@ See also C>. */ PERL_STATIC_INLINE bool -S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; - while (x < send) { - const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } - if (ep) { - *ep = x; + return TRUE; } - return (x == send); + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -786,31 +2657,50 @@ See also C>. */ PERL_STATIC_INLINE bool -S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; - while (x < send) { - const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } - if (ep) { - *ep = x; + return TRUE; } - return (x == send); + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -849,11 +2739,9 @@ See also C>. */ PERL_STATIC_INLINE bool -S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) +Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE @@ -875,23 +2763,44 @@ S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRL return is_c9strict_utf8_string_loclen(s, len, ep, el); } - while (x < send) { - const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; - if (ep) { - *ep = x; + if (ep) { + *ep = s + len; + } + + return TRUE; } - return (x == send); + { + const U8* send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -918,11 +2827,16 @@ Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) =for apidoc utf8_hop Return the UTF-8 pointer C displaced by C characters, either -forward or backward. +forward (if C is positive) or backward (if negative). C does not need +to be pointing to the starting byte of a character. If it isn't, one count of +C will be used up to get to the start of the next character for forward +hops, and to the start of the current character for negative ones. -WARNING: do not use the following unless you *know* C is within -the UTF-8 data pointed to by C *and* that on entry C is aligned -on the first byte of character or just after the last byte of a character. +WARNING: Prefer L to this one. + +Do NOT use this function unless you B C is within +the UTF-8 data pointed to by C B that on entry C is aligned +on the first byte of a character or just after the last byte of a character. =cut */ @@ -933,30 +2847,43 @@ Perl_utf8_hop(const U8 *s, SSize_t off) PERL_ARGS_ASSERT_UTF8_HOP; /* Note: cannot use UTF8_IS_...() too eagerly here since e.g - * the bitops (especially ~) can create illegal UTF-8. + * the XXX bitops (especially ~) can create illegal UTF-8. * In other words: in Perl UTF-8 is not just for Unicode. */ - if (off >= 0) { - while (off--) - s += UTF8SKIP(s); + if (off > 0) { + + /* Get to next non-continuation byte */ + if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) { + do { + s++; + } + while (UTF8_IS_CONTINUATION(*s)); + off--; + } + + while (off--) + s += UTF8SKIP(s); } else { - while (off++) { - s--; - while (UTF8_IS_CONTINUATION(*s)) - s--; - } + while (off++) { + s--; + while (UTF8_IS_CONTINUATION(*s)) + s--; + } } - GCC_DIAG_IGNORE(-Wcast-qual); + + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE } /* =for apidoc utf8_hop_forward Return the UTF-8 pointer C displaced by up to C characters, -forward. +forward. C does not need to be pointing to the starting byte of a +character. If it isn't, one count of C will be used up to get to the +start of the next character. C must be non-negative. @@ -981,26 +2908,37 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) assert(s <= end); assert(off >= 0); + if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) { + /* Get to next non-continuation byte */ + do { + s++; + } + while (UTF8_IS_CONTINUATION(*s)); + off--; + } + while (off--) { STRLEN skip = UTF8SKIP(s); if ((STRLEN)(end - s) <= skip) { - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)end; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE } s += skip; } - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE } /* =for apidoc utf8_hop_back Return the UTF-8 pointer C displaced by up to C characters, -backward. +backward. C does not need to be pointing to the starting byte of a +character. If it isn't, one count of C will be used up to get to that +start. C must be non-positive. @@ -1025,22 +2963,32 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) assert(start <= s); assert(off <= 0); + /* Note: if we know that the input is well-formed, we can do per-word + * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented + * that. But it was reverted because doing per-word has some + * start-up/tear-down overhead, so only makes sense if the distance to be + * moved is large, and core perl doesn't currently move more than a few + * characters at a time. You can reinstate it if it does become + * advantageous. */ while (off++ && s > start) { - s--; - while (UTF8_IS_CONTINUATION(*s) && s > start) + do { s--; + } while (s > start && UTF8_IS_CONTINUATION(*s)); } - - GCC_DIAG_IGNORE(-Wcast-qual); + + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE } /* =for apidoc utf8_hop_safe Return the UTF-8 pointer C displaced by up to C characters, -either forward or backward. +either forward or backward. C does not need to be pointing to the starting +byte of a character. If it isn't, one count of C will be used up to get +to the start of the next character for forward hops, and to the start of the +current character for negative ones. When moving backward it will not move before C. @@ -1072,6 +3020,73 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) /* +=for apidoc isUTF8_CHAR_flags + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8, as extended by Perl, +that represents some code point, subject to the restrictions given by C; +otherwise it evaluates to 0. If non-zero, the value gives how many bytes +starting at C comprise the code point's representation. Any bytes remaining +before C, but beyond the ones needed to form the first code point in C, +are not examined. + +If C is 0, this gives the same results as C>; +if C is C, this gives the same results +as C>; +and if C is C, this gives +the same results as C>. +Otherwise C may be any combination of the C> flags +understood by C>, with the same meanings. + +The three alternative macros are for the most commonly needed validations; they +are likely to run somewhat faster than this more general one, as they can be +inlined into your code. + +Use L, L, and +L to check entire strings. + +=cut +*/ + +PERL_STATIC_INLINE STRLEN +Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags) +{ + PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS; + assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_DISALLOW_PERL_EXTENDED))); + + PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, + goto check_success, + DFA_TEASE_APART_FF_, + DFA_RETURN_FAILURE_); + + check_success: + + return is_utf8_char_helper_(s0, e, flags); + +#ifdef HAS_EXTRA_LONG_UTF8 + + tease_apart_FF: + + /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is + * either malformed, or was for the largest possible start byte, which + * indicates perl extended UTF-8, well above the Unicode maximum */ + if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) + || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) + { + return 0; + } + + /* Otherwise examine the sequence not inline */ + return is_utf8_FF_helper_(s0, e, + FALSE /* require full, not partial char */ + ); +#endif + +} + +/* + =for apidoc is_utf8_valid_partial_char Returns 0 if the sequence of bytes starting at C and looking no further than @@ -1120,18 +3135,47 @@ determined from just the first one or two bytes. */ PERL_STATIC_INLINE bool -S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags) +Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags) { PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; - assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE |UTF8_DISALLOW_PERL_EXTENDED))); - if (s >= e || s + UTF8SKIP(s) <= e) { - return FALSE; + PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab, + DFA_RETURN_FAILURE_, + DFA_TEASE_APART_FF_, + NOOP); + + /* The NOOP above causes the DFA to drop down here iff the input was a + * partial character. flags=0 => can return TRUE immediately; otherwise we + * need to check (not inline) if the partial character is the beginning of + * a disallowed one */ + if (flags == 0) { + return TRUE; } - return cBOOL(_is_utf8_char_helper(s, e, flags)); + return cBOOL(is_utf8_char_helper_(s0, e, flags)); + +#ifdef HAS_EXTRA_LONG_UTF8 + + tease_apart_FF: + + /* Getting here means the input is either malformed, or, in the case of + * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The + * latter case has to be extended UTF-8, so can fail immediately if that is + * forbidden */ + + if ( *s0 != I8_TO_NATIVE_UTF8(0xFF) + || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED))) + { + return 0; + } + + return is_utf8_FF_helper_(s0, e, + TRUE /* Require to be a partial character */ + ); +#endif + } /* @@ -1188,8 +3232,8 @@ complete, valid characters found in the C pointer. */ PERL_STATIC_INLINE bool -S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, - const STRLEN len, +Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, + STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) @@ -1208,25 +3252,144 @@ S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, || is_utf8_valid_partial_char_flags(*ep, s + len, flags); } +PERL_STATIC_INLINE UV +Perl_utf8n_to_uvchr_msgs(const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors, + AV ** msgs) +{ + /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the + * simple cases, and, if necessary calls a helper function to deal with the + * more complex ones. Almost all well-formed non-problematic code points + * are considered simple, so that it's unlikely that the helper function + * will need to be called. + * + * This is an adaptation of the tables and algorithm given in + * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides + * comprehensive documentation of the original version. A copyright notice + * for the original version is given at the beginning of this file. The + * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[]. + */ + + const U8 * const s0 = s; + const U8 * send = s0 + curlen; + UV type; + UV uv; + + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; + + /* This dfa is fast. If it accepts the input, it was for a well-formed, + * non-problematic code point, which can be returned immediately. + * Otherwise we call a helper function to figure out the more complicated + * cases. */ + + /* No calls from core pass in an empty string; non-core need a check */ +#ifdef PERL_CORE + assert(curlen > 0); +#else + if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen, + flags, errors, msgs); +#endif + + type = PL_strict_utf8_dfa_tab[*s]; + + /* The table is structured so that 'type' is 0 iff the input byte is + * represented identically regardless of the UTF-8ness of the string */ + if (type == 0) { /* UTF-8 invariants are returned unchanged */ + uv = *s; + } + else { + UV state = PL_strict_utf8_dfa_tab[256 + type]; + uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s); + + while (++s < send) { + type = PL_strict_utf8_dfa_tab[*s]; + state = PL_strict_utf8_dfa_tab[256 + state + type]; + + uv = UTF8_ACCUMULATE(uv, *s); + + if (state == 0) { +#ifdef EBCDIC + uv = UNI_TO_NATIVE(uv); +#endif + goto success; + } + + if (UNLIKELY(state == 1)) { + break; + } + } + + /* Here is potentially problematic. Use the full mechanism */ + return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, + errors, msgs); + } + + success: + if (retlen) { + *retlen = s - s0 + 1; + } + if (errors) { + *errors = 0; + } + if (msgs) { + *msgs = NULL; + } + + return uv; +} + +PERL_STATIC_INLINE UV +Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER; + + assert(s < send); + + if (! ckWARN_d(WARN_UTF8)) { + + /* EMPTY is not really allowed, and asserts on debugging builds. But + * on non-debugging we have to deal with it, and this causes it to + * return the REPLACEMENT CHARACTER, as the documentation indicates */ + return utf8n_to_uvchr(s, send - s, retlen, + (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)); + } + else { + UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); + if (retlen && ret == 0 && (send <= s || *s != '\0')) { + *retlen = (STRLEN) -1; + } + + return ret; + } +} + /* ------------------------------- perl.h ----------------------------- */ /* -=head1 Miscellaneous Functions +=for apidoc_section $utility -=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name +=for apidoc is_safe_syscall -Test that the given C doesn't contain any internal C characters. -If it does, set C to C, optionally warn, and return FALSE. +Test that the given C (with length C) doesn't contain any internal +C characters. +If it does, set C to C, optionally warn using the C +category, and return FALSE. Return TRUE if the name is safe. +C and C are used in any warning. + Used by the C macro. =cut */ PERL_STATIC_INLINE bool -S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) { +Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) +{ /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs * perl itself uses xce*() functions which accept 8-bit strings. */ @@ -1267,7 +3430,8 @@ then calling: #ifdef PERL_CORE PERL_STATIC_INLINE bool -S_should_warn_nl(const char *pv) { +S_should_warn_nl(const char *pv) +{ STRLEN len; PERL_ARGS_ASSERT_SHOULD_WARN_NL; @@ -1279,13 +3443,53 @@ S_should_warn_nl(const char *pv) { #endif +#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C) + +PERL_STATIC_INLINE bool +S_lossless_NV_to_IV(const NV nv, IV *ivp) +{ + /* This function determines if the input NV 'nv' may be converted without + * loss of data to an IV. If not, it returns FALSE taking no other action. + * But if it is possible, it does the conversion, returning TRUE, and + * storing the converted result in '*ivp' */ + + PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV; + +# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) + /* Normally any comparison with a NaN returns false; if we can't rely + * on that behaviour, check explicitly */ + if (UNLIKELY(Perl_isnan(nv))) { + return FALSE; + } +# endif + + /* Written this way so that with an always-false NaN comparison we + * return false */ + if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) { + return FALSE; + } + + if ((IV) nv != nv) { + return FALSE; + } + + *ivp = (IV) nv; + return TRUE; +} + +#endif + /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ +#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) + #define MAX_CHARSET_NAME_LENGTH 2 PERL_STATIC_INLINE const char * -get_regex_charset_name(const U32 flags, STRLEN* const lenp) +S_get_regex_charset_name(const U32 flags, STRLEN* const lenp) { + PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME; + /* Returns a string that corresponds to the name of the regex character set * given by 'flags', and *lenp is set the length of that string, which * cannot exceed MAX_CHARSET_NAME_LENGTH characters */ @@ -1295,10 +3499,10 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS; case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS; case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS; - case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS; - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - *lenp = 2; - return ASCII_MORE_RESTRICT_PAT_MODS; + case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS; + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + *lenp = 2; + return ASCII_MORE_RESTRICT_PAT_MODS; } /* The NOT_REACHED; hides an assert() which has a rather complex * definition in perl.h. */ @@ -1306,6 +3510,8 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) return "?"; /* Unknown */ } +#endif + /* Return false if any get magic is on the SV other than taint magic. @@ -1313,7 +3519,8 @@ Return false if any get magic is on the SV other than taint magic. */ PERL_STATIC_INLINE bool -S_sv_only_taint_gmagic(SV *sv) { +Perl_sv_only_taint_gmagic(SV *sv) +{ MAGIC *mg = SvMAGIC(sv); PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC; @@ -1332,11 +3539,28 @@ S_sv_only_taint_gmagic(SV *sv) { /* ------------------ cop.h ------------------------------------------- */ +/* implement GIMME_V() macro */ + +PERL_STATIC_INLINE U8 +Perl_gimme_V(pTHX) +{ + I32 cxix; + U8 gimme = (PL_op->op_flags & OPf_WANT); + + if (gimme) + return gimme; + cxix = PL_curstackinfo->si_cxsubix; + if (cxix < 0) + return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID; + assert(cxstack[cxix].blk_gimme & G_WANT); + return (cxstack[cxix].blk_gimme & G_WANT); +} + /* Enter a block. Push a new base context and return its address. */ PERL_STATIC_INLINE PERL_CONTEXT * -S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) +Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) { PERL_CONTEXT * cx; @@ -1347,7 +3571,10 @@ S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) cx->cx_type = type; cx->blk_gimme = gimme; cx->blk_oldsaveix = saveix; - cx->blk_oldsp = (I32)(sp - PL_stack_base); + cx->blk_oldsp = (Stack_off_t)(sp - PL_stack_base); + assert(cxstack_ix <= 0 + || CxTYPE(cx-1) == CXt_SUBST + || cx->blk_oldsp >= (cx-1)->blk_oldsp); cx->blk_oldcop = PL_curcop; cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack); cx->blk_oldscopesp = PL_scopestack_ix; @@ -1363,7 +3590,7 @@ S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) /* Exit a block (RETURN and LAST). */ PERL_STATIC_INLINE void -S_cx_popblock(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_POPBLOCK; @@ -1388,7 +3615,7 @@ S_cx_popblock(pTHX_ PERL_CONTEXT *cx) * *after* cx_pushblock() was called. */ PERL_STATIC_INLINE void -S_cx_topblock(pTHX_ PERL_CONTEXT *cx) +Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_TOPBLOCK; @@ -1397,19 +3624,20 @@ S_cx_topblock(pTHX_ PERL_CONTEXT *cx) PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp; PL_scopestack_ix = cx->blk_oldscopesp; PL_curpm = cx->blk_oldpm; - - PL_stack_sp = PL_stack_base + cx->blk_oldsp; + Perl_rpp_popfree_to(aTHX_ PL_stack_base + cx->blk_oldsp); } PERL_STATIC_INLINE void -S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) +Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) { U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub); PERL_ARGS_ASSERT_CX_PUSHSUB; PERL_DTRACE_PROBE_ENTRY(cv); + cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix; + PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack); cx->blk_sub.cv = cv; cx->blk_sub.olddepth = CvDEPTH(cv); cx->blk_sub.prevcomppad = PL_comppad; @@ -1423,7 +3651,7 @@ S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) /* subsets of cx_popsub() */ PERL_STATIC_INLINE void -S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) { CV *cv; @@ -1431,18 +3659,19 @@ S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) assert(CxTYPE(cx) == CXt_SUB); PL_comppad = cx->blk_sub.prevcomppad; - PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; + PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL; cv = cx->blk_sub.cv; CvDEPTH(cv) = cx->blk_sub.olddepth; cx->blk_sub.cv = NULL; SvREFCNT_dec(cv); + PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix; } /* handle the @_ part of leaving a sub */ PERL_STATIC_INLINE void -S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) { AV *av; @@ -1454,17 +3683,20 @@ S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) CX_POP_SAVEARRAY(cx); av = MUTABLE_AV(PAD_SVl(0)); - if (UNLIKELY(AvREAL(av))) + if (!SvMAGICAL(av) && SvREFCNT(av) == 1 +#ifndef PERL_RC_STACK + && !AvREAL(av) +#endif + ) + clear_defarray_simple(av); + else /* abandon @_ if it got reified */ clear_defarray(av, 0); - else { - CLEAR_ARGARRAY(av); - } } PERL_STATIC_INLINE void -S_cx_popsub(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_POPSUB; assert(CxTYPE(cx) == CXt_SUB); @@ -1478,10 +3710,12 @@ S_cx_popsub(pTHX_ PERL_CONTEXT *cx) PERL_STATIC_INLINE void -S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) +Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) { PERL_ARGS_ASSERT_CX_PUSHFORMAT; + cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix; + PL_curstackinfo->si_cxsubix= (I32)(cx - PL_curstackinfo->si_cxstack); cx->blk_format.cv = cv; cx->blk_format.retop = retop; cx->blk_format.gv = gv; @@ -1496,7 +3730,7 @@ S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) PERL_STATIC_INLINE void -S_cx_popformat(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx) { CV *cv; GV *dfout; @@ -1510,19 +3744,18 @@ S_cx_popformat(pTHX_ PERL_CONTEXT *cx) SvREFCNT_dec_NN(dfout); PL_comppad = cx->blk_format.prevcomppad; - PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL; + PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL; cv = cx->blk_format.cv; cx->blk_format.cv = NULL; --CvDEPTH(cv); SvREFCNT_dec_NN(cv); + PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix; } PERL_STATIC_INLINE void -S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) +Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) { - PERL_ARGS_ASSERT_CX_PUSHEVAL; - cx->blk_eval.retop = retop; cx->blk_eval.old_namesv = namesv; cx->blk_eval.old_eval_root = PL_eval_root; @@ -1535,9 +3768,32 @@ S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7); } +PERL_STATIC_INLINE void +Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) +{ + PERL_ARGS_ASSERT_CX_PUSHEVAL; + + Perl_push_evalortry_common(aTHX_ cx, retop, namesv); + + cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; + PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack); +} + +PERL_STATIC_INLINE void +Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop) +{ + PERL_ARGS_ASSERT_CX_PUSHTRY; + + Perl_push_evalortry_common(aTHX_ cx, retop, NULL); + + /* Don't actually change it, just store the current value so it's restored + * by the common popeval */ + cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; +} + PERL_STATIC_INLINE void -S_cx_popeval(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx) { SV *sv; @@ -1558,6 +3814,7 @@ S_cx_popeval(pTHX_ PERL_CONTEXT *cx) cx->blk_eval.old_namesv = NULL; SvREFCNT_dec_NN(sv); } + PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix; } @@ -1569,7 +3826,7 @@ S_cx_popeval(pTHX_ PERL_CONTEXT *cx) */ PERL_STATIC_INLINE void -S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) +Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN; cx->blk_loop.my_op = cLOOP; @@ -1581,7 +3838,7 @@ S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) */ PERL_STATIC_INLINE void -S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) +Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) { PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR; @@ -1590,27 +3847,16 @@ S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) cx->blk_loop.itervar_u.svp = (SV**)itervarp; cx->blk_loop.itersave = itersave; -#ifdef USE_ITHREADS - cx->blk_loop.oldcomppad = PL_comppad; -#endif -} - - -PERL_STATIC_INLINE void -S_cx_pushloop_given(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) -{ - PERL_ARGS_ASSERT_CX_PUSHLOOP_GIVEN; - - cx->blk_loop.my_op = cLOOP; - cx->blk_loop.itervar_u.gv = PL_defgv; - cx->blk_loop.itersave = orig_defsv; +#ifdef USE_ITHREADS + cx->blk_loop.oldcomppad = PL_comppad; +#endif } /* pop all loop types, including plain */ PERL_STATIC_INLINE void -S_cx_poploop(pTHX_ PERL_CONTEXT *cx) +Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_POPLOOP; @@ -1639,23 +3885,25 @@ S_cx_poploop(pTHX_ PERL_CONTEXT *cx) cx->blk_loop.itersave = NULL; SvREFCNT_dec(cursv); } + if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF)) + SvREFCNT_dec(cx->blk_loop.itervar_u.svp); } PERL_STATIC_INLINE void -S_cx_pushwhereso(pTHX_ PERL_CONTEXT *cx) +Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) { - PERL_ARGS_ASSERT_CX_PUSHWHERESO; + PERL_ARGS_ASSERT_CX_PUSHWHEN; - cx->blk_whereso.leave_op = cLOGOP->op_other; + cx->blk_givwhen.leave_op = cLOGOP->op_other; } PERL_STATIC_INLINE void -S_cx_popwhereso(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx) { - PERL_ARGS_ASSERT_CX_POPWHERESO; - assert(CxTYPE(cx) == CXt_WHERESO); + PERL_ARGS_ASSERT_CX_POPWHEN; + assert(CxTYPE(cx) == CXt_WHEN); PERL_UNUSED_ARG(cx); PERL_UNUSED_CONTEXT; @@ -1663,10 +3911,187 @@ S_cx_popwhereso(pTHX_ PERL_CONTEXT *cx) } +PERL_STATIC_INLINE void +Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) +{ + PERL_ARGS_ASSERT_CX_PUSHGIVEN; + + cx->blk_givwhen.leave_op = cLOGOP->op_other; + cx->blk_givwhen.defsv_save = orig_defsv; +} + + +PERL_STATIC_INLINE void +Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx) +{ + SV *sv; + + PERL_ARGS_ASSERT_CX_POPGIVEN; + assert(CxTYPE(cx) == CXt_GIVEN); + + sv = GvSV(PL_defgv); + GvSV(PL_defgv) = cx->blk_givwhen.defsv_save; + cx->blk_givwhen.defsv_save = NULL; + SvREFCNT_dec(sv); +} + + +/* Make @_ empty in-place in simple cases: a cheap av_clear(). + * See Perl_clear_defarray() for non-simple cases */ + + +PERL_STATIC_INLINE void +Perl_clear_defarray_simple(pTHX_ AV *av) +{ + PERL_ARGS_ASSERT_CLEAR_DEFARRAY_SIMPLE; + + assert(SvTYPE(av) == SVt_PVAV); + assert(!SvREADONLY(av)); + assert(!SvMAGICAL(av)); + assert(SvREFCNT(av) == 1); + +#ifdef PERL_RC_STACK + assert(AvREAL(av)); + /* this code assumes that destructors called here can't free av + * itself, because pad[0] and/or CX pointers will keep it alive */ + SSize_t i = AvFILLp(av); + while (i >= 0) { + SV *sv = AvARRAY(av)[i]; + AvARRAY(av)[i--] = NULL; + SvREFCNT_dec(sv); + } +#else + assert(!AvREAL(av)); +#endif + AvFILLp(av) = -1; + Perl_av_remove_offset(aTHX_ av); +} + +/* Switch to a different argument stack. + * + * Note that it doesn't update PL_curstackinfo->si_stack_nonrc_base, + * so this should only be used as part of a general switching between + * stackinfos. + */ + +PERL_STATIC_INLINE void +Perl_switch_argstack(pTHX_ AV *to) +{ + PERL_ARGS_ASSERT_SWITCH_ARGSTACK; + + AvFILLp(PL_curstack) = PL_stack_sp - PL_stack_base; + PL_stack_base = AvARRAY(to); + PL_stack_max = PL_stack_base + AvMAX(to); + PL_stack_sp = PL_stack_base + AvFILLp(to); + PL_curstack = to; +} + + +/* Push, and switch to a new stackinfo, allocating one if none are spare, + * to get a fresh set of stacks. + * Update all the interpreter variables like PL_curstackinfo, + * PL_stack_sp, etc. + * current flag meanings: + * 1 make the new arg stack AvREAL + */ + + +PERL_STATIC_INLINE void +Perl_push_stackinfo(pTHX_ I32 type, UV flags) +{ + PERL_ARGS_ASSERT_PUSH_STACKINFO; + + PERL_SI *next = PL_curstackinfo->si_next; + DEBUG_l({ + int i = 0; PERL_SI *p = PL_curstackinfo; + while (p) { i++; p = p->si_prev; } + Perl_deb(aTHX_ "push STACKINFO %d in %s at %s:%d\n", + i, SAFE_FUNCTION__, __FILE__, __LINE__); + }) + + if (!next) { + next = new_stackinfo_flags(32, 2048/sizeof(PERL_CONTEXT) - 1, flags); + next->si_prev = PL_curstackinfo; + PL_curstackinfo->si_next = next; + } + next->si_type = type; + next->si_cxix = -1; + next->si_cxsubix = -1; + PUSHSTACK_INIT_HWM(next); +#ifdef PERL_RC_STACK + next->si_stack_nonrc_base = 0; +#endif + if (flags & 1) + AvREAL_on(next->si_stack); + else + AvREAL_off(next->si_stack); + AvFILLp(next->si_stack) = 0; + switch_argstack(next->si_stack); + PL_curstackinfo = next; + SET_MARK_OFFSET; +} + + +/* Pop, then switch to the previous stackinfo and set of stacks. + * Update all the interpreter variables like PL_curstackinfo, + * PL_stack_sp, etc. */ + +PERL_STATIC_INLINE void +Perl_pop_stackinfo(pTHX) +{ + PERL_ARGS_ASSERT_POP_STACKINFO; + + PERL_SI * const prev = PL_curstackinfo->si_prev; + DEBUG_l({ + int i = -1; PERL_SI *p = PL_curstackinfo; + while (p) { i++; p = p->si_prev; } + Perl_deb(aTHX_ "pop STACKINFO %d in %s at %s:%d\n", + i, SAFE_FUNCTION__, __FILE__, __LINE__);}) + if (!prev) { + Perl_croak_popstack(); + } + + switch_argstack(prev->si_stack); + /* don't free prev here, free them all at the END{} */ + PL_curstackinfo = prev; +} + + + +/* +=for apidoc newPADxVOP + +Constructs, checks and returns an op containing a pad offset. C is +the opcode, which should be one of C, C, C +or C. The returned op will have the C field set by +the C argument. + +This is convenient when constructing a large optree in nested function +calls, as it avoids needing to store the pad op directly to set the +C field as a side-effect. For example + + o = op_append_elem(OP_LINESEQ, o, + newPADxVOP(OP_PADSV, 0, padix)); + +=cut +*/ + +PERL_STATIC_INLINE OP * +Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix) +{ + PERL_ARGS_ASSERT_NEWPADXVOP; + + assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV + || type == OP_PADCV); + OP *o = newOP(type, flags); + o->op_targ = padix; + return o; +} + /* ------------------ util.h ------------------------------------------- */ /* -=head1 Miscellaneous Functions +=for apidoc_section $string =for apidoc foldEQ @@ -1680,8 +4105,10 @@ range bytes match only themselves. */ PERL_STATIC_INLINE I32 -Perl_foldEQ(const char *s1, const char *s2, I32 len) +Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len) { + PERL_UNUSED_CONTEXT; + const U8 *a = (const U8 *)s1; const U8 *b = (const U8 *)s2; @@ -1690,20 +4117,22 @@ Perl_foldEQ(const char *s1, const char *s2, I32 len) assert(len >= 0); while (len--) { - if (*a != *b && *a != PL_fold[*b]) - return 0; - a++,b++; + if (*a != *b && *a != PL_fold[*b]) + return 0; + a++,b++; } return 1; } PERL_STATIC_INLINE I32 -Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) +Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len) { - /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on - * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor - * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor - * does it check that the strings each have at least 'len' characters */ + /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds + * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and + * does not check for this. Nor does it check that the strings each have + * at least 'len' characters. */ + + PERL_UNUSED_CONTEXT; const U8 *a = (const U8 *)s1; const U8 *b = (const U8 *)s2; @@ -1713,15 +4142,16 @@ Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) assert(len >= 0); while (len--) { - if (*a != *b && *a != PL_fold_latin1[*b]) { - return 0; - } - a++, b++; + if (*a != *b && *a != PL_fold_latin1[*b]) { + return 0; + } + a++, b++; } return 1; } /* +=for apidoc_section $locale =for apidoc foldEQ_locale Returns true if the leading C bytes of the strings C and C are the @@ -1731,9 +4161,8 @@ same case-insensitively in the current locale; false otherwise. */ PERL_STATIC_INLINE I32 -Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) +Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len) { - dVAR; const U8 *a = (const U8 *)s1; const U8 *b = (const U8 *)s2; @@ -1742,13 +4171,50 @@ Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) assert(len >= 0); while (len--) { - if (*a != *b && *a != PL_fold_locale[*b]) - return 0; - a++,b++; + if (*a != *b && *a != PL_fold_locale[*b]) { + DEBUG_Lv(PerlIO_printf(Perl_debug_log, + "%s:%d: Our records indicate %02x is not a fold of %02x" + " or its mate %02x\n", + __FILE__, __LINE__, *a, *b, PL_fold_locale[*b])); + + return 0; + } + a++,b++; } return 1; } +/* +=for apidoc_section $string +=for apidoc my_strnlen + +The C library C if available, or a Perl implementation of it. + +C computes the length of the string, up to C +bytes. It will never attempt to address more than C +bytes, making it suitable for use with strings that are not +guaranteed to be NUL-terminated. + +=cut + +Description stolen from http://man.openbsd.org/strnlen.3, +implementation stolen from PostgreSQL. +*/ +#ifndef HAS_STRNLEN + +PERL_STATIC_INLINE Size_t +Perl_my_strnlen(const char *str, Size_t maxlen) +{ + const char *end = (char *) memchr(str, '\0', maxlen); + + PERL_ARGS_ASSERT_MY_STRNLEN; + + if (end == NULL) return maxlen; + return end - str; +} + +#endif + #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) PERL_STATIC_INLINE void * @@ -1772,6 +4238,484 @@ S_my_memrchr(const char * s, const char c, const STRLEN len) #endif +PERL_STATIC_INLINE char * +Perl_mortal_getenv(const char * str) +{ + /* This implements a (mostly) thread-safe, sequential-call-safe getenv(). + * + * It's (mostly) thread-safe because it uses a mutex to prevent other + * threads (that look at this mutex) from destroying the result before this + * routine has a chance to copy the result to a place that won't be + * destroyed before the caller gets a chance to handle it. That place is a + * mortal SV. khw chose this over SAVEFREEPV because he is under the + * impression that the SV will hang around longer under more circumstances + * + * The reason it isn't completely thread-safe is that other code could + * simply not pay attention to the mutex. All of the Perl core uses the + * mutex, but it is possible for code from, say XS, to not use this mutex, + * defeating the safety. + * + * getenv() returns, in some implementations, a pointer to a spot in the + * **environ array, which could be invalidated at any time by this or + * another thread changing the environment. Other implementations copy the + * **environ value to a static buffer, returning a pointer to that. That + * buffer might or might not be invalidated by a getenv() call in another + * thread. If it does get zapped, we need an exclusive lock. Otherwise, + * many getenv() calls can safely be running simultaneously, so a + * many-reader (but no simultaneous writers) lock is ok. There is a + * Configure probe to see if another thread destroys the buffer, and the + * mutex is defined accordingly. + * + * But in all cases, using the mutex prevents these problems, as long as + * all code uses the same mutex. + * + * A complication is that this can be called during phases where the + * mortalization process isn't available. These are in interpreter + * destruction or early in construction. khw believes that at these times + * there shouldn't be anything else going on, so plain getenv is safe AS + * LONG AS the caller acts on the return before calling it again. */ + + char * ret; + dTHX; + + PERL_ARGS_ASSERT_MORTAL_GETENV; + + /* Can't mortalize without stacks. khw believes that no other threads + * should be running, so no need to lock things, and this may be during a + * phase when locking isn't even available */ + if (UNLIKELY(PL_scopestack_ix == 0)) { + return getenv(str); + } + +#ifdef PERL_MEM_LOG + + /* A major complication arises under PERL_MEM_LOG. When that is active, + * every memory allocation may result in logging, depending on the value of + * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for + * saving ENV{foo}'s value (but before saving it), the logging code will + * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some + * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to + * lock a boolean mutex recursively); 3) destroying the getenv() static + * buffer; or 4) destroying the temporary created by this for the copy + * causes a log entry to be made which could cause a new temporary to be + * created, which will need to be destroyed at some point, leading to an + * infinite loop. + * + * The solution adopted here (after some gnashing of teeth) is to detect + * the recursive calls and calls from the logger, and treat them specially. + * Let's say we want to do getenv("foo"). We first find + * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter + * variable, so no temporary is required. Then we do getenv(foo), and in + * the process of creating a temporary to save it, this function will be + * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call, + * we detect that it is such a call and return our saved value instead of + * locking and doing a new getenv(). This solves all of problems 1), 2), + * and 3). Because all the getenv()s are done while the mutex is locked, + * the state cannot have changed. To solve 4), we don't create a temporary + * when this is called from the logging code. That code disposes of the + * return value while the mutex is still locked. + * + * The value of getenv(PERL_MEM_LOG) can be anything, but only initial + * digits and 3 particular letters are significant; the rest are ignored by + * the memory logging code. Thus the per-interpreter variable only needs + * to be large enough to save the significant information, the size of + * which is known at compile time. The first byte is extra, reserved for + * flags for our use. To protect against overflowing, only the reserved + * byte, as many digits as don't overflow, and the three letters are + * stored. + * + * The reserved byte has two bits: + * 0x1 if set indicates that if we get here, it is a recursive call of + * getenv() + * 0x2 if set indicates that the call is from the logging code. + * + * If the flag indicates this is a recursive call, just return the stored + * value of PL_mem_log; An empty value gets turned into NULL. */ + if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) { + if (PL_mem_log[1] == '\0') { + return NULL; + } else { + return PL_mem_log + 1; + } + } + +#endif + + GETENV_LOCK; + +#ifdef PERL_MEM_LOG + + /* Here we are in a critical section. As explained above, we do our own + * getenv(PERL_MEM_LOG), saving the result safely. */ + ret = getenv("PERL_MEM_LOG"); + if (ret == NULL) { /* No logging active */ + + /* Return that immediately if called from the logging code */ + if (PL_mem_log[0] & 0x2) { + GETENV_UNLOCK; + return NULL; + } + + PL_mem_log[1] = '\0'; + } + else { + char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */ + + /* There is nothing to prevent the value of PERL_MEM_LOG from being an + * extremely long string. But we want only a few characters from it. + * PL_mem_log has been made large enough to hold just the ones we need. + * First the file descriptor. */ + if (isDIGIT(*ret)) { + const char * s = ret; + if (UNLIKELY(*s == '0')) { + + /* Reduce multiple leading zeros to a single one. This is to + * allow the caller to change what to do with leading zeros. */ + *mem_log_meat++ = '0'; + s++; + while (*s == '0') { + s++; + } + } + + /* If the input overflows, copy just enough for the result to also + * overflow, plus 1 to make sure */ + while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) { + *mem_log_meat++ = *s++; + } + } + + /* Then each of the four significant characters */ + if (strchr(ret, 'm')) { + *mem_log_meat++ = 'm'; + } + if (strchr(ret, 's')) { + *mem_log_meat++ = 's'; + } + if (strchr(ret, 't')) { + *mem_log_meat++ = 't'; + } + if (strchr(ret, 'c')) { + *mem_log_meat++ = 'c'; + } + *mem_log_meat = '\0'; + + assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log)); + } + + /* If we are being called from the logger, it only needs the significant + * portion of PERL_MEM_LOG, and doesn't need a safe copy */ + if (PL_mem_log[0] & 0x2) { + assert(strEQ(str, "PERL_MEM_LOG")); + GETENV_UNLOCK; + return PL_mem_log + 1; + } + + /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that + * is coming from other than the logging code, so it should be treated the + * same as any other getenv(), returning the full value, not just the + * significant part, and having its value saved. Set the flag that + * indicates any call to this routine will be a recursion from here */ + PL_mem_log[0] = 0x1; + +#endif + + /* Now get the value of the real desired variable, and save a copy */ + ret = getenv(str); + + if (ret != NULL) { + ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) ); + } + + GETENV_UNLOCK; + +#ifdef PERL_MEM_LOG + + /* Clear the buffer */ + Zero(PL_mem_log, sizeof(PL_mem_log), char); + +#endif + + return ret; +} + +PERL_STATIC_INLINE bool +Perl_sv_isbool(pTHX_ const SV *sv) +{ + PERL_UNUSED_CONTEXT; + return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv); +} + +#ifdef USE_ITHREADS + +PERL_STATIC_INLINE AV * +Perl_cop_file_avn(pTHX_ const COP *cop) { + + PERL_ARGS_ASSERT_COP_FILE_AVN; + + const char *file = CopFILE(cop); + if (file) { + GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD); + if (gv) { + return GvAVn(gv); + } + else + return NULL; + } + else + return NULL; +} + +#endif + +PERL_STATIC_INLINE PADNAME * +Perl_padname_refcnt_inc(PADNAME *pn) +{ + PadnameREFCNT(pn)++; + return pn; +} + +PERL_STATIC_INLINE PADNAMELIST * +Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl) +{ + PadnamelistREFCNT(pnl)++; + return pnl; +} + +/* copy a string to a safe spot */ + +/* +=for apidoc_section $string +=for apidoc savepv + +Perl's version of C. Returns a pointer to a newly allocated +string which is a duplicate of C. The size of the string is +determined by C, which means it may not contain embedded C +characters and must have a trailing C. To prevent memory leaks, the +memory allocated for the new string needs to be freed when no longer needed. +This can be done with the C> function, or +L|perlguts/SAVEFREEPV(p)>. + +On some platforms, Windows for example, all allocated memory owned by a thread +is deallocated when that thread ends. So if you need that not to happen, you +need to use the shared memory functions, such as C>. + +=cut +*/ + +PERL_STATIC_INLINE char * +Perl_savepv(pTHX_ const char *pv) +{ + PERL_UNUSED_CONTEXT; + if (!pv) + return NULL; + else { + char *newaddr; + const STRLEN pvlen = strlen(pv)+1; + Newx(newaddr, pvlen, char); + return (char*)memcpy(newaddr, pv, pvlen); + } +} + +/* same thing but with a known length */ + +/* +=for apidoc savepvn + +Perl's version of what C would be if it existed. Returns a +pointer to a newly allocated string which is a duplicate of the first +C bytes from C, plus a trailing +C byte. The memory allocated for +the new string can be freed with the C function. + +On some platforms, Windows for example, all allocated memory owned by a thread +is deallocated when that thread ends. So if you need that not to happen, you +need to use the shared memory functions, such as C>. + +=cut +*/ + +PERL_STATIC_INLINE char * +Perl_savepvn(pTHX_ const char *pv, Size_t len) +{ + char *newaddr; + PERL_UNUSED_CONTEXT; + + Newx(newaddr,len+1,char); + /* Give a meaning to NULL pointer mainly for the use in sv_magic() */ + if (pv) { + /* might not be null terminated */ + newaddr[len] = '\0'; + return (char *) CopyD(pv,newaddr,len,char); + } + else { + return (char *) ZeroD(newaddr,len+1,char); + } +} + +/* +=for apidoc savesvpv + +A version of C/C which gets the string to duplicate from +the passed in SV using C + +On some platforms, Windows for example, all allocated memory owned by a thread +is deallocated when that thread ends. So if you need that not to happen, you +need to use the shared memory functions, such as C>. + +=cut +*/ + +PERL_STATIC_INLINE char * +Perl_savesvpv(pTHX_ SV *sv) +{ + STRLEN len; + const char * const pv = SvPV_const(sv, len); + char *newaddr; + + PERL_ARGS_ASSERT_SAVESVPV; + + ++len; + Newx(newaddr,len,char); + return (char *) CopyD(pv,newaddr,len,char); +} + +/* +=for apidoc savesharedsvpv + +A version of C which allocates the duplicate string in +memory which is shared between threads. + +=cut +*/ + +PERL_STATIC_INLINE char * +Perl_savesharedsvpv(pTHX_ SV *sv) +{ + STRLEN len; + const char * const pv = SvPV_const(sv, len); + + PERL_ARGS_ASSERT_SAVESHAREDSVPV; + + return savesharedpvn(pv, len); +} + +#ifndef PERL_GET_CONTEXT_DEFINED + +/* +=for apidoc_section $embedding +=for apidoc get_context + +Implements L>, which you should use instead. + +=cut +*/ + +PERL_STATIC_INLINE void * +Perl_get_context(void) +{ +# if defined(USE_ITHREADS) +# ifdef OLD_PTHREADS_API + pthread_addr_t t; + int error = pthread_getspecific(PL_thr_key, &t); + if (error) + Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error); + return (void*)t; +# elif defined(I_MACH_CTHREADS) + return (void*)cthread_data(cthread_self()); +# else + return (void*)PTHREAD_GETSPECIFIC(PL_thr_key); +# endif +# else + return (void*)NULL; +# endif +} + +#endif + +PERL_STATIC_INLINE MGVTBL* +Perl_get_vtbl(pTHX_ int vtbl_id) +{ + PERL_UNUSED_CONTEXT; + + return (vtbl_id < 0 || vtbl_id >= magic_vtable_max) + ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id; +} + +/* +=for apidoc my_strlcat + +The C library C if available, or a Perl implementation of it. +This operates on C C-terminated strings. + +C appends string C to the end of C. It will append at +most S> bytes. It will then C-terminate, +unless C is 0 or the original C string was longer than C (in +practice this should not happen as it means that either C is incorrect or +that C is not a proper C-terminated string). + +Note that C is the full size of the destination buffer and +the result is guaranteed to be C-terminated if there is room. Note that +room for the C should be included in C. + +The return value is the total length that C would have if C is +sufficiently large. Thus it is the initial length of C plus the length of +C. If C is smaller than the return, the excess was not appended. + +=cut + +Description stolen from http://man.openbsd.org/strlcat.3 +*/ +#ifndef HAS_STRLCAT +PERL_STATIC_INLINE Size_t +Perl_my_strlcat(char *dst, const char *src, Size_t size) +{ + Size_t used, length, copy; + + used = strlen(dst); + length = strlen(src); + if (size > 0 && used < size - 1) { + copy = (length >= size - used) ? size - used - 1 : length; + memcpy(dst + used, src, copy); + dst[used + copy] = '\0'; + } + return used + length; +} +#endif + + +/* +=for apidoc my_strlcpy + +The C library C if available, or a Perl implementation of it. +This operates on C C-terminated strings. + +C copies up to S> bytes from the string C +to C, C-terminating the result if C is not 0. + +The return value is the total length C would be if the copy completely +succeeded. If it is larger than C, the excess was not copied. + +=cut + +Description stolen from http://man.openbsd.org/strlcpy.3 +*/ +#ifndef HAS_STRLCPY +PERL_STATIC_INLINE Size_t +Perl_my_strlcpy(char *dst, const char *src, Size_t size) +{ + Size_t length, copy; + + length = strlen(src); + if (size > 0) { + copy = (length >= size) ? size - 1 : length; + memcpy(dst, src, copy); + dst[copy] = '\0'; + } + return length; +} +#endif + /* * ex: set ts=8 sts=4 sw=4 et: */