X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/5eabe374afae7b84aa4ba5e13e5424865bc74fc1..36f453d19563f9476d4310b8310ce4080209b04f:/inline.h diff --git a/inline.h b/inline.h index af26cf5..3b34ad4 100644 --- a/inline.h +++ b/inline.h @@ -5,8 +5,33 @@ * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. * + * This file contains tables and code adapted from + * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this + * copyright notice: + +Copyright (c) 2008-2009 Bjoern Hoehrmann + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + * * This file is a home for static inline functions that cannot go in other - * headers files, because they depend on proto.h (included after most other + * header files, because they depend on proto.h (included after most other * headers) or struct definitions. * * Each section names the header file that the functions "belong" to. @@ -14,29 +39,49 @@ /* ------------------------------- av.h ------------------------------- */ -PERL_STATIC_INLINE SSize_t -S_av_top_index(pTHX_ AV *av) +/* +=for apidoc_section $AV +=for apidoc av_count +Returns the number of elements in the array C. This is the true length of +the array, including any undefined elements. It is always the same as +S>. + +=cut +*/ +PERL_STATIC_INLINE Size_t +Perl_av_count(pTHX_ AV *av) { - PERL_ARGS_ASSERT_AV_TOP_INDEX; + PERL_ARGS_ASSERT_AV_COUNT; assert(SvTYPE(av) == SVt_PVAV); - return AvFILL(av); + return AvFILL(av) + 1; } /* ------------------------------- cv.h ------------------------------- */ +/* +=for apidoc_section $CV +=for apidoc CvGV +Returns the GV associated with the CV C, reifying it if necessary. + +=cut +*/ PERL_STATIC_INLINE GV * -S_CvGV(pTHX_ CV *sv) +Perl_CvGV(pTHX_ CV *sv) { + PERL_ARGS_ASSERT_CVGV; + return CvNAMED(sv) ? Perl_cvgv_from_hek(aTHX_ sv) : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv; } PERL_STATIC_INLINE I32 * -S_CvDEPTHp(const CV * const sv) +Perl_CvDEPTH(const CV * const sv) { + PERL_ARGS_ASSERT_CVDEPTH; assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM); + return &((XPVCV*)SvANY(sv))->xcv_depth; } @@ -94,8 +139,10 @@ S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len) #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C) PERL_STATIC_INLINE bool -PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) +S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) { + PERL_ARGS_ASSERT_PADNAMEIN_SCOPE; + /* is seq within the range _LOW to _HIGH ? * This is complicated by the fact that PL_cop_seqmax * may have wrapped around at some point */ @@ -128,7 +175,7 @@ PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq) /* ------------------------------- pp.h ------------------------------- */ PERL_STATIC_INLINE I32 -S_TOPMARK(pTHX) +Perl_TOPMARK(pTHX) { DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, "MARK top %p %" IVdf "\n", @@ -138,7 +185,7 @@ S_TOPMARK(pTHX) } PERL_STATIC_INLINE I32 -S_POPMARK(pTHX) +Perl_POPMARK(pTHX) { DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log, "MARK pop %p %" IVdf "\n", @@ -151,37 +198,50 @@ S_POPMARK(pTHX) /* ----------------------------- regexp.h ----------------------------- */ PERL_STATIC_INLINE struct regexp * -S_ReANY(const REGEXP * const re) +Perl_ReANY(const REGEXP * const re) { XPV* const p = (XPV*)SvANY(re); + + PERL_ARGS_ASSERT_REANY; assert(isREGEXP(re)); + return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx : (struct regexp *)p; } /* ------------------------------- sv.h ------------------------------- */ +PERL_STATIC_INLINE bool +Perl_SvTRUE(pTHX_ SV *sv) { + if (UNLIKELY(sv == NULL)) + return FALSE; + SvGETMAGIC(sv); + return SvTRUE_nomg_NN(sv); +} + PERL_STATIC_INLINE SV * -S_SvREFCNT_inc(SV *sv) +Perl_SvREFCNT_inc(SV *sv) { if (LIKELY(sv != NULL)) SvREFCNT(sv)++; return sv; } PERL_STATIC_INLINE SV * -S_SvREFCNT_inc_NN(SV *sv) +Perl_SvREFCNT_inc_NN(SV *sv) { + PERL_ARGS_ASSERT_SVREFCNT_INC_NN; + SvREFCNT(sv)++; return sv; } PERL_STATIC_INLINE void -S_SvREFCNT_inc_void(SV *sv) +Perl_SvREFCNT_inc_void(SV *sv) { if (LIKELY(sv != NULL)) SvREFCNT(sv)++; } PERL_STATIC_INLINE void -S_SvREFCNT_dec(pTHX_ SV *sv) +Perl_SvREFCNT_dec(pTHX_ SV *sv) { if (LIKELY(sv != NULL)) { U32 rc = SvREFCNT(sv); @@ -193,9 +253,12 @@ S_SvREFCNT_dec(pTHX_ SV *sv) } PERL_STATIC_INLINE void -S_SvREFCNT_dec_NN(pTHX_ SV *sv) +Perl_SvREFCNT_dec_NN(pTHX_ SV *sv) { U32 rc = SvREFCNT(sv); + + PERL_ARGS_ASSERT_SVREFCNT_DEC_NN; + if (LIKELY(rc > 1)) SvREFCNT(sv) = rc - 1; else @@ -203,26 +266,30 @@ S_SvREFCNT_dec_NN(pTHX_ SV *sv) } PERL_STATIC_INLINE void -SvAMAGIC_on(SV *sv) +Perl_SvAMAGIC_on(SV *sv) { + PERL_ARGS_ASSERT_SVAMAGIC_ON; assert(SvROK(sv)); + if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv))); } PERL_STATIC_INLINE void -SvAMAGIC_off(SV *sv) +Perl_SvAMAGIC_off(SV *sv) { + PERL_ARGS_ASSERT_SVAMAGIC_OFF; + if (SvROK(sv) && SvOBJECT(SvRV(sv))) HvAMAGIC_off(SvSTASH(SvRV(sv))); } PERL_STATIC_INLINE U32 -S_SvPADSTALE_on(SV *sv) +Perl_SvPADSTALE_on(SV *sv) { assert(!(SvFLAGS(sv) & SVs_PADTMP)); return SvFLAGS(sv) |= SVs_PADSTALE; } PERL_STATIC_INLINE U32 -S_SvPADSTALE_off(SV *sv) +Perl_SvPADSTALE_off(SV *sv) { assert(!(SvFLAGS(sv) & SVs_PADTMP)); return SvFLAGS(sv) &= ~SVs_PADSTALE; @@ -241,29 +308,14 @@ S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp) } #endif -/* ------------------------------- handy.h ------------------------------- */ - -/* saves machine code for a common noreturn idiom typically used in Newx*() */ -#ifdef GCC_DIAG_PRAGMA -GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */ -#endif -static void -S_croak_memory_wrap(void) -{ - Perl_croak_nocontext("%s",PL_memory_wrap); -} -#ifdef GCC_DIAG_PRAGMA -GCC_DIAG_RESTORE /* Intentionally left semicolonless. */ -#endif - /* ------------------------------- utf8.h ------------------------------- */ /* -=head1 Unicode Support +=for apidoc_section $unicode */ PERL_STATIC_INLINE void -S_append_utf8_from_native_byte(const U8 byte, U8** dest) +Perl_append_utf8_from_native_byte(const U8 byte, U8** dest) { /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8 * encoded string at '*dest', updating '*dest' to include it */ @@ -280,10 +332,10 @@ S_append_utf8_from_native_byte(const U8 byte, U8** dest) /* =for apidoc valid_utf8_to_uvchr -Like C>, but should only be called when it is known that -the next character in the input UTF-8 string C is well-formed (I, -it passes C>. Surrogates, non-character code points, and -non-Unicode code points are allowed. +Like C>, but should only be called when it is +known that the next character in the input UTF-8 string C is well-formed +(I, it passes C>. Surrogates, non-character code +points, and non-Unicode code points are allowed. =cut @@ -373,7 +425,7 @@ UTF-8 invariant, this function does not change the contents of C<*ep>. */ PERL_STATIC_INLINE bool -S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) +Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) { const U8* send; const U8* x = s; @@ -386,25 +438,25 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) send = s + len; -#ifndef EBCDIC - /* This looks like 0x010101... */ -#define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) +# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) /* This looks like 0x808080... */ -#define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) -#define PERL_WORDSIZE sizeof(PERL_COUNT_MULTIPLIER) -#define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) +# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) +# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T) +# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by * or'ing together the lowest bits of 'x'. Hopefully the final term gets * optimized out completely on a 32-bit system, and its mask gets optimized out * on a 64-bit system */ -#define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ +# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ | ( PTR2nat(x) >> 1) \ | ( ( (PTR2nat(x) \ & PERL_WORD_BOUNDARY_MASK) >> 2)))) +#ifndef EBCDIC + /* Do the word-at-a-time iff there is at least one usable full word. That * means that after advancing to a word boundary, there still is at least a * full word left. The number of bytes needed to advance is 'wordsize - @@ -442,14 +494,27 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) return FALSE; } - /* Otherwise fall into final loop to find which byte it is */ +# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \ + || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 + + *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x); + assert(*ep >= s && *ep < send); + + return FALSE; + +# else /* If weird byte order, drop into next loop to do byte-at-a-time + checks. */ + break; +# endif } + x += PERL_WORDSIZE; + } while (x + PERL_WORDSIZE <= send); } -#endif +#endif /* End of ! EBCDIC */ /* Process per-byte */ while (x < send) { @@ -467,6 +532,99 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) return TRUE; } +#ifndef EBCDIC + +PERL_STATIC_INLINE unsigned int +Perl_variant_byte_number(PERL_UINTMAX_T word) +{ + + /* This returns the position in a word (0..7) of the first variant byte in + * it. This is a helper function. Note that there are no branches */ + + assert(word); + + /* Get just the msb bits of each byte */ + word &= PERL_VARIANTS_WORD_MASK; + +# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 + + /* Bytes are stored like + * Byte8 ... Byte2 Byte1 + * 63..56...15...8 7...0 + * + * Isolate the lsb; + * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set + * + * The word will look like this, with a rightmost set bit in position 's': + * ('x's are don't cares) + * s + * x..x100..0 + * x..xx10..0 Right shift (rightmost 0 is shifted off) + * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and + * the 1 just to their left into a 0; the remainder is + * untouched + * 0..0011..1 The xor with the original, x..xx10..0, clears that + * remainder, sets the bottom to all 1 + * 0..0100..0 Add 1 to clear the word except for the bit in 's' + * + * Another method is to do 'word &= -word'; but it generates a compiler + * message on some platforms about taking the negative of an unsigned */ + + word >>= 1; + word = 1 + (word ^ (word - 1)); + +# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 + + /* Bytes are stored like + * Byte1 Byte2 ... Byte8 + * 63..56 55..47 ... 7...0 + * + * Isolate the msb; http://codeforces.com/blog/entry/10330 + * + * Only the most significant set bit matters. Or'ing word with its right + * shift of 1 makes that bit and the next one to its right both 1. Then + * right shifting by 2 makes for 4 1-bits in a row. ... We end with the + * msb and all to the right being 1. */ + word |= word >> 1; + word |= word >> 2; + word |= word >> 4; + word |= word >> 8; + word |= word >> 16; + word |= word >> 32; /* This should get optimized out on 32-bit systems. */ + + /* Then subtracting the right shift by 1 clears all but the left-most of + * the 1 bits, which is our desired result */ + word -= (word >> 1); + +# else +# error Unexpected byte order +# endif + + /* Here 'word' has a single bit set: the msb of the first byte in which it + * is set. Calculate that position in the word. We can use this + * specialized solution: https://stackoverflow.com/a/32339674/1626653, + * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should + * just get shifted off at compile time) */ + word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48) + | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32) + | (39 << 24) | (47 << 16) + | (55 << 8) | (63 << 0)); + word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */ + + /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */ + word = ((word + 1) >> 3) - 1; + +# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 + + /* And invert the result */ + word = CHARBITS - word - 1; + +# endif + + return (unsigned int) word; +} + +#endif #if defined(PERL_CORE) || defined(PERL_EXT) /* @@ -506,6 +664,8 @@ S_variant_under_utf8_count(const U8* const s, const U8* const e) # ifndef EBCDIC + /* Test if the string is long enough to use word-at-a-time. (Logic is the + * same as for is_utf8_invariant_string()) */ if ((STRLEN) (e - x) >= PERL_WORDSIZE + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) @@ -520,9 +680,11 @@ S_variant_under_utf8_count(const U8* const s, const U8* const e) /* Process per-word as long as we have at least a full word left */ do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an explanation of how this works */ - count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) + PERL_UINTMAX_T increment + = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) * PERL_COUNT_MULTIPLIER) >> ((PERL_WORDSIZE - 1) * CHARBITS); + count += (Size_t) increment; x += PERL_WORDSIZE; } while (x + PERL_WORDSIZE <= e); } @@ -543,10 +705,12 @@ S_variant_under_utf8_count(const U8* const s, const U8* const e) #endif -#undef PERL_WORDSIZE -#undef PERL_COUNT_MULTIPLIER -#undef PERL_WORD_BOUNDARY_MASK -#undef PERL_VARIANTS_WORD_MASK +#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */ +# undef PERL_WORDSIZE +# undef PERL_COUNT_MULTIPLIER +# undef PERL_WORD_BOUNDARY_MASK +# undef PERL_VARIANTS_WORD_MASK +#endif /* =for apidoc is_utf8_string @@ -598,8 +762,8 @@ C> =cut This is commonly used to determine if a SV's UTF-8 flag should be turned on. -It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if -it otherwise contains invalid UTF-8. +It generally needn't be if its string is entirely UTF-8 invariant, and it +shouldn't be if it otherwise contains invalid UTF-8. It is an internal function because khw thinks that XS code shouldn't be working at this low a level. A valid use case could change that. @@ -607,7 +771,7 @@ at this low a level. A valid use case could change that. */ PERL_STATIC_INLINE bool -S_is_utf8_non_invariant_string(const U8* const s, STRLEN len) +Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len) { const U8 * first_variant; @@ -741,7 +905,7 @@ C>. */ PERL_STATIC_INLINE bool -S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) +Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) { const U8 * first_variant; @@ -863,6 +1027,206 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) /* +=for apidoc isUTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8, as extended by Perl, +that represents some code point; otherwise it evaluates to 0. If non-zero, the +value gives how many bytes starting at C comprise the code point's +representation. Any bytes remaining before C, but beyond the ones needed to +form the first code point in C, are not examined. + +The code point can be any that will fit in an IV on this machine, using Perl's +extension to official UTF-8 to represent those higher than the Unicode maximum +of 0x10FFFF. That means that this macro is used to efficiently decide if the +next few bytes in C is legal UTF-8 for a single character. + +Use C> to restrict the acceptable code points to those +defined by Unicode to be fully interchangeable across applications; +C> to use the L definition of allowable +code points; and C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC +machines) is a valid UTF-8 character. + +=cut + +This uses an adaptation of the table and algorithm given in +https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adapation is +documented at the definition of PL_extended_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISUTF8_CHAR; + + /* This dfa is fast. If it accepts the input, it was for a well-formed, + * code point, which can be returned immediately. Otherwise, it is either + * malformed, or for the start byte FF which the dfa doesn't handle (except + * on 32-bit ASCII platforms where it trivially is an error). Call a + * helper function for the other platforms. */ + + while (s < e && LIKELY(state != 1)) { + state = PL_extended_utf8_dfa_tab[256 + + state + + PL_extended_utf8_dfa_tab[*s]]; + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + +#if defined(UV_IS_QUAD) || defined(EBCDIC) + + if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) { + return is_utf8_char_helper(s0, e, 0); + } + +#endif + + return 0; +} + +/* + +=for apidoc isSTRICT_UTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8 that represents some +Unicode code point completely acceptable for open interchange between all +applications; otherwise it evaluates to 0. If non-zero, the value gives how +many bytes starting at C comprise the code point's representation. Any +bytes remaining before C, but beyond the ones needed to form the first code +point in C, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not +be a surrogate nor a non-character code point. Thus this excludes any code +point from Perl's extended UTF-8. + +This is used to efficiently decide if the next few bytes in C is +legal Unicode-acceptable UTF-8 for a single character. + +Use C> to use the L definition of allowable +code points; C> to check for Perl's extended UTF-8; +and C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adapation is +documented at the definition of strict_extended_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + +#ifndef EBCDIC + + /* The dfa above drops out for certain Hanguls; handle them specially */ + if (is_HANGUL_ED_utf8_safe(s0, e)) { + return 3; + } + +#endif + + return 0; +} + +/* + +=for apidoc isC9_STRICT_UTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8 that represents some +Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, +the value gives how many bytes starting at C comprise the code point's +representation. Any bytes remaining before C, but beyond the ones needed to +form the first code point in C, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF. This +differs from C> only in that it accepts non-character +code points. This corresponds to +L. +which said that non-character code points are merely discouraged rather than +completely forbidden in open interchange. See +L. + +Use C> to check for Perl's extended UTF-8; and +C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adapation is +documented at the definition of PL_c9_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + + return 0; +} + +/* + =for apidoc is_strict_utf8_string_loc Like C> but stores the location of the failure (in the @@ -892,7 +1256,7 @@ See also C>. */ PERL_STATIC_INLINE bool -S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { const U8 * first_variant; @@ -969,7 +1333,7 @@ See also C>. */ PERL_STATIC_INLINE bool -S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { const U8 * first_variant; @@ -1051,7 +1415,7 @@ See also C>. */ PERL_STATIC_INLINE bool -S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) +Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) { const U8 * first_variant; @@ -1168,9 +1532,9 @@ Perl_utf8_hop(const U8 *s, SSize_t off) s--; } } - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE } /* @@ -1205,16 +1569,16 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) while (off--) { STRLEN skip = UTF8SKIP(s); if ((STRLEN)(end - s) <= skip) { - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)end; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE } s += skip; } - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE } /* @@ -1247,14 +1611,14 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) assert(off <= 0); while (off++ && s > start) { - s--; - while (UTF8_IS_CONTINUATION(*s) && s > start) + do { s--; + } while (UTF8_IS_CONTINUATION(*s) && s > start); } - - GCC_DIAG_IGNORE(-Wcast-qual); + + GCC_DIAG_IGNORE(-Wcast-qual) return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE } /* @@ -1341,7 +1705,7 @@ determined from just the first one or two bytes. */ PERL_STATIC_INLINE bool -S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags) +Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags) { PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS; @@ -1352,7 +1716,7 @@ S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const return FALSE; } - return cBOOL(_is_utf8_char_helper(s, e, flags)); + return cBOOL(is_utf8_char_helper(s, e, flags)); } /* @@ -1409,7 +1773,7 @@ complete, valid characters found in the C pointer. */ PERL_STATIC_INLINE bool -S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, +Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, STRLEN len, const U8 **ep, STRLEN *el, @@ -1429,25 +1793,118 @@ S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, || is_utf8_valid_partial_char_flags(*ep, s + len, flags); } +PERL_STATIC_INLINE UV +Perl_utf8n_to_uvchr_msgs(const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors, + AV ** msgs) +{ + /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the + * simple cases, and, if necessary calls a helper function to deal with the + * more complex ones. Almost all well-formed non-problematic code points + * are considered simple, so that it's unlikely that the helper function + * will need to be called. + * + * This is an adaptation of the tables and algorithm given in + * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides + * comprehensive documentation of the original version. A copyright notice + * for the original version is given at the beginning of this file. The + * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[]. + */ + + const U8 * const s0 = s; + const U8 * send = s0 + curlen; + UV uv = 0; /* The 0 silences some stupid compilers */ + UV state = 0; + + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; + + /* This dfa is fast. If it accepts the input, it was for a well-formed, + * non-problematic code point, which can be returned immediately. + * Otherwise we call a helper function to figure out the more complicated + * cases. */ + + while (s < send && LIKELY(state != 1)) { + UV type = PL_strict_utf8_dfa_tab[*s]; + + uv = (state == 0) + ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s)) + : UTF8_ACCUMULATE(uv, *s); + state = PL_strict_utf8_dfa_tab[256 + state + type]; + + if (state != 0) { + s++; + continue; + } + + if (retlen) { + *retlen = s - s0 + 1; + } + if (errors) { + *errors = 0; + } + if (msgs) { + *msgs = NULL; + } + + return UNI_TO_NATIVE(uv); + } + + /* Here is potentially problematic. Use the full mechanism */ + return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs); +} + +PERL_STATIC_INLINE UV +Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER; + + assert(s < send); + + if (! ckWARN_d(WARN_UTF8)) { + + /* EMPTY is not really allowed, and asserts on debugging builds. But + * on non-debugging we have to deal with it, and this causes it to + * return the REPLACEMENT CHARACTER, as the documentation indicates */ + return utf8n_to_uvchr(s, send - s, retlen, + (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)); + } + else { + UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); + if (retlen && ret == 0 && *s != '\0') { + *retlen = (STRLEN) -1; + } + + return ret; + } +} + /* ------------------------------- perl.h ----------------------------- */ /* -=head1 Miscellaneous Functions +=for apidoc_section $utility -=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name +=for apidoc is_safe_syscall -Test that the given C doesn't contain any internal C characters. -If it does, set C to C, optionally warn, and return FALSE. +Test that the given C (with length C) doesn't contain any internal +C characters. +If it does, set C to C, optionally warn using the C +category, and return FALSE. Return TRUE if the name is safe. +C and C are used in any warning. + Used by the C macro. =cut */ PERL_STATIC_INLINE bool -S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) { +Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) +{ /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs * perl itself uses xce*() functions which accept 8-bit strings. */ @@ -1488,7 +1945,8 @@ then calling: #ifdef PERL_CORE PERL_STATIC_INLINE bool -S_should_warn_nl(const char *pv) { +S_should_warn_nl(const char *pv) +{ STRLEN len; PERL_ARGS_ASSERT_SHOULD_WARN_NL; @@ -1500,13 +1958,53 @@ S_should_warn_nl(const char *pv) { #endif +#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C) + +PERL_STATIC_INLINE bool +S_lossless_NV_to_IV(const NV nv, IV *ivp) +{ + /* This function determines if the input NV 'nv' may be converted without + * loss of data to an IV. If not, it returns FALSE taking no other action. + * But if it is possible, it does the conversion, returning TRUE, and + * storing the converted result in '*ivp' */ + + PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV; + +# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) + /* Normally any comparison with a NaN returns false; if we can't rely + * on that behaviour, check explicitly */ + if (UNLIKELY(Perl_isnan(nv))) { + return FALSE; + } +# endif + + /* Written this way so that with an always-false NaN comparison we + * return false */ + if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) { + return FALSE; + } + + if ((IV) nv != nv) { + return FALSE; + } + + *ivp = (IV) nv; + return TRUE; +} + +#endif + /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ +#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) + #define MAX_CHARSET_NAME_LENGTH 2 PERL_STATIC_INLINE const char * -get_regex_charset_name(const U32 flags, STRLEN* const lenp) +S_get_regex_charset_name(const U32 flags, STRLEN* const lenp) { + PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME; + /* Returns a string that corresponds to the name of the regex character set * given by 'flags', and *lenp is set the length of that string, which * cannot exceed MAX_CHARSET_NAME_LENGTH characters */ @@ -1527,6 +2025,8 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) return "?"; /* Unknown */ } +#endif + /* Return false if any get magic is on the SV other than taint magic. @@ -1534,7 +2034,8 @@ Return false if any get magic is on the SV other than taint magic. */ PERL_STATIC_INLINE bool -S_sv_only_taint_gmagic(SV *sv) { +Perl_sv_only_taint_gmagic(SV *sv) +{ MAGIC *mg = SvMAGIC(sv); PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC; @@ -1553,11 +2054,28 @@ S_sv_only_taint_gmagic(SV *sv) { /* ------------------ cop.h ------------------------------------------- */ +/* implement GIMME_V() macro */ + +PERL_STATIC_INLINE U8 +Perl_gimme_V(pTHX) +{ + I32 cxix; + U8 gimme = (PL_op->op_flags & OPf_WANT); + + if (gimme) + return gimme; + cxix = PL_curstackinfo->si_cxsubix; + if (cxix < 0) + return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID; + assert(cxstack[cxix].blk_gimme & G_WANT); + return (cxstack[cxix].blk_gimme & G_WANT); +} + /* Enter a block. Push a new base context and return its address. */ PERL_STATIC_INLINE PERL_CONTEXT * -S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) +Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) { PERL_CONTEXT * cx; @@ -1584,7 +2102,7 @@ S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix) /* Exit a block (RETURN and LAST). */ PERL_STATIC_INLINE void -S_cx_popblock(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_POPBLOCK; @@ -1609,7 +2127,7 @@ S_cx_popblock(pTHX_ PERL_CONTEXT *cx) * *after* cx_pushblock() was called. */ PERL_STATIC_INLINE void -S_cx_topblock(pTHX_ PERL_CONTEXT *cx) +Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_TOPBLOCK; @@ -1624,13 +2142,15 @@ S_cx_topblock(pTHX_ PERL_CONTEXT *cx) PERL_STATIC_INLINE void -S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) +Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) { U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub); PERL_ARGS_ASSERT_CX_PUSHSUB; PERL_DTRACE_PROBE_ENTRY(cv); + cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix; + PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack; cx->blk_sub.cv = cv; cx->blk_sub.olddepth = CvDEPTH(cv); cx->blk_sub.prevcomppad = PL_comppad; @@ -1644,7 +2164,7 @@ S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs) /* subsets of cx_popsub() */ PERL_STATIC_INLINE void -S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) { CV *cv; @@ -1657,13 +2177,14 @@ S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx) CvDEPTH(cv) = cx->blk_sub.olddepth; cx->blk_sub.cv = NULL; SvREFCNT_dec(cv); + PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix; } /* handle the @_ part of leaving a sub */ PERL_STATIC_INLINE void -S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) { AV *av; @@ -1685,7 +2206,7 @@ S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx) PERL_STATIC_INLINE void -S_cx_popsub(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_POPSUB; assert(CxTYPE(cx) == CXt_SUB); @@ -1699,10 +2220,12 @@ S_cx_popsub(pTHX_ PERL_CONTEXT *cx) PERL_STATIC_INLINE void -S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) +Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) { PERL_ARGS_ASSERT_CX_PUSHFORMAT; + cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix; + PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack; cx->blk_format.cv = cv; cx->blk_format.retop = retop; cx->blk_format.gv = gv; @@ -1717,7 +2240,7 @@ S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv) PERL_STATIC_INLINE void -S_cx_popformat(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx) { CV *cv; GV *dfout; @@ -1736,14 +2259,17 @@ S_cx_popformat(pTHX_ PERL_CONTEXT *cx) cx->blk_format.cv = NULL; --CvDEPTH(cv); SvREFCNT_dec_NN(cv); + PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix; } PERL_STATIC_INLINE void -S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) +Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) { PERL_ARGS_ASSERT_CX_PUSHEVAL; + cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix; + PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack; cx->blk_eval.retop = retop; cx->blk_eval.old_namesv = namesv; cx->blk_eval.old_eval_root = PL_eval_root; @@ -1758,7 +2284,7 @@ S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv) PERL_STATIC_INLINE void -S_cx_popeval(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx) { SV *sv; @@ -1779,6 +2305,7 @@ S_cx_popeval(pTHX_ PERL_CONTEXT *cx) cx->blk_eval.old_namesv = NULL; SvREFCNT_dec_NN(sv); } + PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix; } @@ -1790,7 +2317,7 @@ S_cx_popeval(pTHX_ PERL_CONTEXT *cx) */ PERL_STATIC_INLINE void -S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) +Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN; cx->blk_loop.my_op = cLOOP; @@ -1802,7 +2329,7 @@ S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx) */ PERL_STATIC_INLINE void -S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) +Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) { PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR; @@ -1820,7 +2347,7 @@ S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave) /* pop all loop types, including plain */ PERL_STATIC_INLINE void -S_cx_poploop(pTHX_ PERL_CONTEXT *cx) +Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_POPLOOP; @@ -1853,7 +2380,7 @@ S_cx_poploop(pTHX_ PERL_CONTEXT *cx) PERL_STATIC_INLINE void -S_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) +Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_PUSHWHEN; @@ -1862,7 +2389,7 @@ S_cx_pushwhen(pTHX_ PERL_CONTEXT *cx) PERL_STATIC_INLINE void -S_cx_popwhen(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx) { PERL_ARGS_ASSERT_CX_POPWHEN; assert(CxTYPE(cx) == CXt_WHEN); @@ -1874,7 +2401,7 @@ S_cx_popwhen(pTHX_ PERL_CONTEXT *cx) PERL_STATIC_INLINE void -S_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) +Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) { PERL_ARGS_ASSERT_CX_PUSHGIVEN; @@ -1884,7 +2411,7 @@ S_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv) PERL_STATIC_INLINE void -S_cx_popgiven(pTHX_ PERL_CONTEXT *cx) +Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx) { SV *sv; @@ -1900,7 +2427,7 @@ S_cx_popgiven(pTHX_ PERL_CONTEXT *cx) /* ------------------ util.h ------------------------------------------- */ /* -=head1 Miscellaneous Functions +=for apidoc_section $string =for apidoc foldEQ @@ -1934,10 +2461,10 @@ Perl_foldEQ(const char *s1, const char *s2, I32 len) PERL_STATIC_INLINE I32 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) { - /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on - * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor - * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor - * does it check that the strings each have at least 'len' characters */ + /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds + * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and + * does not check for this. Nor does it check that the strings each have + * at least 'len' characters. */ const U8 *a = (const U8 *)s1; const U8 *b = (const U8 *)s2; @@ -1956,6 +2483,7 @@ Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) } /* +=for apidoc_section $locale =for apidoc foldEQ_locale Returns true if the leading C bytes of the strings C and C are the @@ -1967,7 +2495,6 @@ same case-insensitively in the current locale; false otherwise. PERL_STATIC_INLINE I32 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) { - dVAR; const U8 *a = (const U8 *)s1; const U8 *b = (const U8 *)s2; @@ -1983,6 +2510,37 @@ Perl_foldEQ_locale(const char *s1, const char *s2, I32 len) return 1; } +/* +=for apidoc_section $string +=for apidoc my_strnlen + +The C library C if available, or a Perl implementation of it. + +C computes the length of the string, up to C +characters. It will never attempt to address more than C +characters, making it suitable for use with strings that are not +guaranteed to be NUL-terminated. + +=cut + +Description stolen from http://man.openbsd.org/strnlen.3, +implementation stolen from PostgreSQL. +*/ +#ifndef HAS_STRNLEN + +PERL_STATIC_INLINE Size_t +Perl_my_strnlen(const char *str, Size_t maxlen) +{ + const char *end = (char *) memchr(str, '\0', maxlen); + + PERL_ARGS_ASSERT_MY_STRNLEN; + + if (end == NULL) return maxlen; + return end - str; +} + +#endif + #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) PERL_STATIC_INLINE void * @@ -2006,6 +2564,59 @@ S_my_memrchr(const char * s, const char c, const STRLEN len) #endif +PERL_STATIC_INLINE char * +Perl_mortal_getenv(const char * str) +{ + /* This implements a (mostly) thread-safe, sequential-call-safe getenv(). + * + * It's (mostly) thread-safe because it uses a mutex to prevent + * simultaneous access from other threads that use the same mutex, and + * makes a copy of the result before releasing that mutex. All of the Perl + * core uses that mutex, but, like all mutexes, everything has to cooperate + * for it to completely work. It is possible for code from, say XS, to not + * use this mutex, defeating the safety. + * + * On some platforms, getenv() is not sequential-call-safe, because + * subsequent calls destroy the static storage inside the C library + * returned by an earlier call. The result must be copied or completely + * acted upon before a subsequent getenv call. Those calls could come from + * another thread. Again, making a copy while controlling the mutex + * prevents these problems.. + * + * To prevent leaks, the copy is made by creating a new SV containing it, + * mortalizing the SV, and returning the SV's string (the copy). Thus this + * is a drop-in replacement for getenv(). + * + * A complication is that this can be called during phases where the + * mortalization process isn't available. These are in interpreter + * destruction or early in construction. khw believes that at these times + * there shouldn't be anything else going on, so plain getenv is safe AS + * LONG AS the caller acts on the return before calling it again. */ + + char * ret; + dTHX; + + PERL_ARGS_ASSERT_MORTAL_GETENV; + + /* Can't mortalize without stacks. khw believes that no other threads + * should be running, so no need to lock things, and this may be during a + * phase when locking isn't even available */ + if (UNLIKELY(PL_scopestack_ix == 0)) { + return getenv(str); + } + + ENV_LOCK; + + ret = getenv(str); + + if (ret != NULL) { + ret = SvPVX(sv_2mortal(newSVpv(ret, 0))); + } + + ENV_UNLOCK; + return ret; +} + /* * ex: set ts=8 sts=4 sw=4 et: */