3 * Copyright (C) 2012 by Larry Wall and others
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
8 * This file contains tables and code adapted from
9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
37 * Each section names the header file that the functions "belong" to.
40 /* ------------------------------- av.h ------------------------------- */
43 =for apidoc_section $AV
45 Returns the number of elements in the array C<av>. This is the true length of
46 the array, including any undefined elements. It is always the same as
47 S<C<av_top_index(av) + 1>>.
51 PERL_STATIC_INLINE Size_t
52 Perl_av_count(pTHX_ AV *av)
54 PERL_ARGS_ASSERT_AV_COUNT;
55 assert(SvTYPE(av) == SVt_PVAV);
57 return AvFILL(av) + 1;
60 /* ------------------------------- av.c ------------------------------- */
63 =for apidoc av_store_simple
65 This is a cut-down version of av_store that assumes that the array is
66 very straightforward - no magic, not readonly, and AvREAL - and that
67 C<key> is not negative. This function MUST NOT be used in situations
68 where any of those assumptions may not hold.
70 Stores an SV in an array. The array index is specified as C<key>. It
71 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
73 Note that the caller is responsible for suitably incrementing the reference
74 count of C<val> before the call.
76 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81 PERL_STATIC_INLINE SV**
82 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
86 PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
87 assert(SvTYPE(av) == SVt_PVAV);
88 assert(!SvMAGICAL(av));
89 assert(!SvREADONLY(av));
95 if (AvFILLp(av) < key) {
96 if (key > AvMAX(av)) {
102 SvREFCNT_dec(ary[key]);
109 =for apidoc av_fetch_simple
111 This is a cut-down version of av_fetch that assumes that the array is
112 very straightforward - no magic, not readonly, and AvREAL - and that
113 C<key> is not negative. This function MUST NOT be used in situations
114 where any of those assumptions may not hold.
116 Returns the SV at the specified index in the array. The C<key> is the
117 index. If lval is true, you are guaranteed to get a real SV back (in case
118 it wasn't real before), which you can then modify. Check that the return
119 value is non-null before dereferencing it to a C<SV*>.
121 The rough perl equivalent is C<$myarray[$key]>.
126 PERL_STATIC_INLINE SV**
127 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
129 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
130 assert(SvTYPE(av) == SVt_PVAV);
131 assert(!SvMAGICAL(av));
132 assert(!SvREADONLY(av));
136 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
137 return lval ? av_store_simple(av,key,newSV(0)) : NULL;
139 return &AvARRAY(av)[key];
143 /* ------------------------------- cv.h ------------------------------- */
146 =for apidoc_section $CV
148 Returns the GV associated with the CV C<sv>, reifying it if necessary.
152 PERL_STATIC_INLINE GV *
153 Perl_CvGV(pTHX_ CV *sv)
155 PERL_ARGS_ASSERT_CVGV;
158 ? Perl_cvgv_from_hek(aTHX_ sv)
159 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
162 PERL_STATIC_INLINE I32 *
163 Perl_CvDEPTH(const CV * const sv)
165 PERL_ARGS_ASSERT_CVDEPTH;
166 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
168 return &((XPVCV*)SvANY(sv))->xcv_depth;
172 CvPROTO returns the prototype as stored, which is not necessarily what
173 the interpreter should be using. Specifically, the interpreter assumes
174 that spaces have been stripped, which has been the case if the prototype
175 was added by toke.c, but is generally not the case if it was added elsewhere.
176 Since we can't enforce the spacelessness at assignment time, this routine
177 provides a temporary copy at parse time with spaces removed.
178 I<orig> is the start of the original buffer, I<len> is the length of the
179 prototype and will be updated when this returns.
183 PERL_STATIC_INLINE char *
184 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
188 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
196 *len = tmps - SvPVX(tmpsv);
201 /* ------------------------------- mg.h ------------------------------- */
203 #if defined(PERL_CORE) || defined(PERL_EXT)
204 /* assumes get-magic and stringification have already occurred */
205 PERL_STATIC_INLINE STRLEN
206 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
208 assert(mg->mg_type == PERL_MAGIC_regex_global);
209 assert(mg->mg_len != -1);
210 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
211 return (STRLEN)mg->mg_len;
213 const STRLEN pos = (STRLEN)mg->mg_len;
214 /* Without this check, we may read past the end of the buffer: */
215 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
216 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
221 /* ------------------------------- pad.h ------------------------------ */
223 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
224 PERL_STATIC_INLINE bool
225 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
227 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
229 /* is seq within the range _LOW to _HIGH ?
230 * This is complicated by the fact that PL_cop_seqmax
231 * may have wrapped around at some point */
232 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
233 return FALSE; /* not yet introduced */
235 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
236 /* in compiling scope */
238 (seq > COP_SEQ_RANGE_LOW(pn))
239 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
240 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
245 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
247 ( seq > COP_SEQ_RANGE_LOW(pn)
248 || seq <= COP_SEQ_RANGE_HIGH(pn))
250 : ( seq > COP_SEQ_RANGE_LOW(pn)
251 && seq <= COP_SEQ_RANGE_HIGH(pn))
258 /* ------------------------------- pp.h ------------------------------- */
260 PERL_STATIC_INLINE I32
263 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
264 "MARK top %p %" IVdf "\n",
266 (IV)*PL_markstack_ptr)));
267 return *PL_markstack_ptr;
270 PERL_STATIC_INLINE I32
273 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
274 "MARK pop %p %" IVdf "\n",
275 (PL_markstack_ptr-1),
276 (IV)*(PL_markstack_ptr-1))));
277 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
278 return *PL_markstack_ptr--;
281 /* ----------------------------- regexp.h ----------------------------- */
283 PERL_STATIC_INLINE struct regexp *
284 Perl_ReANY(const REGEXP * const re)
286 XPV* const p = (XPV*)SvANY(re);
288 PERL_ARGS_ASSERT_REANY;
289 assert(isREGEXP(re));
291 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
292 : (struct regexp *)p;
295 /* ------------------------------- sv.h ------------------------------- */
297 PERL_STATIC_INLINE bool
298 Perl_SvTRUE(pTHX_ SV *sv)
300 PERL_ARGS_ASSERT_SVTRUE;
302 if (UNLIKELY(sv == NULL))
305 return SvTRUE_nomg_NN(sv);
308 PERL_STATIC_INLINE bool
309 Perl_SvTRUE_nomg(pTHX_ SV *sv)
311 PERL_ARGS_ASSERT_SVTRUE_NOMG;
313 if (UNLIKELY(sv == NULL))
315 return SvTRUE_nomg_NN(sv);
318 PERL_STATIC_INLINE bool
319 Perl_SvTRUE_NN(pTHX_ SV *sv)
321 PERL_ARGS_ASSERT_SVTRUE_NN;
324 return SvTRUE_nomg_NN(sv);
327 PERL_STATIC_INLINE bool
328 Perl_SvTRUE_common(pTHX_ SV * sv, const bool sv_2bool_is_fallback)
330 PERL_ARGS_ASSERT_SVTRUE_COMMON;
332 if (UNLIKELY(SvIMMORTAL_INTERP(sv)))
333 return SvIMMORTAL_TRUE(sv);
339 return SvPVXtrue(sv);
342 return SvIVX(sv) != 0; /* casts to bool */
344 if (SvROK(sv) && !(SvOBJECT(SvRV(sv)) && HvAMAGIC(SvSTASH(SvRV(sv)))))
347 if (sv_2bool_is_fallback)
348 return sv_2bool_nomg(sv);
350 return isGV_with_GP(sv);
354 PERL_STATIC_INLINE SV *
355 Perl_SvREFCNT_inc(SV *sv)
357 if (LIKELY(sv != NULL))
361 PERL_STATIC_INLINE SV *
362 Perl_SvREFCNT_inc_NN(SV *sv)
364 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
369 PERL_STATIC_INLINE void
370 Perl_SvREFCNT_inc_void(SV *sv)
372 if (LIKELY(sv != NULL))
375 PERL_STATIC_INLINE void
376 Perl_SvREFCNT_dec(pTHX_ SV *sv)
378 if (LIKELY(sv != NULL)) {
379 U32 rc = SvREFCNT(sv);
381 SvREFCNT(sv) = rc - 1;
383 Perl_sv_free2(aTHX_ sv, rc);
387 PERL_STATIC_INLINE void
388 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
390 U32 rc = SvREFCNT(sv);
392 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
395 SvREFCNT(sv) = rc - 1;
397 Perl_sv_free2(aTHX_ sv, rc);
400 PERL_STATIC_INLINE void
401 Perl_SvAMAGIC_on(SV *sv)
403 PERL_ARGS_ASSERT_SVAMAGIC_ON;
406 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
408 PERL_STATIC_INLINE void
409 Perl_SvAMAGIC_off(SV *sv)
411 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
413 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
414 HvAMAGIC_off(SvSTASH(SvRV(sv)));
417 PERL_STATIC_INLINE U32
418 Perl_SvPADSTALE_on(SV *sv)
420 assert(!(SvFLAGS(sv) & SVs_PADTMP));
421 return SvFLAGS(sv) |= SVs_PADSTALE;
423 PERL_STATIC_INLINE U32
424 Perl_SvPADSTALE_off(SV *sv)
426 assert(!(SvFLAGS(sv) & SVs_PADTMP));
427 return SvFLAGS(sv) &= ~SVs_PADSTALE;
429 #if defined(PERL_CORE) || defined (PERL_EXT)
430 PERL_STATIC_INLINE STRLEN
431 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
433 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
435 U8 *hopped = utf8_hop((U8 *)pv, pos);
436 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
437 return (STRLEN)(hopped - (U8 *)pv);
439 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
443 /* ------------------------------- utf8.h ------------------------------- */
446 =for apidoc_section $unicode
449 PERL_STATIC_INLINE void
450 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
452 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
453 * encoded string at '*dest', updating '*dest' to include it */
455 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
457 if (NATIVE_BYTE_IS_INVARIANT(byte))
460 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
461 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
466 =for apidoc valid_utf8_to_uvchr
467 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
468 known that the next character in the input UTF-8 string C<s> is well-formed
469 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
470 points, and non-Unicode code points are allowed.
476 PERL_STATIC_INLINE UV
477 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
479 const UV expectlen = UTF8SKIP(s);
480 const U8* send = s + expectlen;
483 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
489 /* An invariant is trivially returned */
490 if (expectlen == 1) {
494 /* Remove the leading bits that indicate the number of bytes, leaving just
495 * the bits that are part of the value */
496 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
498 /* Now, loop through the remaining bytes, accumulating each into the
499 * working total as we go. (I khw tried unrolling the loop for up to 4
500 * bytes, but there was no performance improvement) */
501 for (++s; s < send; s++) {
502 uv = UTF8_ACCUMULATE(uv, *s);
505 return UNI_TO_NATIVE(uv);
510 =for apidoc is_utf8_invariant_string
512 Returns TRUE if the first C<len> bytes of the string C<s> are the same
513 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
514 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
515 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
516 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
517 characters are invariant, but so also are the C1 controls.
519 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
520 use this option, that C<s> can't have embedded C<NUL> characters and has to
521 have a terminating C<NUL> byte).
524 C<L</is_utf8_string>>,
525 C<L</is_utf8_string_flags>>,
526 C<L</is_utf8_string_loc>>,
527 C<L</is_utf8_string_loc_flags>>,
528 C<L</is_utf8_string_loclen>>,
529 C<L</is_utf8_string_loclen_flags>>,
530 C<L</is_utf8_fixed_width_buf_flags>>,
531 C<L</is_utf8_fixed_width_buf_loc_flags>>,
532 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
533 C<L</is_strict_utf8_string>>,
534 C<L</is_strict_utf8_string_loc>>,
535 C<L</is_strict_utf8_string_loclen>>,
536 C<L</is_c9strict_utf8_string>>,
537 C<L</is_c9strict_utf8_string_loc>>,
539 C<L</is_c9strict_utf8_string_loclen>>.
545 #define is_utf8_invariant_string(s, len) \
546 is_utf8_invariant_string_loc(s, len, NULL)
549 =for apidoc is_utf8_invariant_string_loc
551 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
552 the first UTF-8 variant character in the C<ep> pointer; if all characters are
553 UTF-8 invariant, this function does not change the contents of C<*ep>.
559 PERL_STATIC_INLINE bool
560 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
565 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
568 len = strlen((const char *)s);
573 /* This looks like 0x010101... */
574 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
576 /* This looks like 0x808080... */
577 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
578 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
579 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
581 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
582 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
583 * optimized out completely on a 32-bit system, and its mask gets optimized out
584 * on a 64-bit system */
585 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
586 | ( PTR2nat(x) >> 1) \
588 & PERL_WORD_BOUNDARY_MASK) >> 2))))
592 /* Do the word-at-a-time iff there is at least one usable full word. That
593 * means that after advancing to a word boundary, there still is at least a
594 * full word left. The number of bytes needed to advance is 'wordsize -
595 * offset' unless offset is 0. */
596 if ((STRLEN) (send - x) >= PERL_WORDSIZE
598 /* This term is wordsize if subword; 0 if not */
599 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
602 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
605 /* Process per-byte until reach word boundary. XXX This loop could be
606 * eliminated if we knew that this platform had fast unaligned reads */
607 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
608 if (! UTF8_IS_INVARIANT(*x)) {
618 /* Here, we know we have at least one full word to process. Process
619 * per-word as long as we have at least a full word left */
621 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
623 /* Found a variant. Just return if caller doesn't want its
629 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
630 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
632 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
633 assert(*ep >= s && *ep < send);
637 # else /* If weird byte order, drop into next loop to do byte-at-a-time
646 } while (x + PERL_WORDSIZE <= send);
649 #endif /* End of ! EBCDIC */
651 /* Process per-byte */
653 if (! UTF8_IS_INVARIANT(*x)) {
669 PERL_STATIC_INLINE unsigned int
670 Perl_variant_byte_number(PERL_UINTMAX_T word)
673 /* This returns the position in a word (0..7) of the first variant byte in
674 * it. This is a helper function. Note that there are no branches */
678 /* Get just the msb bits of each byte */
679 word &= PERL_VARIANTS_WORD_MASK;
681 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
683 /* Bytes are stored like
684 * Byte8 ... Byte2 Byte1
685 * 63..56...15...8 7...0
688 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
690 * The word will look like this, with a rightmost set bit in position 's':
691 * ('x's are don't cares)
694 * x..xx10..0 Right shift (rightmost 0 is shifted off)
695 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
696 * the 1 just to their left into a 0; the remainder is
698 * 0..0011..1 The xor with the original, x..xx10..0, clears that
699 * remainder, sets the bottom to all 1
700 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
702 * Another method is to do 'word &= -word'; but it generates a compiler
703 * message on some platforms about taking the negative of an unsigned */
706 word = 1 + (word ^ (word - 1));
708 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
710 /* Bytes are stored like
711 * Byte1 Byte2 ... Byte8
712 * 63..56 55..47 ... 7...0
714 * Isolate the msb; http://codeforces.com/blog/entry/10330
716 * Only the most significant set bit matters. Or'ing word with its right
717 * shift of 1 makes that bit and the next one to its right both 1. Then
718 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
719 * msb and all to the right being 1. */
725 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
727 /* Then subtracting the right shift by 1 clears all but the left-most of
728 * the 1 bits, which is our desired result */
732 # error Unexpected byte order
735 /* Here 'word' has a single bit set: the msb of the first byte in which it
736 * is set. Calculate that position in the word. We can use this
737 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
738 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
739 * just get shifted off at compile time) */
740 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
741 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
742 | (39 << 24) | (47 << 16)
743 | (55 << 8) | (63 << 0));
744 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
746 /* Here, word contains the position 7,15,23,...,63 of that bit. Convert to
748 word = ((word + 1) >> 3) - 1;
750 # if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
752 /* And invert the result */
753 word = CHARBITS - word - 1;
757 return (unsigned int) word;
761 #if defined(PERL_CORE) || defined(PERL_EXT)
764 =for apidoc variant_under_utf8_count
766 This function looks at the sequence of bytes between C<s> and C<e>, which are
767 assumed to be encoded in ASCII/Latin1, and returns how many of them would
768 change should the string be translated into UTF-8. Due to the nature of UTF-8,
769 each of these would occupy two bytes instead of the single one in the input
770 string. Thus, this function returns the precise number of bytes the string
771 would expand by when translated to UTF-8.
773 Unlike most of the other functions that have C<utf8> in their name, the input
774 to this function is NOT a UTF-8-encoded string. The function name is slightly
775 I<odd> to emphasize this.
777 This function is internal to Perl because khw thinks that any XS code that
778 would want this is probably operating too close to the internals. Presenting a
779 valid use case could change that.
782 C<L<perlapi/is_utf8_invariant_string>>
784 C<L<perlapi/is_utf8_invariant_string_loc>>,
790 PERL_STATIC_INLINE Size_t
791 S_variant_under_utf8_count(const U8* const s, const U8* const e)
796 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
800 /* Test if the string is long enough to use word-at-a-time. (Logic is the
801 * same as for is_utf8_invariant_string()) */
802 if ((STRLEN) (e - x) >= PERL_WORDSIZE
803 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
804 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
807 /* Process per-byte until reach word boundary. XXX This loop could be
808 * eliminated if we knew that this platform had fast unaligned reads */
809 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
810 count += ! UTF8_IS_INVARIANT(*x++);
813 /* Process per-word as long as we have at least a full word left */
814 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
815 explanation of how this works */
816 PERL_UINTMAX_T increment
817 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
818 * PERL_COUNT_MULTIPLIER)
819 >> ((PERL_WORDSIZE - 1) * CHARBITS);
820 count += (Size_t) increment;
822 } while (x + PERL_WORDSIZE <= e);
827 /* Process per-byte */
829 if (! UTF8_IS_INVARIANT(*x)) {
841 #ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
842 # undef PERL_WORDSIZE
843 # undef PERL_COUNT_MULTIPLIER
844 # undef PERL_WORD_BOUNDARY_MASK
845 # undef PERL_VARIANTS_WORD_MASK
849 =for apidoc is_utf8_string
851 Returns TRUE if the first C<len> bytes of string C<s> form a valid
852 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
853 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
854 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
855 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
857 This function considers Perl's extended UTF-8 to be valid. That means that
858 code points above Unicode, surrogates, and non-character code points are
859 considered valid by this function. Use C<L</is_strict_utf8_string>>,
860 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
861 code points are considered valid.
864 C<L</is_utf8_invariant_string>>,
865 C<L</is_utf8_invariant_string_loc>>,
866 C<L</is_utf8_string_loc>>,
867 C<L</is_utf8_string_loclen>>,
868 C<L</is_utf8_fixed_width_buf_flags>>,
869 C<L</is_utf8_fixed_width_buf_loc_flags>>,
870 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
875 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
877 #if defined(PERL_CORE) || defined (PERL_EXT)
880 =for apidoc is_utf8_non_invariant_string
882 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
883 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
884 UTF-8; otherwise returns FALSE.
886 A TRUE return means that at least one code point represented by the sequence
887 either is a wide character not representable as a single byte, or the
888 representation differs depending on whether the sequence is encoded in UTF-8 or
892 C<L<perlapi/is_utf8_invariant_string>>,
893 C<L<perlapi/is_utf8_string>>
897 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
898 It generally needn't be if its string is entirely UTF-8 invariant, and it
899 shouldn't be if it otherwise contains invalid UTF-8.
901 It is an internal function because khw thinks that XS code shouldn't be working
902 at this low a level. A valid use case could change that.
906 PERL_STATIC_INLINE bool
907 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
909 const U8 * first_variant;
911 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
913 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
917 return is_utf8_string(first_variant, len - (first_variant - s));
923 =for apidoc is_strict_utf8_string
925 Returns TRUE if the first C<len> bytes of string C<s> form a valid
926 UTF-8-encoded string that is fully interchangeable by any application using
927 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
928 calculated using C<strlen(s)> (which means if you use this option, that C<s>
929 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
930 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
932 This function returns FALSE for strings containing any
933 code points above the Unicode max of 0x10FFFF, surrogate code points, or
934 non-character code points.
937 C<L</is_utf8_invariant_string>>,
938 C<L</is_utf8_invariant_string_loc>>,
939 C<L</is_utf8_string>>,
940 C<L</is_utf8_string_flags>>,
941 C<L</is_utf8_string_loc>>,
942 C<L</is_utf8_string_loc_flags>>,
943 C<L</is_utf8_string_loclen>>,
944 C<L</is_utf8_string_loclen_flags>>,
945 C<L</is_utf8_fixed_width_buf_flags>>,
946 C<L</is_utf8_fixed_width_buf_loc_flags>>,
947 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
948 C<L</is_strict_utf8_string_loc>>,
949 C<L</is_strict_utf8_string_loclen>>,
950 C<L</is_c9strict_utf8_string>>,
951 C<L</is_c9strict_utf8_string_loc>>,
953 C<L</is_c9strict_utf8_string_loclen>>.
958 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
961 =for apidoc is_c9strict_utf8_string
963 Returns TRUE if the first C<len> bytes of string C<s> form a valid
964 UTF-8-encoded string that conforms to
965 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
966 otherwise it returns FALSE. If C<len> is 0, it will be calculated using
967 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
968 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
969 characters being ASCII constitute 'a valid UTF-8 string'.
971 This function returns FALSE for strings containing any code points above the
972 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
974 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
977 C<L</is_utf8_invariant_string>>,
978 C<L</is_utf8_invariant_string_loc>>,
979 C<L</is_utf8_string>>,
980 C<L</is_utf8_string_flags>>,
981 C<L</is_utf8_string_loc>>,
982 C<L</is_utf8_string_loc_flags>>,
983 C<L</is_utf8_string_loclen>>,
984 C<L</is_utf8_string_loclen_flags>>,
985 C<L</is_utf8_fixed_width_buf_flags>>,
986 C<L</is_utf8_fixed_width_buf_loc_flags>>,
987 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
988 C<L</is_strict_utf8_string>>,
989 C<L</is_strict_utf8_string_loc>>,
990 C<L</is_strict_utf8_string_loclen>>,
991 C<L</is_c9strict_utf8_string_loc>>,
993 C<L</is_c9strict_utf8_string_loclen>>.
998 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1001 =for apidoc is_utf8_string_flags
1003 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1004 UTF-8 string, subject to the restrictions imposed by C<flags>;
1005 returns FALSE otherwise. If C<len> is 0, it will be calculated
1006 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1007 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
1008 that all characters being ASCII constitute 'a valid UTF-8 string'.
1010 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1011 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1012 as C<L</is_strict_utf8_string>>; and if C<flags> is
1013 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1014 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
1015 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1016 C<L</utf8n_to_uvchr>>, with the same meanings.
1019 C<L</is_utf8_invariant_string>>,
1020 C<L</is_utf8_invariant_string_loc>>,
1021 C<L</is_utf8_string>>,
1022 C<L</is_utf8_string_loc>>,
1023 C<L</is_utf8_string_loc_flags>>,
1024 C<L</is_utf8_string_loclen>>,
1025 C<L</is_utf8_string_loclen_flags>>,
1026 C<L</is_utf8_fixed_width_buf_flags>>,
1027 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1028 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1029 C<L</is_strict_utf8_string>>,
1030 C<L</is_strict_utf8_string_loc>>,
1031 C<L</is_strict_utf8_string_loclen>>,
1032 C<L</is_c9strict_utf8_string>>,
1033 C<L</is_c9strict_utf8_string_loc>>,
1035 C<L</is_c9strict_utf8_string_loclen>>.
1040 PERL_STATIC_INLINE bool
1041 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1043 const U8 * first_variant;
1045 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1046 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1047 |UTF8_DISALLOW_PERL_EXTENDED)));
1050 len = strlen((const char *)s);
1054 return is_utf8_string(s, len);
1057 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1058 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1060 return is_strict_utf8_string(s, len);
1063 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1064 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1066 return is_c9strict_utf8_string(s, len);
1069 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1070 const U8* const send = s + len;
1071 const U8* x = first_variant;
1074 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1075 if (UNLIKELY(! cur_len)) {
1087 =for apidoc is_utf8_string_loc
1089 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1090 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1091 "utf8ness success") in the C<ep> pointer.
1093 See also C<L</is_utf8_string_loclen>>.
1098 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
1102 =for apidoc is_utf8_string_loclen
1104 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1105 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1106 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1107 encoded characters in the C<el> pointer.
1109 See also C<L</is_utf8_string_loc>>.
1114 PERL_STATIC_INLINE bool
1115 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1117 const U8 * first_variant;
1119 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1122 len = strlen((const char *) s);
1125 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1137 const U8* const send = s + len;
1138 const U8* x = first_variant;
1139 STRLEN outlen = first_variant - s;
1142 const STRLEN cur_len = isUTF8_CHAR(x, send);
1143 if (UNLIKELY(! cur_len)) {
1163 =for apidoc isUTF8_CHAR
1165 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1166 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1167 that represents some code point; otherwise it evaluates to 0. If non-zero, the
1168 value gives how many bytes starting at C<s> comprise the code point's
1169 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1170 form the first code point in C<s>, are not examined.
1172 The code point can be any that will fit in an IV on this machine, using Perl's
1173 extension to official UTF-8 to represent those higher than the Unicode maximum
1174 of 0x10FFFF. That means that this macro is used to efficiently decide if the
1175 next few bytes in C<s> is legal UTF-8 for a single character.
1177 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1178 defined by Unicode to be fully interchangeable across applications;
1179 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1180 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1181 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1183 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1184 C<L</is_utf8_string_loclen>> to check entire strings.
1186 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1187 machines) is a valid UTF-8 character.
1191 This uses an adaptation of the table and algorithm given in
1192 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1193 documentation of the original version. A copyright notice for the original
1194 version is given at the beginning of this file. The Perl adapation is
1195 documented at the definition of PL_extended_utf8_dfa_tab[].
1199 PERL_STATIC_INLINE Size_t
1200 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1205 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1207 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1208 * code point, which can be returned immediately. Otherwise, it is either
1209 * malformed, or for the start byte FF which the dfa doesn't handle (except
1210 * on 32-bit ASCII platforms where it trivially is an error). Call a
1211 * helper function for the other platforms. */
1214 state = PL_extended_utf8_dfa_tab[ 256
1216 + PL_extended_utf8_dfa_tab[*s]];
1223 if (UNLIKELY(state == 1)) {
1228 #if defined(UV_IS_QUAD) || defined(EBCDIC)
1230 if (e - s0 >= UTF8_MAXBYTES && NATIVE_UTF8_TO_I8(*s0) == 0xFF) {
1231 return is_utf8_char_helper(s0, e, 0);
1241 =for apidoc isSTRICT_UTF8_CHAR
1243 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1244 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1245 Unicode code point completely acceptable for open interchange between all
1246 applications; otherwise it evaluates to 0. If non-zero, the value gives how
1247 many bytes starting at C<s> comprise the code point's representation. Any
1248 bytes remaining before C<e>, but beyond the ones needed to form the first code
1249 point in C<s>, are not examined.
1251 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1252 be a surrogate nor a non-character code point. Thus this excludes any code
1253 point from Perl's extended UTF-8.
1255 This is used to efficiently decide if the next few bytes in C<s> is
1256 legal Unicode-acceptable UTF-8 for a single character.
1258 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1259 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1260 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1261 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1263 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1264 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1268 This uses an adaptation of the tables and algorithm given in
1269 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1270 documentation of the original version. A copyright notice for the original
1271 version is given at the beginning of this file. The Perl adapation is
1272 documented at the definition of strict_extended_utf8_dfa_tab[].
1276 PERL_STATIC_INLINE Size_t
1277 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1282 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1285 state = PL_strict_utf8_dfa_tab[ 256
1287 + PL_strict_utf8_dfa_tab[*s]];
1294 if (UNLIKELY(state == 1)) {
1301 /* The dfa above drops out for certain Hanguls; handle them specially */
1302 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1313 =for apidoc isC9_STRICT_UTF8_CHAR
1315 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1316 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1317 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1318 the value gives how many bytes starting at C<s> comprise the code point's
1319 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1320 form the first code point in C<s>, are not examined.
1322 The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1323 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1324 code points. This corresponds to
1325 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1326 which said that non-character code points are merely discouraged rather than
1327 completely forbidden in open interchange. See
1328 L<perlunicode/Noncharacter code points>.
1330 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1331 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1333 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1334 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1338 This uses an adaptation of the tables and algorithm given in
1339 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1340 documentation of the original version. A copyright notice for the original
1341 version is given at the beginning of this file. The Perl adapation is
1342 documented at the definition of PL_c9_utf8_dfa_tab[].
1346 PERL_STATIC_INLINE Size_t
1347 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1352 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1355 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
1362 if (UNLIKELY(state == 1)) {
1372 =for apidoc is_strict_utf8_string_loc
1374 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1375 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1376 "utf8ness success") in the C<ep> pointer.
1378 See also C<L</is_strict_utf8_string_loclen>>.
1383 #define is_strict_utf8_string_loc(s, len, ep) \
1384 is_strict_utf8_string_loclen(s, len, ep, 0)
1388 =for apidoc is_strict_utf8_string_loclen
1390 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1391 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1392 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1393 encoded characters in the C<el> pointer.
1395 See also C<L</is_strict_utf8_string_loc>>.
1400 PERL_STATIC_INLINE bool
1401 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1403 const U8 * first_variant;
1405 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1408 len = strlen((const char *) s);
1411 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1423 const U8* const send = s + len;
1424 const U8* x = first_variant;
1425 STRLEN outlen = first_variant - s;
1428 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1429 if (UNLIKELY(! cur_len)) {
1449 =for apidoc is_c9strict_utf8_string_loc
1451 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1452 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1453 "utf8ness success") in the C<ep> pointer.
1455 See also C<L</is_c9strict_utf8_string_loclen>>.
1460 #define is_c9strict_utf8_string_loc(s, len, ep) \
1461 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1465 =for apidoc is_c9strict_utf8_string_loclen
1467 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1468 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1469 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1470 characters in the C<el> pointer.
1472 See also C<L</is_c9strict_utf8_string_loc>>.
1477 PERL_STATIC_INLINE bool
1478 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1480 const U8 * first_variant;
1482 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1485 len = strlen((const char *) s);
1488 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1500 const U8* const send = s + len;
1501 const U8* x = first_variant;
1502 STRLEN outlen = first_variant - s;
1505 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1506 if (UNLIKELY(! cur_len)) {
1526 =for apidoc is_utf8_string_loc_flags
1528 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1529 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1530 "utf8ness success") in the C<ep> pointer.
1532 See also C<L</is_utf8_string_loclen_flags>>.
1537 #define is_utf8_string_loc_flags(s, len, ep, flags) \
1538 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1541 /* The above 3 actual functions could have been moved into the more general one
1542 * just below, and made #defines that call it with the right 'flags'. They are
1543 * currently kept separate to increase their chances of getting inlined */
1547 =for apidoc is_utf8_string_loclen_flags
1549 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1550 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1551 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1552 encoded characters in the C<el> pointer.
1554 See also C<L</is_utf8_string_loc_flags>>.
1559 PERL_STATIC_INLINE bool
1560 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1562 const U8 * first_variant;
1564 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1565 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1566 |UTF8_DISALLOW_PERL_EXTENDED)));
1569 len = strlen((const char *) s);
1573 return is_utf8_string_loclen(s, len, ep, el);
1576 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1577 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1579 return is_strict_utf8_string_loclen(s, len, ep, el);
1582 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1583 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1585 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1588 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1600 const U8* send = s + len;
1601 const U8* x = first_variant;
1602 STRLEN outlen = first_variant - s;
1605 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1606 if (UNLIKELY(! cur_len)) {
1625 =for apidoc utf8_distance
1627 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1630 WARNING: use only if you *know* that the pointers point inside the
1636 PERL_STATIC_INLINE IV
1637 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1639 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1641 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1645 =for apidoc utf8_hop
1647 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1648 forward or backward.
1650 WARNING: do not use the following unless you *know* C<off> is within
1651 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1652 on the first byte of character or just after the last byte of a character.
1657 PERL_STATIC_INLINE U8 *
1658 Perl_utf8_hop(const U8 *s, SSize_t off)
1660 PERL_ARGS_ASSERT_UTF8_HOP;
1662 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1663 * the bitops (especially ~) can create illegal UTF-8.
1664 * In other words: in Perl UTF-8 is not just for Unicode. */
1673 while (UTF8_IS_CONTINUATION(*s))
1677 GCC_DIAG_IGNORE(-Wcast-qual)
1683 =for apidoc utf8_hop_forward
1685 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1688 C<off> must be non-negative.
1690 C<s> must be before or equal to C<end>.
1692 When moving forward it will not move beyond C<end>.
1694 Will not exceed this limit even if the string is not valid "UTF-8".
1699 PERL_STATIC_INLINE U8 *
1700 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1702 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1704 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1705 * the bitops (especially ~) can create illegal UTF-8.
1706 * In other words: in Perl UTF-8 is not just for Unicode. */
1712 STRLEN skip = UTF8SKIP(s);
1713 if ((STRLEN)(end - s) <= skip) {
1714 GCC_DIAG_IGNORE(-Wcast-qual)
1721 GCC_DIAG_IGNORE(-Wcast-qual)
1727 =for apidoc utf8_hop_back
1729 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1732 C<off> must be non-positive.
1734 C<s> must be after or equal to C<start>.
1736 When moving backward it will not move before C<start>.
1738 Will not exceed this limit even if the string is not valid "UTF-8".
1743 PERL_STATIC_INLINE U8 *
1744 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1746 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1748 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1749 * the bitops (especially ~) can create illegal UTF-8.
1750 * In other words: in Perl UTF-8 is not just for Unicode. */
1755 while (off++ && s > start) {
1758 } while (UTF8_IS_CONTINUATION(*s) && s > start);
1761 GCC_DIAG_IGNORE(-Wcast-qual)
1767 =for apidoc utf8_hop_safe
1769 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1770 either forward or backward.
1772 When moving backward it will not move before C<start>.
1774 When moving forward it will not move beyond C<end>.
1776 Will not exceed those limits even if the string is not valid "UTF-8".
1781 PERL_STATIC_INLINE U8 *
1782 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1784 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1786 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1787 * the bitops (especially ~) can create illegal UTF-8.
1788 * In other words: in Perl UTF-8 is not just for Unicode. */
1790 assert(start <= s && s <= end);
1793 return utf8_hop_forward(s, off, end);
1796 return utf8_hop_back(s, off, start);
1802 =for apidoc is_utf8_valid_partial_char
1804 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1805 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1806 points. Otherwise, it returns 1 if there exists at least one non-empty
1807 sequence of bytes that when appended to sequence C<s>, starting at position
1808 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1809 otherwise returns 0.
1811 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1814 This is useful when a fixed-length buffer is being tested for being well-formed
1815 UTF-8, but the final few bytes in it don't comprise a full character; that is,
1816 it is split somewhere in the middle of the final code point's UTF-8
1817 representation. (Presumably when the buffer is refreshed with the next chunk
1818 of data, the new first bytes will complete the partial code point.) This
1819 function is used to verify that the final bytes in the current buffer are in
1820 fact the legal beginning of some code point, so that if they aren't, the
1821 failure can be signalled without having to wait for the next read.
1825 #define is_utf8_valid_partial_char(s, e) \
1826 is_utf8_valid_partial_char_flags(s, e, 0)
1830 =for apidoc is_utf8_valid_partial_char_flags
1832 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1833 or not the input is a valid UTF-8 encoded partial character, but it takes an
1834 extra parameter, C<flags>, which can further restrict which code points are
1837 If C<flags> is 0, this behaves identically to
1838 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1839 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1840 there is any sequence of bytes that can complete the input partial character in
1841 such a way that a non-prohibited character is formed, the function returns
1842 TRUE; otherwise FALSE. Non character code points cannot be determined based on
1843 partial character input. But many of the other possible excluded types can be
1844 determined from just the first one or two bytes.
1849 PERL_STATIC_INLINE bool
1850 Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1852 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
1854 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1855 |UTF8_DISALLOW_PERL_EXTENDED)));
1857 if (s >= e || s + UTF8SKIP(s) <= e) {
1861 return cBOOL(is_utf8_char_helper(s, e, flags));
1866 =for apidoc is_utf8_fixed_width_buf_flags
1868 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1869 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1870 otherwise it returns FALSE.
1872 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1873 without restriction. If the final few bytes of the buffer do not form a
1874 complete code point, this will return TRUE anyway, provided that
1875 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1877 If C<flags> in non-zero, it can be any combination of the
1878 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1881 This function differs from C<L</is_utf8_string_flags>> only in that the latter
1882 returns FALSE if the final few bytes of the string don't form a complete code
1887 #define is_utf8_fixed_width_buf_flags(s, len, flags) \
1888 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1892 =for apidoc is_utf8_fixed_width_buf_loc_flags
1894 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1895 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1896 to the beginning of any partial character at the end of the buffer; if there is
1897 no partial character C<*ep> will contain C<s>+C<len>.
1899 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1904 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1905 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1909 =for apidoc is_utf8_fixed_width_buf_loclen_flags
1911 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1912 complete, valid characters found in the C<el> pointer.
1917 PERL_STATIC_INLINE bool
1918 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1924 const U8 * maybe_partial;
1926 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1929 ep = &maybe_partial;
1932 /* If it's entirely valid, return that; otherwise see if the only error is
1933 * that the final few bytes are for a partial character */
1934 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1935 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1938 PERL_STATIC_INLINE UV
1939 Perl_utf8n_to_uvchr_msgs(const U8 *s,
1946 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1947 * simple cases, and, if necessary calls a helper function to deal with the
1948 * more complex ones. Almost all well-formed non-problematic code points
1949 * are considered simple, so that it's unlikely that the helper function
1950 * will need to be called.
1952 * This is an adaptation of the tables and algorithm given in
1953 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1954 * comprehensive documentation of the original version. A copyright notice
1955 * for the original version is given at the beginning of this file. The
1956 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
1959 const U8 * const s0 = s;
1960 const U8 * send = s0 + curlen;
1961 UV uv = 0; /* The 0 silences some stupid compilers */
1964 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1966 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1967 * non-problematic code point, which can be returned immediately.
1968 * Otherwise we call a helper function to figure out the more complicated
1971 while (s < send && LIKELY(state != 1)) {
1972 UV type = PL_strict_utf8_dfa_tab[*s];
1975 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1976 : UTF8_ACCUMULATE(uv, *s);
1977 state = PL_strict_utf8_dfa_tab[256 + state + type];
1985 *retlen = s - s0 + 1;
1994 return UNI_TO_NATIVE(uv);
1997 /* Here is potentially problematic. Use the full mechanism */
1998 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
2001 PERL_STATIC_INLINE UV
2002 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2004 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2008 if (! ckWARN_d(WARN_UTF8)) {
2010 /* EMPTY is not really allowed, and asserts on debugging builds. But
2011 * on non-debugging we have to deal with it, and this causes it to
2012 * return the REPLACEMENT CHARACTER, as the documentation indicates */
2013 return utf8n_to_uvchr(s, send - s, retlen,
2014 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2017 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2018 if (retlen && ret == 0 && *s != '\0') {
2019 *retlen = (STRLEN) -1;
2026 /* ------------------------------- perl.h ----------------------------- */
2029 =for apidoc_section $utility
2031 =for apidoc is_safe_syscall
2033 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2035 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2036 category, and return FALSE.
2038 Return TRUE if the name is safe.
2040 C<what> and C<op_name> are used in any warning.
2042 Used by the C<IS_SAFE_SYSCALL()> macro.
2047 PERL_STATIC_INLINE bool
2048 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2050 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2051 * perl itself uses xce*() functions which accept 8-bit strings.
2054 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2058 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2059 SETERRNO(ENOENT, LIB_INVARG);
2060 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2061 "Invalid \\0 character in %s for %s: %s\\0%s",
2062 what, op_name, pv, null_at+1);
2072 Return true if the supplied filename has a newline character
2073 immediately before the first (hopefully only) NUL.
2075 My original look at this incorrectly used the len from SvPV(), but
2076 that's incorrect, since we allow for a NUL in pv[len-1].
2078 So instead, strlen() and work from there.
2080 This allow for the user reading a filename, forgetting to chomp it,
2083 open my $foo, "$file\0";
2089 PERL_STATIC_INLINE bool
2090 S_should_warn_nl(const char *pv)
2094 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2098 return len > 0 && pv[len-1] == '\n';
2103 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2105 PERL_STATIC_INLINE bool
2106 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2108 /* This function determines if the input NV 'nv' may be converted without
2109 * loss of data to an IV. If not, it returns FALSE taking no other action.
2110 * But if it is possible, it does the conversion, returning TRUE, and
2111 * storing the converted result in '*ivp' */
2113 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2115 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2116 /* Normally any comparison with a NaN returns false; if we can't rely
2117 * on that behaviour, check explicitly */
2118 if (UNLIKELY(Perl_isnan(nv))) {
2123 /* Written this way so that with an always-false NaN comparison we
2125 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2129 if ((IV) nv != nv) {
2139 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2141 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2143 #define MAX_CHARSET_NAME_LENGTH 2
2145 PERL_STATIC_INLINE const char *
2146 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2148 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2150 /* Returns a string that corresponds to the name of the regex character set
2151 * given by 'flags', and *lenp is set the length of that string, which
2152 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2155 switch (get_regex_charset(flags)) {
2156 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2157 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2158 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2159 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2160 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2162 return ASCII_MORE_RESTRICT_PAT_MODS;
2164 /* The NOT_REACHED; hides an assert() which has a rather complex
2165 * definition in perl.h. */
2166 NOT_REACHED; /* NOTREACHED */
2167 return "?"; /* Unknown */
2174 Return false if any get magic is on the SV other than taint magic.
2178 PERL_STATIC_INLINE bool
2179 Perl_sv_only_taint_gmagic(SV *sv)
2181 MAGIC *mg = SvMAGIC(sv);
2183 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2186 if (mg->mg_type != PERL_MAGIC_taint
2187 && !(mg->mg_flags & MGf_GSKIP)
2188 && mg->mg_virtual->svt_get) {
2191 mg = mg->mg_moremagic;
2197 /* ------------------ cop.h ------------------------------------------- */
2199 /* implement GIMME_V() macro */
2201 PERL_STATIC_INLINE U8
2205 U8 gimme = (PL_op->op_flags & OPf_WANT);
2209 cxix = PL_curstackinfo->si_cxsubix;
2211 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2212 assert(cxstack[cxix].blk_gimme & G_WANT);
2213 return (cxstack[cxix].blk_gimme & G_WANT);
2217 /* Enter a block. Push a new base context and return its address. */
2219 PERL_STATIC_INLINE PERL_CONTEXT *
2220 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2224 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2229 cx->blk_gimme = gimme;
2230 cx->blk_oldsaveix = saveix;
2231 cx->blk_oldsp = (I32)(sp - PL_stack_base);
2232 cx->blk_oldcop = PL_curcop;
2233 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
2234 cx->blk_oldscopesp = PL_scopestack_ix;
2235 cx->blk_oldpm = PL_curpm;
2236 cx->blk_old_tmpsfloor = PL_tmps_floor;
2238 PL_tmps_floor = PL_tmps_ix;
2239 CX_DEBUG(cx, "PUSH");
2244 /* Exit a block (RETURN and LAST). */
2246 PERL_STATIC_INLINE void
2247 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2249 PERL_ARGS_ASSERT_CX_POPBLOCK;
2251 CX_DEBUG(cx, "POP");
2252 /* these 3 are common to cx_popblock and cx_topblock */
2253 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2254 PL_scopestack_ix = cx->blk_oldscopesp;
2255 PL_curpm = cx->blk_oldpm;
2257 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2258 * and leaves a CX entry lying around for repeated use, so
2259 * skip for multicall */ \
2260 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2261 || PL_savestack_ix == cx->blk_oldsaveix);
2262 PL_curcop = cx->blk_oldcop;
2263 PL_tmps_floor = cx->blk_old_tmpsfloor;
2266 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2267 * Whereas cx_popblock() restores the state to the point just before
2268 * cx_pushblock() was called, cx_topblock() restores it to the point just
2269 * *after* cx_pushblock() was called. */
2271 PERL_STATIC_INLINE void
2272 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2274 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2276 CX_DEBUG(cx, "TOP");
2277 /* these 3 are common to cx_popblock and cx_topblock */
2278 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2279 PL_scopestack_ix = cx->blk_oldscopesp;
2280 PL_curpm = cx->blk_oldpm;
2282 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2286 PERL_STATIC_INLINE void
2287 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2289 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2291 PERL_ARGS_ASSERT_CX_PUSHSUB;
2293 PERL_DTRACE_PROBE_ENTRY(cv);
2294 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2295 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2296 cx->blk_sub.cv = cv;
2297 cx->blk_sub.olddepth = CvDEPTH(cv);
2298 cx->blk_sub.prevcomppad = PL_comppad;
2299 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2300 cx->blk_sub.retop = retop;
2301 SvREFCNT_inc_simple_void_NN(cv);
2302 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2306 /* subsets of cx_popsub() */
2308 PERL_STATIC_INLINE void
2309 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2313 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2314 assert(CxTYPE(cx) == CXt_SUB);
2316 PL_comppad = cx->blk_sub.prevcomppad;
2317 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2318 cv = cx->blk_sub.cv;
2319 CvDEPTH(cv) = cx->blk_sub.olddepth;
2320 cx->blk_sub.cv = NULL;
2322 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2326 /* handle the @_ part of leaving a sub */
2328 PERL_STATIC_INLINE void
2329 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2333 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2334 assert(CxTYPE(cx) == CXt_SUB);
2335 assert(AvARRAY(MUTABLE_AV(
2336 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2337 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2339 CX_POP_SAVEARRAY(cx);
2340 av = MUTABLE_AV(PAD_SVl(0));
2341 if (UNLIKELY(AvREAL(av)))
2342 /* abandon @_ if it got reified */
2343 clear_defarray(av, 0);
2350 PERL_STATIC_INLINE void
2351 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2353 PERL_ARGS_ASSERT_CX_POPSUB;
2354 assert(CxTYPE(cx) == CXt_SUB);
2356 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2360 cx_popsub_common(cx);
2364 PERL_STATIC_INLINE void
2365 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2367 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2369 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2370 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2371 cx->blk_format.cv = cv;
2372 cx->blk_format.retop = retop;
2373 cx->blk_format.gv = gv;
2374 cx->blk_format.dfoutgv = PL_defoutgv;
2375 cx->blk_format.prevcomppad = PL_comppad;
2378 SvREFCNT_inc_simple_void_NN(cv);
2380 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2384 PERL_STATIC_INLINE void
2385 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2390 PERL_ARGS_ASSERT_CX_POPFORMAT;
2391 assert(CxTYPE(cx) == CXt_FORMAT);
2393 dfout = cx->blk_format.dfoutgv;
2395 cx->blk_format.dfoutgv = NULL;
2396 SvREFCNT_dec_NN(dfout);
2398 PL_comppad = cx->blk_format.prevcomppad;
2399 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2400 cv = cx->blk_format.cv;
2401 cx->blk_format.cv = NULL;
2403 SvREFCNT_dec_NN(cv);
2404 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2408 PERL_STATIC_INLINE void
2409 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2411 cx->blk_eval.retop = retop;
2412 cx->blk_eval.old_namesv = namesv;
2413 cx->blk_eval.old_eval_root = PL_eval_root;
2414 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2415 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2416 cx->blk_eval.cur_top_env = PL_top_env;
2418 assert(!(PL_in_eval & ~ 0x3F));
2419 assert(!(PL_op->op_type & ~0x1FF));
2420 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2423 PERL_STATIC_INLINE void
2424 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2426 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2428 Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2430 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2431 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2434 PERL_STATIC_INLINE void
2435 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2437 PERL_ARGS_ASSERT_CX_PUSHTRY;
2439 Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2441 /* Don't actually change it, just store the current value so it's restored
2442 * by the common popeval */
2443 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2447 PERL_STATIC_INLINE void
2448 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2452 PERL_ARGS_ASSERT_CX_POPEVAL;
2453 assert(CxTYPE(cx) == CXt_EVAL);
2455 PL_in_eval = CxOLD_IN_EVAL(cx);
2456 assert(!(PL_in_eval & 0xc0));
2457 PL_eval_root = cx->blk_eval.old_eval_root;
2458 sv = cx->blk_eval.cur_text;
2459 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2460 cx->blk_eval.cur_text = NULL;
2461 SvREFCNT_dec_NN(sv);
2464 sv = cx->blk_eval.old_namesv;
2466 cx->blk_eval.old_namesv = NULL;
2467 SvREFCNT_dec_NN(sv);
2469 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2473 /* push a plain loop, i.e.
2475 * while (cond) { block }
2476 * for (init;cond;continue) { block }
2477 * This loop can be last/redo'ed etc.
2480 PERL_STATIC_INLINE void
2481 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2483 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2484 cx->blk_loop.my_op = cLOOP;
2488 /* push a true for loop, i.e.
2489 * for var (list) { block }
2492 PERL_STATIC_INLINE void
2493 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2495 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2497 /* this one line is common with cx_pushloop_plain */
2498 cx->blk_loop.my_op = cLOOP;
2500 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2501 cx->blk_loop.itersave = itersave;
2503 cx->blk_loop.oldcomppad = PL_comppad;
2508 /* pop all loop types, including plain */
2510 PERL_STATIC_INLINE void
2511 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2513 PERL_ARGS_ASSERT_CX_POPLOOP;
2515 assert(CxTYPE_is_LOOP(cx));
2516 if ( CxTYPE(cx) == CXt_LOOP_ARY
2517 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2519 /* Free ary or cur. This assumes that state_u.ary.ary
2520 * aligns with state_u.lazysv.cur. See cx_dup() */
2521 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2522 cx->blk_loop.state_u.lazysv.cur = NULL;
2523 SvREFCNT_dec_NN(sv);
2524 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2525 sv = cx->blk_loop.state_u.lazysv.end;
2526 cx->blk_loop.state_u.lazysv.end = NULL;
2527 SvREFCNT_dec_NN(sv);
2530 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2532 SV **svp = (cx)->blk_loop.itervar_u.svp;
2533 if ((cx->cx_type & CXp_FOR_GV))
2534 svp = &GvSV((GV*)svp);
2536 *svp = cx->blk_loop.itersave;
2537 cx->blk_loop.itersave = NULL;
2538 SvREFCNT_dec(cursv);
2543 PERL_STATIC_INLINE void
2544 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2546 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2548 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2552 PERL_STATIC_INLINE void
2553 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2555 PERL_ARGS_ASSERT_CX_POPWHEN;
2556 assert(CxTYPE(cx) == CXt_WHEN);
2558 PERL_UNUSED_ARG(cx);
2559 PERL_UNUSED_CONTEXT;
2560 /* currently NOOP */
2564 PERL_STATIC_INLINE void
2565 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2567 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2569 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2570 cx->blk_givwhen.defsv_save = orig_defsv;
2574 PERL_STATIC_INLINE void
2575 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2579 PERL_ARGS_ASSERT_CX_POPGIVEN;
2580 assert(CxTYPE(cx) == CXt_GIVEN);
2582 sv = GvSV(PL_defgv);
2583 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2584 cx->blk_givwhen.defsv_save = NULL;
2588 /* ------------------ util.h ------------------------------------------- */
2591 =for apidoc_section $string
2595 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2597 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2598 match themselves and their opposite case counterparts. Non-cased and non-ASCII
2599 range bytes match only themselves.
2604 PERL_STATIC_INLINE I32
2605 Perl_foldEQ(const char *s1, const char *s2, I32 len)
2607 const U8 *a = (const U8 *)s1;
2608 const U8 *b = (const U8 *)s2;
2610 PERL_ARGS_ASSERT_FOLDEQ;
2615 if (*a != *b && *a != PL_fold[*b])
2622 PERL_STATIC_INLINE I32
2623 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2625 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2626 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2627 * does not check for this. Nor does it check that the strings each have
2628 * at least 'len' characters. */
2630 const U8 *a = (const U8 *)s1;
2631 const U8 *b = (const U8 *)s2;
2633 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2638 if (*a != *b && *a != PL_fold_latin1[*b]) {
2647 =for apidoc_section $locale
2648 =for apidoc foldEQ_locale
2650 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2651 same case-insensitively in the current locale; false otherwise.
2656 PERL_STATIC_INLINE I32
2657 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2659 const U8 *a = (const U8 *)s1;
2660 const U8 *b = (const U8 *)s2;
2662 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2667 if (*a != *b && *a != PL_fold_locale[*b])
2675 =for apidoc_section $string
2676 =for apidoc my_strnlen
2678 The C library C<strnlen> if available, or a Perl implementation of it.
2680 C<my_strnlen()> computes the length of the string, up to C<maxlen>
2681 characters. It will never attempt to address more than C<maxlen>
2682 characters, making it suitable for use with strings that are not
2683 guaranteed to be NUL-terminated.
2687 Description stolen from http://man.openbsd.org/strnlen.3,
2688 implementation stolen from PostgreSQL.
2692 PERL_STATIC_INLINE Size_t
2693 Perl_my_strnlen(const char *str, Size_t maxlen)
2695 const char *end = (char *) memchr(str, '\0', maxlen);
2697 PERL_ARGS_ASSERT_MY_STRNLEN;
2699 if (end == NULL) return maxlen;
2705 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2707 PERL_STATIC_INLINE void *
2708 S_my_memrchr(const char * s, const char c, const STRLEN len)
2710 /* memrchr(), since many platforms lack it */
2712 const char * t = s + len - 1;
2714 PERL_ARGS_ASSERT_MY_MEMRCHR;
2728 PERL_STATIC_INLINE char *
2729 Perl_mortal_getenv(const char * str)
2731 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
2733 * It's (mostly) thread-safe because it uses a mutex to prevent other
2734 * threads (that look at this mutex) from destroying the result before this
2735 * routine has a chance to copy the result to a place that won't be
2736 * destroyed before the caller gets a chance to handle it. That place is a
2737 * mortal SV. khw chose this over SAVEFREEPV because he is under the
2738 * impression that the SV will hang around longer under more circumstances
2740 * The reason it isn't completely thread-safe is that other code could
2741 * simply not pay attention to the mutex. All of the Perl core uses the
2742 * mutex, but it is possible for code from, say XS, to not use this mutex,
2743 * defeating the safety.
2745 * getenv() returns, in some implementations, a pointer to a spot in the
2746 * **environ array, which could be invalidated at any time by this or
2747 * another thread changing the environment. Other implementations copy the
2748 * **environ value to a static buffer, returning a pointer to that. That
2749 * buffer might or might not be invalidated by a getenv() call in another
2750 * thread. If it does get zapped, we need an exclusive lock. Otherwise,
2751 * many getenv() calls can safely be running simultaneously, so a
2752 * many-reader (but no simultaneous writers) lock is ok. There is a
2753 * Configure probe to see if another thread destroys the buffer, and the
2754 * mutex is defined accordingly.
2756 * But in all cases, using the mutex prevents these problems, as long as
2757 * all code uses the same mutex..
2759 * A complication is that this can be called during phases where the
2760 * mortalization process isn't available. These are in interpreter
2761 * destruction or early in construction. khw believes that at these times
2762 * there shouldn't be anything else going on, so plain getenv is safe AS
2763 * LONG AS the caller acts on the return before calling it again. */
2768 PERL_ARGS_ASSERT_MORTAL_GETENV;
2770 /* Can't mortalize without stacks. khw believes that no other threads
2771 * should be running, so no need to lock things, and this may be during a
2772 * phase when locking isn't even available */
2773 if (UNLIKELY(PL_scopestack_ix == 0)) {
2779 /* A major complication arises under PERL_MEM_LOG. When that is active,
2780 * every memory allocation may result in logging, depending on the value of
2781 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
2782 * saving ENV{foo}'s value (but before saving it), the logging code will
2783 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
2784 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
2785 * lock a boolean mutex recursively); 3) destroying the getenv() static
2786 * buffer; or 4) destroying the temporary created by this for the copy
2787 * causes a log entry to be made which could cause a new temporary to be
2788 * created, which will need to be destroyed at some point, leading to an
2791 * The solution adopted here (after some gnashing of teeth) is to detect
2792 * the recursive calls and calls from the logger, and treat them specially.
2793 * Let's say we want to do getenv("foo"). We first find
2794 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
2795 * variable, so no temporary is required. Then we do getenv(foo}, and in
2796 * the process of creating a temporary to save it, this function will be
2797 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
2798 * we detect that it is such a call and return our saved value instead of
2799 * locking and doing a new getenv(). This solves all of problems 1), 2),
2800 * and 3). Because all the getenv()s are done while the mutex is locked,
2801 * the state cannot have changed. To solve 4), we don't create a temporary
2802 * when this is called from the logging code. That code disposes of the
2803 * return value while the mutex is still locked.
2805 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
2806 * digits and 3 particular letters are significant; the rest are ignored by
2807 * the memory logging code. Thus the per-interpreter variable only needs
2808 * to be large enough to save the significant information, the size of
2809 * which is known at compile time. The first byte is extra, reserved for
2810 * flags for our use. To protect against overflowing, only the reserved
2811 * byte, as many digits as don't overflow, and the three letters are
2814 * The reserved byte has two bits:
2815 * 0x1 if set indicates that if we get here, it is a recursive call of
2817 * 0x2 if set indicates that the call is from the logging code.
2819 * If the flag indicates this is a recursive call, just return the stored
2820 * value of PL_mem_log; An empty value gets turned into NULL. */
2821 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
2822 if (PL_mem_log[1] == '\0') {
2825 return PL_mem_log + 1;
2835 /* Here we are in a critical section. As explained above, we do our own
2836 * getenv(PERL_MEM_LOG), saving the result safely. */
2837 ret = getenv("PERL_MEM_LOG");
2838 if (ret == NULL) { /* No logging active */
2840 /* Return that immediately if called from the logging code */
2841 if (PL_mem_log[0] & 0x2) {
2846 PL_mem_log[1] = '\0';
2849 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
2851 /* There is nothing to prevent the value of PERL_MEM_LOG from being an
2852 * extremely long string. But we want only a few characters from it.
2853 * PL_mem_log has been made large enough to hold just the ones we need.
2854 * First the file descriptor. */
2855 if (isDIGIT(*ret)) {
2856 const char * s = ret;
2857 if (UNLIKELY(*s == '0')) {
2859 /* Reduce multiple leading zeros to a single one. This is to
2860 * allow the caller to change what to do with leading zeros. */
2861 *mem_log_meat++ = '0';
2868 /* If the input overflows, copy just enough for the result to also
2869 * overflow, plus 1 to make sure */
2870 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
2871 *mem_log_meat++ = *s++;
2875 /* Then each of the three significant characters */
2876 if (strchr(ret, 'm')) {
2877 *mem_log_meat++ = 'm';
2879 if (strchr(ret, 's')) {
2880 *mem_log_meat++ = 's';
2882 if (strchr(ret, 't')) {
2883 *mem_log_meat++ = 't';
2885 *mem_log_meat = '\0';
2887 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
2890 /* If we are being called from the logger, it only needs the significant
2891 * portion of PERL_MEM_LOG, and doesn't need a safe copy */
2892 if (PL_mem_log[0] & 0x2) {
2893 assert(strEQ(str, "PERL_MEM_LOG"));
2895 return PL_mem_log + 1;
2898 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
2899 * is coming from other than the logging code, so it should be treated the
2900 * same as any other getenv(), returning the full value, not just the
2901 * significant part, and having its value saved. Set the flag that
2902 * indicates any call to this routine will be a recursion from here */
2903 PL_mem_log[0] = 0x1;
2907 /* Now get the value of the real desired variable, and save a copy */
2911 ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
2918 /* Clear the buffer */
2919 Zero(PL_mem_log, sizeof(PL_mem_log), char);
2927 * ex: set ts=8 sts=4 sw=4 et: