3 * Copyright (C) 2012 by Larry Wall and others
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
8 * This file contains tables and code adapted from
9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38 * whose details should be exposed to the compiler, for such things as tail
41 * Each section names the header file that the functions "belong" to.
44 /* ------------------------------- av.h ------------------------------- */
47 =for apidoc_section $AV
49 Returns the number of elements in the array C<av>. This is the true length of
50 the array, including any undefined elements. It is always the same as
51 S<C<av_top_index(av) + 1>>.
55 PERL_STATIC_INLINE Size_t
56 Perl_av_count(pTHX_ AV *av)
58 PERL_ARGS_ASSERT_AV_COUNT;
59 assert(SvTYPE(av) == SVt_PVAV);
61 return AvFILL(av) + 1;
64 /* ------------------------------- av.c ------------------------------- */
67 =for apidoc av_store_simple
69 This is a cut-down version of av_store that assumes that the array is
70 very straightforward - no magic, not readonly, and AvREAL - and that
71 C<key> is not negative. This function MUST NOT be used in situations
72 where any of those assumptions may not hold.
74 Stores an SV in an array. The array index is specified as C<key>. It
75 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
77 Note that the caller is responsible for suitably incrementing the reference
78 count of C<val> before the call.
80 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
85 PERL_STATIC_INLINE SV**
86 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
90 PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91 assert(SvTYPE(av) == SVt_PVAV);
92 assert(!SvMAGICAL(av));
93 assert(!SvREADONLY(av));
99 if (AvFILLp(av) < key) {
100 if (key > AvMAX(av)) {
106 SvREFCNT_dec(ary[key]);
113 =for apidoc av_fetch_simple
115 This is a cut-down version of av_fetch that assumes that the array is
116 very straightforward - no magic, not readonly, and AvREAL - and that
117 C<key> is not negative. This function MUST NOT be used in situations
118 where any of those assumptions may not hold.
120 Returns the SV at the specified index in the array. The C<key> is the
121 index. If lval is true, you are guaranteed to get a real SV back (in case
122 it wasn't real before), which you can then modify. Check that the return
123 value is non-null before dereferencing it to a C<SV*>.
125 The rough perl equivalent is C<$myarray[$key]>.
130 PERL_STATIC_INLINE SV**
131 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
133 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134 assert(SvTYPE(av) == SVt_PVAV);
135 assert(!SvMAGICAL(av));
136 assert(!SvREADONLY(av));
140 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
141 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
143 return &AvARRAY(av)[key];
148 =for apidoc av_push_simple
150 This is a cut-down version of av_push that assumes that the array is very
151 straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152 not less than -1. This function MUST NOT be used in situations where any
153 of those assumptions may not hold.
155 Pushes an SV (transferring control of one reference count) onto the end of the
156 array. The array will grow automatically to accommodate the addition.
158 Perl equivalent: C<push @myarray, $val;>.
163 PERL_STATIC_INLINE void
164 Perl_av_push_simple(pTHX_ AV *av, SV *val)
166 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167 assert(SvTYPE(av) == SVt_PVAV);
168 assert(!SvMAGICAL(av));
169 assert(!SvREADONLY(av));
171 assert(AvFILLp(av) > -2);
173 (void)av_store_simple(av,AvFILLp(av)+1,val);
177 =for apidoc av_new_alloc
179 This implements L<perlapi/C<newAV_alloc_x>>
180 and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
183 Creates a new AV and allocates its SV* array.
185 This is similar to, but more efficient than doing:
190 The size parameter is used to pre-allocate a SV* array large enough to
191 hold at least elements C<0..(size-1)>. C<size> must be at least 1.
193 The C<zeroflag> parameter controls whether or not the array is NULL
199 PERL_STATIC_INLINE AV *
200 Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
202 AV * const av = newAV();
204 PERL_ARGS_ASSERT_AV_NEW_ALLOC;
207 Newx(ary, size, SV*); /* Newx performs the memwrap check */
210 AvMAX(av) = size - 1;
213 Zero(ary, size, SV*);
219 /* ------------------------------- cv.h ------------------------------- */
222 =for apidoc_section $CV
224 Returns the GV associated with the CV C<sv>, reifying it if necessary.
228 PERL_STATIC_INLINE GV *
229 Perl_CvGV(pTHX_ CV *sv)
231 PERL_ARGS_ASSERT_CVGV;
234 ? Perl_cvgv_from_hek(aTHX_ sv)
235 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
240 Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a
245 PERL_STATIC_INLINE I32 *
246 Perl_CvDEPTH(const CV * const sv)
248 PERL_ARGS_ASSERT_CVDEPTH;
249 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
251 return &((XPVCV*)SvANY(sv))->xcv_depth;
255 CvPROTO returns the prototype as stored, which is not necessarily what
256 the interpreter should be using. Specifically, the interpreter assumes
257 that spaces have been stripped, which has been the case if the prototype
258 was added by toke.c, but is generally not the case if it was added elsewhere.
259 Since we can't enforce the spacelessness at assignment time, this routine
260 provides a temporary copy at parse time with spaces removed.
261 I<orig> is the start of the original buffer, I<len> is the length of the
262 prototype and will be updated when this returns.
266 PERL_STATIC_INLINE char *
267 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
271 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
279 *len = tmps - SvPVX(tmpsv);
284 /* ------------------------------- iperlsys.h ------------------------------- */
285 #if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS)
287 /* Otherwise this function is implemented as macros in iperlsys.h */
289 PERL_STATIC_INLINE bool
290 S_PerlEnv_putenv(pTHX_ char * str)
292 PERL_ARGS_ASSERT_PERLENV_PUTENV;
295 bool retval = putenv(str);
303 /* ------------------------------- mg.h ------------------------------- */
305 #if defined(PERL_CORE) || defined(PERL_EXT)
306 /* assumes get-magic and stringification have already occurred */
307 PERL_STATIC_INLINE STRLEN
308 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
310 assert(mg->mg_type == PERL_MAGIC_regex_global);
311 assert(mg->mg_len != -1);
312 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
313 return (STRLEN)mg->mg_len;
315 const STRLEN pos = (STRLEN)mg->mg_len;
316 /* Without this check, we may read past the end of the buffer: */
317 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
318 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
323 /* ------------------------------- pad.h ------------------------------ */
325 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
326 PERL_STATIC_INLINE bool
327 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
329 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
331 /* is seq within the range _LOW to _HIGH ?
332 * This is complicated by the fact that PL_cop_seqmax
333 * may have wrapped around at some point */
334 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
335 return FALSE; /* not yet introduced */
337 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
338 /* in compiling scope */
340 (seq > COP_SEQ_RANGE_LOW(pn))
341 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
342 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
347 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
349 ( seq > COP_SEQ_RANGE_LOW(pn)
350 || seq <= COP_SEQ_RANGE_HIGH(pn))
352 : ( seq > COP_SEQ_RANGE_LOW(pn)
353 && seq <= COP_SEQ_RANGE_HIGH(pn))
360 /* ------------------------------- pp.h ------------------------------- */
362 PERL_STATIC_INLINE I32
365 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
366 "MARK top %p %" IVdf "\n",
368 (IV)*PL_markstack_ptr)));
369 return *PL_markstack_ptr;
372 PERL_STATIC_INLINE I32
375 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
376 "MARK pop %p %" IVdf "\n",
377 (PL_markstack_ptr-1),
378 (IV)*(PL_markstack_ptr-1))));
379 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
380 return *PL_markstack_ptr--;
383 /* ----------------------------- regexp.h ----------------------------- */
385 /* PVLVs need to act as a superset of all scalar types - they are basically
386 * PVMGs with a few extra fields.
387 * REGEXPs are first class scalars, but have many fields that can't be copied
390 * Hence we take a different approach - instead of a copy, PVLVs store a pointer
391 * back to the original body. To avoid increasing the size of PVLVs just for the
392 * rare case of REGEXP assignment, this pointer is stored in the memory usually
393 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
394 * read the pointer from the two possible locations. The macro SvLEN() wraps the
395 * access to the union's member xpvlenu_len, but there is no equivalent macro
396 * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
398 * See commit df6b4bd56551f2d3 for more details. */
400 PERL_STATIC_INLINE struct regexp *
401 Perl_ReANY(const REGEXP * const re)
403 XPV* const p = (XPV*)SvANY(re);
405 PERL_ARGS_ASSERT_REANY;
406 assert(isREGEXP(re));
408 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
409 : (struct regexp *)p;
412 /* ------------------------------- utf8.h ------------------------------- */
415 =for apidoc_section $unicode
418 PERL_STATIC_INLINE void
419 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
421 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
422 * encoded string at '*dest', updating '*dest' to include it */
424 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
426 if (NATIVE_BYTE_IS_INVARIANT(byte))
429 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
430 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
435 =for apidoc valid_utf8_to_uvchr
436 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
437 known that the next character in the input UTF-8 string C<s> is well-formed
438 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
439 points, and non-Unicode code points are allowed.
445 PERL_STATIC_INLINE UV
446 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
448 const UV expectlen = UTF8SKIP(s);
449 const U8* send = s + expectlen;
452 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
458 /* An invariant is trivially returned */
459 if (expectlen == 1) {
463 /* Remove the leading bits that indicate the number of bytes, leaving just
464 * the bits that are part of the value */
465 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
467 /* Now, loop through the remaining bytes, accumulating each into the
468 * working total as we go. (I khw tried unrolling the loop for up to 4
469 * bytes, but there was no performance improvement) */
470 for (++s; s < send; s++) {
471 uv = UTF8_ACCUMULATE(uv, *s);
474 return UNI_TO_NATIVE(uv);
479 =for apidoc is_utf8_invariant_string
481 Returns TRUE if the first C<len> bytes of the string C<s> are the same
482 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
483 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
484 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
485 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
486 characters are invariant, but so also are the C1 controls.
488 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
489 use this option, that C<s> can't have embedded C<NUL> characters and has to
490 have a terminating C<NUL> byte).
493 C<L</is_utf8_string>>,
494 C<L</is_utf8_string_flags>>,
495 C<L</is_utf8_string_loc>>,
496 C<L</is_utf8_string_loc_flags>>,
497 C<L</is_utf8_string_loclen>>,
498 C<L</is_utf8_string_loclen_flags>>,
499 C<L</is_utf8_fixed_width_buf_flags>>,
500 C<L</is_utf8_fixed_width_buf_loc_flags>>,
501 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
502 C<L</is_strict_utf8_string>>,
503 C<L</is_strict_utf8_string_loc>>,
504 C<L</is_strict_utf8_string_loclen>>,
505 C<L</is_c9strict_utf8_string>>,
506 C<L</is_c9strict_utf8_string_loc>>,
508 C<L</is_c9strict_utf8_string_loclen>>.
514 #define is_utf8_invariant_string(s, len) \
515 is_utf8_invariant_string_loc(s, len, NULL)
518 =for apidoc is_utf8_invariant_string_loc
520 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
521 the first UTF-8 variant character in the C<ep> pointer; if all characters are
522 UTF-8 invariant, this function does not change the contents of C<*ep>.
528 PERL_STATIC_INLINE bool
529 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
534 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
537 len = strlen((const char *)s);
542 /* This looks like 0x010101... */
543 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
545 /* This looks like 0x808080... */
546 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
547 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
548 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
550 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
551 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
552 * optimized out completely on a 32-bit system, and its mask gets optimized out
553 * on a 64-bit system */
554 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
555 | ( PTR2nat(x) >> 1) \
557 & PERL_WORD_BOUNDARY_MASK) >> 2))))
561 /* Do the word-at-a-time iff there is at least one usable full word. That
562 * means that after advancing to a word boundary, there still is at least a
563 * full word left. The number of bytes needed to advance is 'wordsize -
564 * offset' unless offset is 0. */
565 if ((STRLEN) (send - x) >= PERL_WORDSIZE
567 /* This term is wordsize if subword; 0 if not */
568 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
571 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
574 /* Process per-byte until reach word boundary. XXX This loop could be
575 * eliminated if we knew that this platform had fast unaligned reads */
576 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
577 if (! UTF8_IS_INVARIANT(*x)) {
587 /* Here, we know we have at least one full word to process. Process
588 * per-word as long as we have at least a full word left */
590 if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
592 /* Found a variant. Just return if caller doesn't want its
598 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
599 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
601 *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x);
602 assert(*ep >= s && *ep < send);
606 # else /* If weird byte order, drop into next loop to do byte-at-a-time
615 } while (x + PERL_WORDSIZE <= send);
618 #endif /* End of ! EBCDIC */
620 /* Process per-byte */
622 if (! UTF8_IS_INVARIANT(*x)) {
636 /* See if the platform has builtins for finding the most/least significant bit,
637 * and which one is right for using on 32 and 64 bit operands */
638 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
639 # if U32SIZE == INTSIZE
640 # define PERL_CLZ_32 __builtin_clz
642 # if defined(U64TYPE) && U64SIZE == INTSIZE
643 # define PERL_CLZ_64 __builtin_clz
646 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
647 # if U32SIZE == INTSIZE
648 # define PERL_CTZ_32 __builtin_ctz
650 # if defined(U64TYPE) && U64SIZE == INTSIZE
651 # define PERL_CTZ_64 __builtin_ctz
655 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
656 # if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
657 # define PERL_CLZ_32 __builtin_clzl
659 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
660 # define PERL_CLZ_64 __builtin_clzl
663 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
664 # if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
665 # define PERL_CTZ_32 __builtin_ctzl
667 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
668 # define PERL_CTZ_64 __builtin_ctzl
672 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
673 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
674 # define PERL_CLZ_32 __builtin_clzll
676 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
677 # define PERL_CLZ_64 __builtin_clzll
680 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
681 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
682 # define PERL_CTZ_32 __builtin_ctzll
684 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
685 # define PERL_CTZ_64 __builtin_ctzll
689 #if defined(_MSC_VER)
691 # pragma intrinsic(_BitScanForward)
692 # pragma intrinsic(_BitScanReverse)
694 # pragma intrinsic(_BitScanForward64)
695 # pragma intrinsic(_BitScanReverse64)
699 /* The reason there are not checks to see if ffs() and ffsl() are available for
700 * determining the lsb, is because these don't improve on the deBruijn method
701 * fallback, which is just a branchless integer multiply, array element
702 * retrieval, and shift. The others, even if the function call overhead is
703 * optimized out, have to cope with the possibility of the input being all
704 * zeroes, and almost certainly will have conditionals for this eventuality.
705 * khw, at the time of this commit, looked at the source for both gcc and clang
706 * to verify this. (gcc used a method inferior to deBruijn.) */
708 /* Below are functions to find the first, last, or only set bit in a word. On
709 * platforms with 64-bit capability, there is a pair for each operation; the
710 * first taking a 64 bit operand, and the second a 32 bit one. The logic is
711 * the same in each pair, so the second is stripped of most comments. */
713 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
715 PERL_STATIC_INLINE unsigned
716 Perl_lsbit_pos64(U64 word)
718 /* Find the position (0..63) of the least significant set bit in the input
723 /* If we can determine that the platform has a usable fast method to get
724 * this info, use that */
726 # if defined(PERL_CTZ_64)
727 # define PERL_HAS_FAST_GET_LSB_POS64
729 return (unsigned) PERL_CTZ_64(word);
731 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
732 # define PERL_HAS_FAST_GET_LSB_POS64
736 _BitScanForward64(&index, word);
737 return (unsigned)index;
742 /* Here, we didn't find a fast method for finding the lsb. Fall back to
743 * making the lsb the only set bit in the word, and use our function that
744 * works on words with a single bit set.
747 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
749 * The word will look like this, with a rightmost set bit in position 's':
750 * ('x's are don't cares, and 'y's are their complements)
753 * y..y011..11 Complement
755 * 0..0100..00 And with the original
757 * (Yes, complementing and adding 1 is just taking the negative on 2's
758 * complement machines, but not on 1's complement ones, and some compilers
759 * complain about negating an unsigned.)
761 return single_1bit_pos64(word & (~word + 1));
767 # define lsbit_pos_uintmax_(word) lsbit_pos64(word)
769 # define lsbit_pos_uintmax_(word) lsbit_pos32(word)
772 PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */
773 Perl_lsbit_pos32(U32 word)
775 /* Find the position (0..31) of the least significant set bit in the input
780 #if defined(PERL_CTZ_32)
781 # define PERL_HAS_FAST_GET_LSB_POS32
783 return (unsigned) PERL_CTZ_32(word);
785 #elif U32SIZE == 4 && defined(_MSC_VER)
786 # define PERL_HAS_FAST_GET_LSB_POS32
790 _BitScanForward(&index, word);
791 return (unsigned)index;
796 return single_1bit_pos32(word & (~word + 1));
803 /* Convert the leading zeros count to the bit position of the first set bit.
804 * This just subtracts from the highest position, 31 or 63. But some compilers
805 * don't optimize this optimally, and so a bit of bit twiddling encourages them
806 * to do the right thing. It turns out that subtracting a smaller non-negative
807 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
808 * the two numbers. To see why, first note that the sum of any number, x, and
809 * its complement, x', is all ones. So all ones minus x is x'. Then note that
810 * the xor of x and all ones is x'. */
811 #define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc))
813 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
815 PERL_STATIC_INLINE unsigned
816 Perl_msbit_pos64(U64 word)
818 /* Find the position (0..63) of the most significant set bit in the input
823 /* If we can determine that the platform has a usable fast method to get
826 # if defined(PERL_CLZ_64)
827 # define PERL_HAS_FAST_GET_MSB_POS64
829 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
831 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
832 # define PERL_HAS_FAST_GET_MSB_POS64
836 _BitScanReverse64(&index, word);
837 return (unsigned)index;
842 /* Here, we didn't find a fast method for finding the msb. Fall back to
843 * making the msb the only set bit in the word, and use our function that
844 * works on words with a single bit set.
846 * Isolate the msb; http://codeforces.com/blog/entry/10330
848 * Only the most significant set bit matters. Or'ing word with its right
849 * shift of 1 makes that bit and the next one to its right both 1.
850 * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
851 * ... We end with the msb and all to the right being 1. */
856 word |= (word >> 16);
857 word |= (word >> 32);
859 /* Then subtracting the right shift by 1 clears all but the left-most of
860 * the 1 bits, which is our desired result */
863 /* Now we have a single bit set */
864 return single_1bit_pos64(word);
870 # define msbit_pos_uintmax_(word) msbit_pos64(word)
872 # define msbit_pos_uintmax_(word) msbit_pos32(word)
875 PERL_STATIC_INLINE unsigned
876 Perl_msbit_pos32(U32 word)
878 /* Find the position (0..31) of the most significant set bit in the input
883 #if defined(PERL_CLZ_32)
884 # define PERL_HAS_FAST_GET_MSB_POS32
886 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
888 #elif U32SIZE == 4 && defined(_MSC_VER)
889 # define PERL_HAS_FAST_GET_MSB_POS32
893 _BitScanReverse(&index, word);
894 return (unsigned)index;
903 word |= (word >> 16);
905 return single_1bit_pos32(word);
911 #if UVSIZE == U64SIZE
912 # define msbit_pos(word) msbit_pos64(word)
913 # define lsbit_pos(word) lsbit_pos64(word)
914 #elif UVSIZE == U32SIZE
915 # define msbit_pos(word) msbit_pos32(word)
916 # define lsbit_pos(word) lsbit_pos32(word)
919 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
921 PERL_STATIC_INLINE unsigned
922 Perl_single_1bit_pos64(U64 word)
924 /* Given a 64-bit word known to contain all zero bits except one 1 bit,
925 * find and return the 1's position: 0..63 */
927 # ifdef PERL_CORE /* macro not exported */
928 ASSUME(isPOWER_OF_2(word));
930 ASSUME(word && (word & (word-1)) == 0);
933 /* The only set bit is both the most and least significant bit. If we have
934 * a fast way of finding either one, use that.
936 * It may appear at first glance that those functions call this one, but
937 * they don't if the corresponding #define is set */
939 # ifdef PERL_HAS_FAST_GET_MSB_POS64
941 return msbit_pos64(word);
943 # elif defined(PERL_HAS_FAST_GET_LSB_POS64)
945 return lsbit_pos64(word);
949 /* The position of the only set bit in a word can be quickly calculated
950 * using deBruijn sequences. See for example
951 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
952 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
953 >> PERL_deBruijnShift64_];
960 PERL_STATIC_INLINE unsigned
961 Perl_single_1bit_pos32(U32 word)
963 /* Given a 32-bit word known to contain all zero bits except one 1 bit,
964 * find and return the 1's position: 0..31 */
966 #ifdef PERL_CORE /* macro not exported */
967 ASSUME(isPOWER_OF_2(word));
969 ASSUME(word && (word & (word-1)) == 0);
971 #ifdef PERL_HAS_FAST_GET_MSB_POS32
973 return msbit_pos32(word);
975 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
977 return lsbit_pos32(word);
979 /* Unlikely, but possible for the platform to have a wider fast operation but
980 * not a narrower one. But easy enough to handle the case by widening the
981 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops
982 * would be slower than the deBruijn method.) */
983 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
985 return msbit_pos64(word);
987 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
989 return lsbit_pos64(word);
993 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
994 >> PERL_deBruijnShift32_];
1001 PERL_STATIC_INLINE unsigned int
1002 Perl_variant_byte_number(PERL_UINTMAX_T word)
1004 /* This returns the position in a word (0..7) of the first variant byte in
1005 * it. This is a helper function. Note that there are no branches */
1007 /* Get just the msb bits of each byte */
1008 word &= PERL_VARIANTS_WORD_MASK;
1010 /* This should only be called if we know there is a variant byte in the
1014 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1016 /* Bytes are stored like
1017 * Byte8 ... Byte2 Byte1
1018 * 63..56...15...8 7...0
1019 * so getting the lsb of the whole modified word is getting the msb of the
1020 * first byte that has its msb set */
1021 word = lsbit_pos_uintmax_(word);
1023 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert
1025 return (unsigned int) ((word + 1) >> 3) - 1;
1027 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1029 /* Bytes are stored like
1030 * Byte1 Byte2 ... Byte8
1031 * 63..56 55..47 ... 7...0
1032 * so getting the msb of the whole modified word is getting the msb of the
1033 * first byte that has its msb set */
1034 word = msbit_pos_uintmax_(word);
1036 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert
1038 word = ((word + 1) >> 3) - 1;
1040 /* And invert the result because of the reversed byte order on this
1042 word = CHARBITS - word - 1;
1044 return (unsigned int) word;
1047 # error Unexpected byte order
1053 #if defined(PERL_CORE) || defined(PERL_EXT)
1056 =for apidoc variant_under_utf8_count
1058 This function looks at the sequence of bytes between C<s> and C<e>, which are
1059 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1060 change should the string be translated into UTF-8. Due to the nature of UTF-8,
1061 each of these would occupy two bytes instead of the single one in the input
1062 string. Thus, this function returns the precise number of bytes the string
1063 would expand by when translated to UTF-8.
1065 Unlike most of the other functions that have C<utf8> in their name, the input
1066 to this function is NOT a UTF-8-encoded string. The function name is slightly
1067 I<odd> to emphasize this.
1069 This function is internal to Perl because khw thinks that any XS code that
1070 would want this is probably operating too close to the internals. Presenting a
1071 valid use case could change that.
1074 C<L<perlapi/is_utf8_invariant_string>>
1076 C<L<perlapi/is_utf8_invariant_string_loc>>,
1082 PERL_STATIC_INLINE Size_t
1083 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1088 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1092 /* Test if the string is long enough to use word-at-a-time. (Logic is the
1093 * same as for is_utf8_invariant_string()) */
1094 if ((STRLEN) (e - x) >= PERL_WORDSIZE
1095 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1096 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1099 /* Process per-byte until reach word boundary. XXX This loop could be
1100 * eliminated if we knew that this platform had fast unaligned reads */
1101 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1102 count += ! UTF8_IS_INVARIANT(*x++);
1105 /* Process per-word as long as we have at least a full word left */
1106 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1107 explanation of how this works */
1108 PERL_UINTMAX_T increment
1109 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1110 * PERL_COUNT_MULTIPLIER)
1111 >> ((PERL_WORDSIZE - 1) * CHARBITS);
1112 count += (Size_t) increment;
1114 } while (x + PERL_WORDSIZE <= e);
1119 /* Process per-byte */
1121 if (! UTF8_IS_INVARIANT(*x)) {
1133 /* Keep these around for these files */
1134 #if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
1135 # undef PERL_WORDSIZE
1136 # undef PERL_COUNT_MULTIPLIER
1137 # undef PERL_WORD_BOUNDARY_MASK
1138 # undef PERL_VARIANTS_WORD_MASK
1142 =for apidoc is_utf8_string
1144 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1145 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
1146 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1147 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1148 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1150 This function considers Perl's extended UTF-8 to be valid. That means that
1151 code points above Unicode, surrogates, and non-character code points are
1152 considered valid by this function. Use C<L</is_strict_utf8_string>>,
1153 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1154 code points are considered valid.
1157 C<L</is_utf8_invariant_string>>,
1158 C<L</is_utf8_invariant_string_loc>>,
1159 C<L</is_utf8_string_loc>>,
1160 C<L</is_utf8_string_loclen>>,
1161 C<L</is_utf8_fixed_width_buf_flags>>,
1162 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1163 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1168 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
1170 #if defined(PERL_CORE) || defined (PERL_EXT)
1173 =for apidoc is_utf8_non_invariant_string
1175 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1176 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1177 UTF-8; otherwise returns FALSE.
1179 A TRUE return means that at least one code point represented by the sequence
1180 either is a wide character not representable as a single byte, or the
1181 representation differs depending on whether the sequence is encoded in UTF-8 or
1185 C<L<perlapi/is_utf8_invariant_string>>,
1186 C<L<perlapi/is_utf8_string>>
1190 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1191 It generally needn't be if its string is entirely UTF-8 invariant, and it
1192 shouldn't be if it otherwise contains invalid UTF-8.
1194 It is an internal function because khw thinks that XS code shouldn't be working
1195 at this low a level. A valid use case could change that.
1199 PERL_STATIC_INLINE bool
1200 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1202 const U8 * first_variant;
1204 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1206 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1210 return is_utf8_string(first_variant, len - (first_variant - s));
1216 =for apidoc is_strict_utf8_string
1218 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1219 UTF-8-encoded string that is fully interchangeable by any application using
1220 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
1221 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1222 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1223 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1225 This function returns FALSE for strings containing any
1226 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1227 non-character code points.
1230 C<L</is_utf8_invariant_string>>,
1231 C<L</is_utf8_invariant_string_loc>>,
1232 C<L</is_utf8_string>>,
1233 C<L</is_utf8_string_flags>>,
1234 C<L</is_utf8_string_loc>>,
1235 C<L</is_utf8_string_loc_flags>>,
1236 C<L</is_utf8_string_loclen>>,
1237 C<L</is_utf8_string_loclen_flags>>,
1238 C<L</is_utf8_fixed_width_buf_flags>>,
1239 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1240 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1241 C<L</is_strict_utf8_string_loc>>,
1242 C<L</is_strict_utf8_string_loclen>>,
1243 C<L</is_c9strict_utf8_string>>,
1244 C<L</is_c9strict_utf8_string_loc>>,
1246 C<L</is_c9strict_utf8_string_loclen>>.
1251 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
1254 =for apidoc is_c9strict_utf8_string
1256 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1257 UTF-8-encoded string that conforms to
1258 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1259 otherwise it returns FALSE. If C<len> is 0, it will be calculated using
1260 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1261 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
1262 characters being ASCII constitute 'a valid UTF-8 string'.
1264 This function returns FALSE for strings containing any code points above the
1265 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1267 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1270 C<L</is_utf8_invariant_string>>,
1271 C<L</is_utf8_invariant_string_loc>>,
1272 C<L</is_utf8_string>>,
1273 C<L</is_utf8_string_flags>>,
1274 C<L</is_utf8_string_loc>>,
1275 C<L</is_utf8_string_loc_flags>>,
1276 C<L</is_utf8_string_loclen>>,
1277 C<L</is_utf8_string_loclen_flags>>,
1278 C<L</is_utf8_fixed_width_buf_flags>>,
1279 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1280 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1281 C<L</is_strict_utf8_string>>,
1282 C<L</is_strict_utf8_string_loc>>,
1283 C<L</is_strict_utf8_string_loclen>>,
1284 C<L</is_c9strict_utf8_string_loc>>,
1286 C<L</is_c9strict_utf8_string_loclen>>.
1291 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1294 =for apidoc is_utf8_string_flags
1296 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1297 UTF-8 string, subject to the restrictions imposed by C<flags>;
1298 returns FALSE otherwise. If C<len> is 0, it will be calculated
1299 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1300 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
1301 that all characters being ASCII constitute 'a valid UTF-8 string'.
1303 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1304 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1305 as C<L</is_strict_utf8_string>>; and if C<flags> is
1306 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1307 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
1308 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1309 C<L</utf8n_to_uvchr>>, with the same meanings.
1312 C<L</is_utf8_invariant_string>>,
1313 C<L</is_utf8_invariant_string_loc>>,
1314 C<L</is_utf8_string>>,
1315 C<L</is_utf8_string_loc>>,
1316 C<L</is_utf8_string_loc_flags>>,
1317 C<L</is_utf8_string_loclen>>,
1318 C<L</is_utf8_string_loclen_flags>>,
1319 C<L</is_utf8_fixed_width_buf_flags>>,
1320 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1321 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1322 C<L</is_strict_utf8_string>>,
1323 C<L</is_strict_utf8_string_loc>>,
1324 C<L</is_strict_utf8_string_loclen>>,
1325 C<L</is_c9strict_utf8_string>>,
1326 C<L</is_c9strict_utf8_string_loc>>,
1328 C<L</is_c9strict_utf8_string_loclen>>.
1333 PERL_STATIC_INLINE bool
1334 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1336 const U8 * first_variant;
1338 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1339 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1340 |UTF8_DISALLOW_PERL_EXTENDED)));
1343 len = strlen((const char *)s);
1347 return is_utf8_string(s, len);
1350 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1351 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1353 return is_strict_utf8_string(s, len);
1356 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1357 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1359 return is_c9strict_utf8_string(s, len);
1362 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1363 const U8* const send = s + len;
1364 const U8* x = first_variant;
1367 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1368 if (UNLIKELY(! cur_len)) {
1380 =for apidoc is_utf8_string_loc
1382 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1383 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1384 "utf8ness success") in the C<ep> pointer.
1386 See also C<L</is_utf8_string_loclen>>.
1391 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
1395 =for apidoc is_utf8_string_loclen
1397 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1398 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1399 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1400 encoded characters in the C<el> pointer.
1402 See also C<L</is_utf8_string_loc>>.
1407 PERL_STATIC_INLINE bool
1408 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1410 const U8 * first_variant;
1412 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1415 len = strlen((const char *) s);
1418 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1430 const U8* const send = s + len;
1431 const U8* x = first_variant;
1432 STRLEN outlen = first_variant - s;
1435 const STRLEN cur_len = isUTF8_CHAR(x, send);
1436 if (UNLIKELY(! cur_len)) {
1454 /* The perl core arranges to never call the DFA below without there being at
1455 * least one byte available to look at. This allows the DFA to use a do {}
1456 * while loop which means that calling it with a UTF-8 invariant has a single
1457 * conditional, same as the calling code checking for invariance ahead of time.
1458 * And having the calling code remove that conditional speeds up by that
1459 * conditional, the case where it wasn't invariant. So there's no reason to
1460 * check before caling this.
1462 * But we don't know this for non-core calls, so have to retain the check for
1465 # define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s))
1467 # define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE
1471 * DFA for checking input is valid UTF-8 syntax.
1473 * This uses adaptations of the table and algorithm given in
1474 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1475 * documentation of the original version. A copyright notice for the original
1476 * version is given at the beginning of this file. The Perl adaptations are
1477 * documented at the definition of PL_extended_utf8_dfa_tab[].
1479 * This dfa is fast. There are three exit conditions:
1480 * 1) a well-formed code point, acceptable to the table
1481 * 2) the beginning bytes of an incomplete character, whose completion might
1482 * or might not be acceptable
1483 * 3) unacceptable to the table. Some of the adaptations have certain,
1484 * hopefully less likely to occur, legal inputs be unacceptable to the
1485 * table, so these must be sorted out afterwards.
1487 * This macro is a complete implementation of the code executing the DFA. It
1488 * is passed the input sequence bounds and the table to use, and what to do
1489 * for each of the exit conditions. There are three canned actions, likely to
1490 * be the ones you want:
1491 * DFA_RETURN_SUCCESS_
1492 * DFA_RETURN_FAILURE_
1493 * DFA_GOTO_TEASE_APART_FF_
1495 * You pass a parameter giving the action to take for each of the three
1496 * possible exit conditions:
1498 * 'accept_action' This is executed when the DFA accepts the input.
1499 * DFA_RETURN_SUCCESS_ is the most likely candidate.
1500 * 'reject_action' This is executed when the DFA rejects the input.
1501 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1502 * you have written code to distinguish the rejecting state
1503 * results. Because it happens in several places, and
1504 * involves #ifdefs, the special action
1505 * DFA_GOTO_TEASE_APART_FF_ is what you want with
1506 * PL_extended_utf8_dfa_tab. On platforms without
1507 * EXTRA_LONG_UTF8, there is no need to tease anything apart,
1508 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1509 * need to have a label 'tease_apart_FF' that it will transfer
1511 * 'incomplete_char_action' This is executed when the DFA ran off the end
1512 * before accepting or rejecting the input.
1513 * DFA_RETURN_FAILURE_ is the likely action, but you could
1514 * have a 'goto', or NOOP. In the latter case the DFA drops
1515 * off the end, and you place your code to handle this case
1516 * immediately after it.
1519 #define DFA_RETURN_SUCCESS_ return s - s0
1520 #define DFA_RETURN_FAILURE_ return 0
1521 #ifdef HAS_EXTRA_LONG_UTF8
1522 # define DFA_TEASE_APART_FF_ goto tease_apart_FF
1524 # define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_
1527 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \
1530 incomplete_char_action) \
1532 const U8 * s = s0; \
1533 const U8 * e_ = e; \
1536 PERL_NON_CORE_CHECK_EMPTY(s, e_); \
1539 state = dfa_tab[256 + state + dfa_tab[*s]]; \
1542 if (state == 0) { /* Accepting state */ \
1546 if (UNLIKELY(state == 1)) { /* Rejecting state */ \
1551 /* Here, dropped out of loop before end-of-char */ \
1552 incomplete_char_action; \
1558 =for apidoc isUTF8_CHAR
1560 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1561 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1562 that represents some code point; otherwise it evaluates to 0. If non-zero, the
1563 value gives how many bytes starting at C<s> comprise the code point's
1564 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1565 form the first code point in C<s>, are not examined.
1567 The code point can be any that will fit in an IV on this machine, using Perl's
1568 extension to official UTF-8 to represent those higher than the Unicode maximum
1569 of 0x10FFFF. That means that this macro is used to efficiently decide if the
1570 next few bytes in C<s> is legal UTF-8 for a single character.
1572 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1573 defined by Unicode to be fully interchangeable across applications;
1574 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1575 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1576 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1578 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1579 C<L</is_utf8_string_loclen>> to check entire strings.
1581 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1582 machines) is a valid UTF-8 character.
1586 This uses an adaptation of the table and algorithm given in
1587 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1588 documentation of the original version. A copyright notice for the original
1589 version is given at the beginning of this file. The Perl adaptation is
1590 documented at the definition of PL_extended_utf8_dfa_tab[].
1593 PERL_STATIC_INLINE Size_t
1594 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1596 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1598 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1599 DFA_RETURN_SUCCESS_,
1600 DFA_TEASE_APART_FF_,
1601 DFA_RETURN_FAILURE_);
1603 /* Here, we didn't return success, but dropped out of the loop. In the
1604 * case of PL_extended_utf8_dfa_tab, this means the input is either
1605 * malformed, or the start byte was FF on a platform that the dfa doesn't
1606 * handle FF's. Call a helper function. */
1608 #ifdef HAS_EXTRA_LONG_UTF8
1612 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1613 * either malformed, or was for the largest possible start byte, which we
1614 * now check, not inline */
1615 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1619 return is_utf8_FF_helper_(s0, e,
1620 FALSE /* require full, not partial char */
1628 =for apidoc isSTRICT_UTF8_CHAR
1630 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1631 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1632 Unicode code point completely acceptable for open interchange between all
1633 applications; otherwise it evaluates to 0. If non-zero, the value gives how
1634 many bytes starting at C<s> comprise the code point's representation. Any
1635 bytes remaining before C<e>, but beyond the ones needed to form the first code
1636 point in C<s>, are not examined.
1638 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1639 be a surrogate nor a non-character code point. Thus this excludes any code
1640 point from Perl's extended UTF-8.
1642 This is used to efficiently decide if the next few bytes in C<s> is
1643 legal Unicode-acceptable UTF-8 for a single character.
1645 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1646 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1647 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1648 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1650 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1651 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1655 This uses an adaptation of the tables and algorithm given in
1656 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1657 documentation of the original version. A copyright notice for the original
1658 version is given at the beginning of this file. The Perl adaptation is
1659 documented at the definition of strict_extended_utf8_dfa_tab[].
1663 PERL_STATIC_INLINE Size_t
1664 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1666 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1668 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1669 DFA_RETURN_SUCCESS_,
1671 DFA_RETURN_FAILURE_);
1674 /* Here, we didn't return success, but dropped out of the loop. In the
1675 * case of PL_strict_utf8_dfa_tab, this means the input is either
1676 * malformed, or was for certain Hanguls; handle them specially */
1678 /* The dfa above drops out for incomplete or illegal inputs, and certain
1679 * legal Hanguls; check and return accordingly */
1680 return is_HANGUL_ED_utf8_safe(s0, e);
1685 =for apidoc isC9_STRICT_UTF8_CHAR
1687 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1688 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1689 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1690 the value gives how many bytes starting at C<s> comprise the code point's
1691 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1692 form the first code point in C<s>, are not examined.
1694 The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1695 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1696 code points. This corresponds to
1697 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1698 which said that non-character code points are merely discouraged rather than
1699 completely forbidden in open interchange. See
1700 L<perlunicode/Noncharacter code points>.
1702 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1703 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1705 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1706 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1710 This uses an adaptation of the tables and algorithm given in
1711 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1712 documentation of the original version. A copyright notice for the original
1713 version is given at the beginning of this file. The Perl adaptation is
1714 documented at the definition of PL_c9_utf8_dfa_tab[].
1718 PERL_STATIC_INLINE Size_t
1719 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1721 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1723 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1724 DFA_RETURN_SUCCESS_,
1725 DFA_RETURN_FAILURE_,
1726 DFA_RETURN_FAILURE_);
1731 =for apidoc is_strict_utf8_string_loc
1733 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1734 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1735 "utf8ness success") in the C<ep> pointer.
1737 See also C<L</is_strict_utf8_string_loclen>>.
1742 #define is_strict_utf8_string_loc(s, len, ep) \
1743 is_strict_utf8_string_loclen(s, len, ep, 0)
1747 =for apidoc is_strict_utf8_string_loclen
1749 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1750 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1751 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1752 encoded characters in the C<el> pointer.
1754 See also C<L</is_strict_utf8_string_loc>>.
1759 PERL_STATIC_INLINE bool
1760 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1762 const U8 * first_variant;
1764 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1767 len = strlen((const char *) s);
1770 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1782 const U8* const send = s + len;
1783 const U8* x = first_variant;
1784 STRLEN outlen = first_variant - s;
1787 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1788 if (UNLIKELY(! cur_len)) {
1808 =for apidoc is_c9strict_utf8_string_loc
1810 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1811 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1812 "utf8ness success") in the C<ep> pointer.
1814 See also C<L</is_c9strict_utf8_string_loclen>>.
1819 #define is_c9strict_utf8_string_loc(s, len, ep) \
1820 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1824 =for apidoc is_c9strict_utf8_string_loclen
1826 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1827 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1828 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1829 characters in the C<el> pointer.
1831 See also C<L</is_c9strict_utf8_string_loc>>.
1836 PERL_STATIC_INLINE bool
1837 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1839 const U8 * first_variant;
1841 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1844 len = strlen((const char *) s);
1847 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1859 const U8* const send = s + len;
1860 const U8* x = first_variant;
1861 STRLEN outlen = first_variant - s;
1864 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1865 if (UNLIKELY(! cur_len)) {
1885 =for apidoc is_utf8_string_loc_flags
1887 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1888 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1889 "utf8ness success") in the C<ep> pointer.
1891 See also C<L</is_utf8_string_loclen_flags>>.
1896 #define is_utf8_string_loc_flags(s, len, ep, flags) \
1897 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1900 /* The above 3 actual functions could have been moved into the more general one
1901 * just below, and made #defines that call it with the right 'flags'. They are
1902 * currently kept separate to increase their chances of getting inlined */
1906 =for apidoc is_utf8_string_loclen_flags
1908 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1909 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1910 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1911 encoded characters in the C<el> pointer.
1913 See also C<L</is_utf8_string_loc_flags>>.
1918 PERL_STATIC_INLINE bool
1919 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1921 const U8 * first_variant;
1923 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1924 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1925 |UTF8_DISALLOW_PERL_EXTENDED)));
1928 len = strlen((const char *) s);
1932 return is_utf8_string_loclen(s, len, ep, el);
1935 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1936 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1938 return is_strict_utf8_string_loclen(s, len, ep, el);
1941 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1942 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1944 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1947 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1959 const U8* send = s + len;
1960 const U8* x = first_variant;
1961 STRLEN outlen = first_variant - s;
1964 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1965 if (UNLIKELY(! cur_len)) {
1984 =for apidoc utf8_distance
1986 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1989 WARNING: use only if you *know* that the pointers point inside the
1995 PERL_STATIC_INLINE IV
1996 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1998 PERL_ARGS_ASSERT_UTF8_DISTANCE;
2000 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2004 =for apidoc utf8_hop
2006 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2007 forward (if C<off> is positive) or backward (if negative). C<s> does not need
2008 to be pointing to the starting byte of a character. If it isn't, one count of
2009 C<off> will be used up to get to the start of the next character for forward
2010 hops, and to the start of the current character for negative ones.
2012 WARNING: Prefer L</utf8_hop_safe> to this one.
2014 Do NOT use this function unless you B<know> C<off> is within
2015 the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
2016 on the first byte of a character or just after the last byte of a character.
2021 PERL_STATIC_INLINE U8 *
2022 Perl_utf8_hop(const U8 *s, SSize_t off)
2024 PERL_ARGS_ASSERT_UTF8_HOP;
2026 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2027 * the XXX bitops (especially ~) can create illegal UTF-8.
2028 * In other words: in Perl UTF-8 is not just for Unicode. */
2032 /* Get to next non-continuation byte */
2033 if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2037 while (UTF8_IS_CONTINUATION(*s));
2047 while (UTF8_IS_CONTINUATION(*s))
2052 GCC_DIAG_IGNORE(-Wcast-qual)
2058 =for apidoc utf8_hop_forward
2060 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2061 forward. C<s> does not need to be pointing to the starting byte of a
2062 character. If it isn't, one count of C<off> will be used up to get to the
2063 start of the next character.
2065 C<off> must be non-negative.
2067 C<s> must be before or equal to C<end>.
2069 When moving forward it will not move beyond C<end>.
2071 Will not exceed this limit even if the string is not valid "UTF-8".
2076 PERL_STATIC_INLINE U8 *
2077 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2079 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2081 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2082 * the bitops (especially ~) can create illegal UTF-8.
2083 * In other words: in Perl UTF-8 is not just for Unicode. */
2088 if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2089 /* Get to next non-continuation byte */
2093 while (UTF8_IS_CONTINUATION(*s));
2098 STRLEN skip = UTF8SKIP(s);
2099 if ((STRLEN)(end - s) <= skip) {
2100 GCC_DIAG_IGNORE(-Wcast-qual)
2107 GCC_DIAG_IGNORE(-Wcast-qual)
2113 =for apidoc utf8_hop_back
2115 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2116 backward. C<s> does not need to be pointing to the starting byte of a
2117 character. If it isn't, one count of C<off> will be used up to get to that
2120 C<off> must be non-positive.
2122 C<s> must be after or equal to C<start>.
2124 When moving backward it will not move before C<start>.
2126 Will not exceed this limit even if the string is not valid "UTF-8".
2131 PERL_STATIC_INLINE U8 *
2132 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2134 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2136 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2137 * the bitops (especially ~) can create illegal UTF-8.
2138 * In other words: in Perl UTF-8 is not just for Unicode. */
2143 /* Note: if we know that the input is well-formed, we can do per-word
2144 * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2145 * that. But it was reverted because doing per-word has some
2146 * start-up/tear-down overhead, so only makes sense if the distance to be
2147 * moved is large, and core perl doesn't currently move more than a few
2148 * characters at a time. You can reinstate it if it does become
2150 while (off++ && s > start) {
2153 } while (UTF8_IS_CONTINUATION(*s) && s > start);
2156 GCC_DIAG_IGNORE(-Wcast-qual)
2162 =for apidoc utf8_hop_safe
2164 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2165 either forward or backward. C<s> does not need to be pointing to the starting
2166 byte of a character. If it isn't, one count of C<off> will be used up to get
2167 to the start of the next character for forward hops, and to the start of the
2168 current character for negative ones.
2170 When moving backward it will not move before C<start>.
2172 When moving forward it will not move beyond C<end>.
2174 Will not exceed those limits even if the string is not valid "UTF-8".
2179 PERL_STATIC_INLINE U8 *
2180 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2182 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2184 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2185 * the bitops (especially ~) can create illegal UTF-8.
2186 * In other words: in Perl UTF-8 is not just for Unicode. */
2188 assert(start <= s && s <= end);
2191 return utf8_hop_forward(s, off, end);
2194 return utf8_hop_back(s, off, start);
2200 =for apidoc isUTF8_CHAR_flags
2202 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2203 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2204 that represents some code point, subject to the restrictions given by C<flags>;
2205 otherwise it evaluates to 0. If non-zero, the value gives how many bytes
2206 starting at C<s> comprise the code point's representation. Any bytes remaining
2207 before C<e>, but beyond the ones needed to form the first code point in C<s>,
2210 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2211 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2212 as C<L</isSTRICT_UTF8_CHAR>>;
2213 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2214 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2215 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2216 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2218 The three alternative macros are for the most commonly needed validations; they
2219 are likely to run somewhat faster than this more general one, as they can be
2220 inlined into your code.
2222 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2223 L</is_utf8_string_loclen_flags> to check entire strings.
2228 PERL_STATIC_INLINE STRLEN
2229 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2231 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2232 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2233 |UTF8_DISALLOW_PERL_EXTENDED)));
2235 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2237 DFA_TEASE_APART_FF_,
2238 DFA_RETURN_FAILURE_);
2242 return is_utf8_char_helper_(s0, e, flags);
2244 #ifdef HAS_EXTRA_LONG_UTF8
2248 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2249 * either malformed, or was for the largest possible start byte, which
2250 * indicates perl extended UTF-8, well above the Unicode maximum */
2251 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2252 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2257 /* Otherwise examine the sequence not inline */
2258 return is_utf8_FF_helper_(s0, e,
2259 FALSE /* require full, not partial char */
2267 =for apidoc is_utf8_valid_partial_char
2269 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2270 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2271 points. Otherwise, it returns 1 if there exists at least one non-empty
2272 sequence of bytes that when appended to sequence C<s>, starting at position
2273 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2274 otherwise returns 0.
2276 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2279 This is useful when a fixed-length buffer is being tested for being well-formed
2280 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2281 it is split somewhere in the middle of the final code point's UTF-8
2282 representation. (Presumably when the buffer is refreshed with the next chunk
2283 of data, the new first bytes will complete the partial code point.) This
2284 function is used to verify that the final bytes in the current buffer are in
2285 fact the legal beginning of some code point, so that if they aren't, the
2286 failure can be signalled without having to wait for the next read.
2290 #define is_utf8_valid_partial_char(s, e) \
2291 is_utf8_valid_partial_char_flags(s, e, 0)
2295 =for apidoc is_utf8_valid_partial_char_flags
2297 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2298 or not the input is a valid UTF-8 encoded partial character, but it takes an
2299 extra parameter, C<flags>, which can further restrict which code points are
2302 If C<flags> is 0, this behaves identically to
2303 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
2304 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
2305 there is any sequence of bytes that can complete the input partial character in
2306 such a way that a non-prohibited character is formed, the function returns
2307 TRUE; otherwise FALSE. Non character code points cannot be determined based on
2308 partial character input. But many of the other possible excluded types can be
2309 determined from just the first one or two bytes.
2314 PERL_STATIC_INLINE bool
2315 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2317 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2318 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2319 |UTF8_DISALLOW_PERL_EXTENDED)));
2321 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2322 DFA_RETURN_FAILURE_,
2323 DFA_TEASE_APART_FF_,
2326 /* The NOOP above causes the DFA to drop down here iff the input was a
2327 * partial character. flags=0 => can return TRUE immediately; otherwise we
2328 * need to check (not inline) if the partial character is the beginning of
2329 * a disallowed one */
2334 return cBOOL(is_utf8_char_helper_(s0, e, flags));
2336 #ifdef HAS_EXTRA_LONG_UTF8
2340 /* Getting here means the input is either malformed, or, in the case of
2341 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
2342 * latter case has to be extended UTF-8, so can fail immediately if that is
2345 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2346 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2351 return is_utf8_FF_helper_(s0, e,
2352 TRUE /* Require to be a partial character */
2360 =for apidoc is_utf8_fixed_width_buf_flags
2362 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2363 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2364 otherwise it returns FALSE.
2366 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2367 without restriction. If the final few bytes of the buffer do not form a
2368 complete code point, this will return TRUE anyway, provided that
2369 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2371 If C<flags> in non-zero, it can be any combination of the
2372 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2375 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2376 returns FALSE if the final few bytes of the string don't form a complete code
2381 #define is_utf8_fixed_width_buf_flags(s, len, flags) \
2382 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2386 =for apidoc is_utf8_fixed_width_buf_loc_flags
2388 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2389 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
2390 to the beginning of any partial character at the end of the buffer; if there is
2391 no partial character C<*ep> will contain C<s>+C<len>.
2393 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2398 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
2399 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2403 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2405 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2406 complete, valid characters found in the C<el> pointer.
2411 PERL_STATIC_INLINE bool
2412 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2418 const U8 * maybe_partial;
2420 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2423 ep = &maybe_partial;
2426 /* If it's entirely valid, return that; otherwise see if the only error is
2427 * that the final few bytes are for a partial character */
2428 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
2429 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2432 PERL_STATIC_INLINE UV
2433 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2440 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
2441 * simple cases, and, if necessary calls a helper function to deal with the
2442 * more complex ones. Almost all well-formed non-problematic code points
2443 * are considered simple, so that it's unlikely that the helper function
2444 * will need to be called.
2446 * This is an adaptation of the tables and algorithm given in
2447 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2448 * comprehensive documentation of the original version. A copyright notice
2449 * for the original version is given at the beginning of this file. The
2450 * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[].
2453 const U8 * const s0 = s;
2454 const U8 * send = s0 + curlen;
2458 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2460 /* This dfa is fast. If it accepts the input, it was for a well-formed,
2461 * non-problematic code point, which can be returned immediately.
2462 * Otherwise we call a helper function to figure out the more complicated
2465 /* No calls from core pass in an empty string; non-core need a check */
2469 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2470 flags, errors, msgs);
2473 type = PL_strict_utf8_dfa_tab[*s];
2475 /* The table is structured so that 'type' is 0 iff the input byte is
2476 * represented identically regardless of the UTF-8ness of the string */
2477 if (type == 0) { /* UTF-8 invariants are returned unchanged */
2481 UV state = PL_strict_utf8_dfa_tab[256 + type];
2482 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
2484 while (++s < send) {
2485 type = PL_strict_utf8_dfa_tab[*s];
2486 state = PL_strict_utf8_dfa_tab[256 + state + type];
2488 uv = UTF8_ACCUMULATE(uv, *s);
2492 uv = UNI_TO_NATIVE(uv);
2497 if (UNLIKELY(state == 1)) {
2502 /* Here is potentially problematic. Use the full mechanism */
2503 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2509 *retlen = s - s0 + 1;
2521 PERL_STATIC_INLINE UV
2522 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2524 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2528 if (! ckWARN_d(WARN_UTF8)) {
2530 /* EMPTY is not really allowed, and asserts on debugging builds. But
2531 * on non-debugging we have to deal with it, and this causes it to
2532 * return the REPLACEMENT CHARACTER, as the documentation indicates */
2533 return utf8n_to_uvchr(s, send - s, retlen,
2534 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2537 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2538 if (retlen && ret == 0 && (send <= s || *s != '\0')) {
2539 *retlen = (STRLEN) -1;
2546 /* ------------------------------- perl.h ----------------------------- */
2549 =for apidoc_section $utility
2551 =for apidoc is_safe_syscall
2553 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2555 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2556 category, and return FALSE.
2558 Return TRUE if the name is safe.
2560 C<what> and C<op_name> are used in any warning.
2562 Used by the C<IS_SAFE_SYSCALL()> macro.
2567 PERL_STATIC_INLINE bool
2568 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2570 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2571 * perl itself uses xce*() functions which accept 8-bit strings.
2574 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2578 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2579 SETERRNO(ENOENT, LIB_INVARG);
2580 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2581 "Invalid \\0 character in %s for %s: %s\\0%s",
2582 what, op_name, pv, null_at+1);
2592 Return true if the supplied filename has a newline character
2593 immediately before the first (hopefully only) NUL.
2595 My original look at this incorrectly used the len from SvPV(), but
2596 that's incorrect, since we allow for a NUL in pv[len-1].
2598 So instead, strlen() and work from there.
2600 This allow for the user reading a filename, forgetting to chomp it,
2603 open my $foo, "$file\0";
2609 PERL_STATIC_INLINE bool
2610 S_should_warn_nl(const char *pv)
2614 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2618 return len > 0 && pv[len-1] == '\n';
2623 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2625 PERL_STATIC_INLINE bool
2626 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2628 /* This function determines if the input NV 'nv' may be converted without
2629 * loss of data to an IV. If not, it returns FALSE taking no other action.
2630 * But if it is possible, it does the conversion, returning TRUE, and
2631 * storing the converted result in '*ivp' */
2633 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2635 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2636 /* Normally any comparison with a NaN returns false; if we can't rely
2637 * on that behaviour, check explicitly */
2638 if (UNLIKELY(Perl_isnan(nv))) {
2643 /* Written this way so that with an always-false NaN comparison we
2645 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2649 if ((IV) nv != nv) {
2659 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2661 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2663 #define MAX_CHARSET_NAME_LENGTH 2
2665 PERL_STATIC_INLINE const char *
2666 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2668 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2670 /* Returns a string that corresponds to the name of the regex character set
2671 * given by 'flags', and *lenp is set the length of that string, which
2672 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2675 switch (get_regex_charset(flags)) {
2676 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2677 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2678 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2679 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2680 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2682 return ASCII_MORE_RESTRICT_PAT_MODS;
2684 /* The NOT_REACHED; hides an assert() which has a rather complex
2685 * definition in perl.h. */
2686 NOT_REACHED; /* NOTREACHED */
2687 return "?"; /* Unknown */
2694 Return false if any get magic is on the SV other than taint magic.
2698 PERL_STATIC_INLINE bool
2699 Perl_sv_only_taint_gmagic(SV *sv)
2701 MAGIC *mg = SvMAGIC(sv);
2703 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2706 if (mg->mg_type != PERL_MAGIC_taint
2707 && !(mg->mg_flags & MGf_GSKIP)
2708 && mg->mg_virtual->svt_get) {
2711 mg = mg->mg_moremagic;
2717 /* ------------------ cop.h ------------------------------------------- */
2719 /* implement GIMME_V() macro */
2721 PERL_STATIC_INLINE U8
2725 U8 gimme = (PL_op->op_flags & OPf_WANT);
2729 cxix = PL_curstackinfo->si_cxsubix;
2731 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2732 assert(cxstack[cxix].blk_gimme & G_WANT);
2733 return (cxstack[cxix].blk_gimme & G_WANT);
2737 /* Enter a block. Push a new base context and return its address. */
2739 PERL_STATIC_INLINE PERL_CONTEXT *
2740 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2744 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2749 cx->blk_gimme = gimme;
2750 cx->blk_oldsaveix = saveix;
2751 cx->blk_oldsp = (I32)(sp - PL_stack_base);
2752 cx->blk_oldcop = PL_curcop;
2753 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
2754 cx->blk_oldscopesp = PL_scopestack_ix;
2755 cx->blk_oldpm = PL_curpm;
2756 cx->blk_old_tmpsfloor = PL_tmps_floor;
2758 PL_tmps_floor = PL_tmps_ix;
2759 CX_DEBUG(cx, "PUSH");
2764 /* Exit a block (RETURN and LAST). */
2766 PERL_STATIC_INLINE void
2767 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2769 PERL_ARGS_ASSERT_CX_POPBLOCK;
2771 CX_DEBUG(cx, "POP");
2772 /* these 3 are common to cx_popblock and cx_topblock */
2773 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2774 PL_scopestack_ix = cx->blk_oldscopesp;
2775 PL_curpm = cx->blk_oldpm;
2777 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2778 * and leaves a CX entry lying around for repeated use, so
2779 * skip for multicall */ \
2780 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2781 || PL_savestack_ix == cx->blk_oldsaveix);
2782 PL_curcop = cx->blk_oldcop;
2783 PL_tmps_floor = cx->blk_old_tmpsfloor;
2786 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2787 * Whereas cx_popblock() restores the state to the point just before
2788 * cx_pushblock() was called, cx_topblock() restores it to the point just
2789 * *after* cx_pushblock() was called. */
2791 PERL_STATIC_INLINE void
2792 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2794 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2796 CX_DEBUG(cx, "TOP");
2797 /* these 3 are common to cx_popblock and cx_topblock */
2798 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2799 PL_scopestack_ix = cx->blk_oldscopesp;
2800 PL_curpm = cx->blk_oldpm;
2802 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2806 PERL_STATIC_INLINE void
2807 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2809 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2811 PERL_ARGS_ASSERT_CX_PUSHSUB;
2813 PERL_DTRACE_PROBE_ENTRY(cv);
2814 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2815 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2816 cx->blk_sub.cv = cv;
2817 cx->blk_sub.olddepth = CvDEPTH(cv);
2818 cx->blk_sub.prevcomppad = PL_comppad;
2819 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2820 cx->blk_sub.retop = retop;
2821 SvREFCNT_inc_simple_void_NN(cv);
2822 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2826 /* subsets of cx_popsub() */
2828 PERL_STATIC_INLINE void
2829 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2833 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2834 assert(CxTYPE(cx) == CXt_SUB);
2836 PL_comppad = cx->blk_sub.prevcomppad;
2837 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2838 cv = cx->blk_sub.cv;
2839 CvDEPTH(cv) = cx->blk_sub.olddepth;
2840 cx->blk_sub.cv = NULL;
2842 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2846 /* handle the @_ part of leaving a sub */
2848 PERL_STATIC_INLINE void
2849 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2853 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2854 assert(CxTYPE(cx) == CXt_SUB);
2855 assert(AvARRAY(MUTABLE_AV(
2856 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2857 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2859 CX_POP_SAVEARRAY(cx);
2860 av = MUTABLE_AV(PAD_SVl(0));
2861 if (UNLIKELY(AvREAL(av)))
2862 /* abandon @_ if it got reified */
2863 clear_defarray(av, 0);
2870 PERL_STATIC_INLINE void
2871 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2873 PERL_ARGS_ASSERT_CX_POPSUB;
2874 assert(CxTYPE(cx) == CXt_SUB);
2876 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2880 cx_popsub_common(cx);
2884 PERL_STATIC_INLINE void
2885 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2887 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2889 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2890 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2891 cx->blk_format.cv = cv;
2892 cx->blk_format.retop = retop;
2893 cx->blk_format.gv = gv;
2894 cx->blk_format.dfoutgv = PL_defoutgv;
2895 cx->blk_format.prevcomppad = PL_comppad;
2898 SvREFCNT_inc_simple_void_NN(cv);
2900 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2904 PERL_STATIC_INLINE void
2905 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2910 PERL_ARGS_ASSERT_CX_POPFORMAT;
2911 assert(CxTYPE(cx) == CXt_FORMAT);
2913 dfout = cx->blk_format.dfoutgv;
2915 cx->blk_format.dfoutgv = NULL;
2916 SvREFCNT_dec_NN(dfout);
2918 PL_comppad = cx->blk_format.prevcomppad;
2919 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2920 cv = cx->blk_format.cv;
2921 cx->blk_format.cv = NULL;
2923 SvREFCNT_dec_NN(cv);
2924 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2928 PERL_STATIC_INLINE void
2929 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2931 cx->blk_eval.retop = retop;
2932 cx->blk_eval.old_namesv = namesv;
2933 cx->blk_eval.old_eval_root = PL_eval_root;
2934 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2935 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2936 cx->blk_eval.cur_top_env = PL_top_env;
2938 assert(!(PL_in_eval & ~ 0x3F));
2939 assert(!(PL_op->op_type & ~0x1FF));
2940 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2943 PERL_STATIC_INLINE void
2944 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2946 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2948 Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2950 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2951 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2954 PERL_STATIC_INLINE void
2955 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2957 PERL_ARGS_ASSERT_CX_PUSHTRY;
2959 Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2961 /* Don't actually change it, just store the current value so it's restored
2962 * by the common popeval */
2963 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2967 PERL_STATIC_INLINE void
2968 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2972 PERL_ARGS_ASSERT_CX_POPEVAL;
2973 assert(CxTYPE(cx) == CXt_EVAL);
2975 PL_in_eval = CxOLD_IN_EVAL(cx);
2976 assert(!(PL_in_eval & 0xc0));
2977 PL_eval_root = cx->blk_eval.old_eval_root;
2978 sv = cx->blk_eval.cur_text;
2979 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2980 cx->blk_eval.cur_text = NULL;
2981 SvREFCNT_dec_NN(sv);
2984 sv = cx->blk_eval.old_namesv;
2986 cx->blk_eval.old_namesv = NULL;
2987 SvREFCNT_dec_NN(sv);
2989 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2993 /* push a plain loop, i.e.
2995 * while (cond) { block }
2996 * for (init;cond;continue) { block }
2997 * This loop can be last/redo'ed etc.
3000 PERL_STATIC_INLINE void
3001 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3003 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3004 cx->blk_loop.my_op = cLOOP;
3008 /* push a true for loop, i.e.
3009 * for var (list) { block }
3012 PERL_STATIC_INLINE void
3013 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3015 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3017 /* this one line is common with cx_pushloop_plain */
3018 cx->blk_loop.my_op = cLOOP;
3020 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3021 cx->blk_loop.itersave = itersave;
3023 cx->blk_loop.oldcomppad = PL_comppad;
3028 /* pop all loop types, including plain */
3030 PERL_STATIC_INLINE void
3031 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3033 PERL_ARGS_ASSERT_CX_POPLOOP;
3035 assert(CxTYPE_is_LOOP(cx));
3036 if ( CxTYPE(cx) == CXt_LOOP_ARY
3037 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3039 /* Free ary or cur. This assumes that state_u.ary.ary
3040 * aligns with state_u.lazysv.cur. See cx_dup() */
3041 SV *sv = cx->blk_loop.state_u.lazysv.cur;
3042 cx->blk_loop.state_u.lazysv.cur = NULL;
3043 SvREFCNT_dec_NN(sv);
3044 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3045 sv = cx->blk_loop.state_u.lazysv.end;
3046 cx->blk_loop.state_u.lazysv.end = NULL;
3047 SvREFCNT_dec_NN(sv);
3050 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3052 SV **svp = (cx)->blk_loop.itervar_u.svp;
3053 if ((cx->cx_type & CXp_FOR_GV))
3054 svp = &GvSV((GV*)svp);
3056 *svp = cx->blk_loop.itersave;
3057 cx->blk_loop.itersave = NULL;
3058 SvREFCNT_dec(cursv);
3060 if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF))
3061 SvREFCNT_dec(cx->blk_loop.itervar_u.svp);
3065 PERL_STATIC_INLINE void
3066 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3068 PERL_ARGS_ASSERT_CX_PUSHWHEN;
3070 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3074 PERL_STATIC_INLINE void
3075 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3077 PERL_ARGS_ASSERT_CX_POPWHEN;
3078 assert(CxTYPE(cx) == CXt_WHEN);
3080 PERL_UNUSED_ARG(cx);
3081 PERL_UNUSED_CONTEXT;
3082 /* currently NOOP */
3086 PERL_STATIC_INLINE void
3087 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3089 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3091 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3092 cx->blk_givwhen.defsv_save = orig_defsv;
3096 PERL_STATIC_INLINE void
3097 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3101 PERL_ARGS_ASSERT_CX_POPGIVEN;
3102 assert(CxTYPE(cx) == CXt_GIVEN);
3104 sv = GvSV(PL_defgv);
3105 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3106 cx->blk_givwhen.defsv_save = NULL;
3111 =for apidoc newPADxVOP
3113 Constructs, checks and returns an op containing a pad offset. C<type> is
3114 the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV>
3115 or C<OP_PADCV>. The returned op will have the C<op_targ> field set by
3116 the C<padix> argument.
3118 This is convenient when constructing a large optree in nested function
3119 calls, as it avoids needing to store the pad op directly to set the
3120 C<op_targ> field as a side-effect. For example
3122 o = op_append_elem(OP_LINESEQ, o,
3123 newPADxVOP(OP_PADSV, 0, padix));
3128 PERL_STATIC_INLINE OP *
3129 Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix)
3131 PERL_ARGS_ASSERT_NEWPADXVOP;
3133 assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV
3134 || type == OP_PADCV);
3135 OP *o = newOP(type, flags);
3140 /* ------------------ util.h ------------------------------------------- */
3143 =for apidoc_section $string
3147 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3149 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
3150 match themselves and their opposite case counterparts. Non-cased and non-ASCII
3151 range bytes match only themselves.
3156 PERL_STATIC_INLINE I32
3157 Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
3159 const U8 *a = (const U8 *)s1;
3160 const U8 *b = (const U8 *)s2;
3162 PERL_ARGS_ASSERT_FOLDEQ;
3167 if (*a != *b && *a != PL_fold[*b])
3174 PERL_STATIC_INLINE I32
3175 Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
3177 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
3178 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3179 * does not check for this. Nor does it check that the strings each have
3180 * at least 'len' characters. */
3182 const U8 *a = (const U8 *)s1;
3183 const U8 *b = (const U8 *)s2;
3185 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3190 if (*a != *b && *a != PL_fold_latin1[*b]) {
3199 =for apidoc_section $locale
3200 =for apidoc foldEQ_locale
3202 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3203 same case-insensitively in the current locale; false otherwise.
3208 PERL_STATIC_INLINE I32
3209 Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
3211 const U8 *a = (const U8 *)s1;
3212 const U8 *b = (const U8 *)s2;
3214 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3219 if (*a != *b && *a != PL_fold_locale[*b]) {
3220 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3221 "%s:%d: Our records indicate %02x is not a fold of %02x"
3222 " or its mate %02x\n",
3223 __FILE__, __LINE__, *a, *b, PL_fold_locale[*b]));
3233 =for apidoc_section $string
3234 =for apidoc my_strnlen
3236 The C library C<strnlen> if available, or a Perl implementation of it.
3238 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3239 characters. It will never attempt to address more than C<maxlen>
3240 characters, making it suitable for use with strings that are not
3241 guaranteed to be NUL-terminated.
3245 Description stolen from http://man.openbsd.org/strnlen.3,
3246 implementation stolen from PostgreSQL.
3250 PERL_STATIC_INLINE Size_t
3251 Perl_my_strnlen(const char *str, Size_t maxlen)
3253 const char *end = (char *) memchr(str, '\0', maxlen);
3255 PERL_ARGS_ASSERT_MY_STRNLEN;
3257 if (end == NULL) return maxlen;
3263 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3265 PERL_STATIC_INLINE void *
3266 S_my_memrchr(const char * s, const char c, const STRLEN len)
3268 /* memrchr(), since many platforms lack it */
3270 const char * t = s + len - 1;
3272 PERL_ARGS_ASSERT_MY_MEMRCHR;
3286 PERL_STATIC_INLINE char *
3287 Perl_mortal_getenv(const char * str)
3289 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3291 * It's (mostly) thread-safe because it uses a mutex to prevent other
3292 * threads (that look at this mutex) from destroying the result before this
3293 * routine has a chance to copy the result to a place that won't be
3294 * destroyed before the caller gets a chance to handle it. That place is a
3295 * mortal SV. khw chose this over SAVEFREEPV because he is under the
3296 * impression that the SV will hang around longer under more circumstances
3298 * The reason it isn't completely thread-safe is that other code could
3299 * simply not pay attention to the mutex. All of the Perl core uses the
3300 * mutex, but it is possible for code from, say XS, to not use this mutex,
3301 * defeating the safety.
3303 * getenv() returns, in some implementations, a pointer to a spot in the
3304 * **environ array, which could be invalidated at any time by this or
3305 * another thread changing the environment. Other implementations copy the
3306 * **environ value to a static buffer, returning a pointer to that. That
3307 * buffer might or might not be invalidated by a getenv() call in another
3308 * thread. If it does get zapped, we need an exclusive lock. Otherwise,
3309 * many getenv() calls can safely be running simultaneously, so a
3310 * many-reader (but no simultaneous writers) lock is ok. There is a
3311 * Configure probe to see if another thread destroys the buffer, and the
3312 * mutex is defined accordingly.
3314 * But in all cases, using the mutex prevents these problems, as long as
3315 * all code uses the same mutex.
3317 * A complication is that this can be called during phases where the
3318 * mortalization process isn't available. These are in interpreter
3319 * destruction or early in construction. khw believes that at these times
3320 * there shouldn't be anything else going on, so plain getenv is safe AS
3321 * LONG AS the caller acts on the return before calling it again. */
3326 PERL_ARGS_ASSERT_MORTAL_GETENV;
3328 /* Can't mortalize without stacks. khw believes that no other threads
3329 * should be running, so no need to lock things, and this may be during a
3330 * phase when locking isn't even available */
3331 if (UNLIKELY(PL_scopestack_ix == 0)) {
3337 /* A major complication arises under PERL_MEM_LOG. When that is active,
3338 * every memory allocation may result in logging, depending on the value of
3339 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
3340 * saving ENV{foo}'s value (but before saving it), the logging code will
3341 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
3342 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3343 * lock a boolean mutex recursively); 3) destroying the getenv() static
3344 * buffer; or 4) destroying the temporary created by this for the copy
3345 * causes a log entry to be made which could cause a new temporary to be
3346 * created, which will need to be destroyed at some point, leading to an
3349 * The solution adopted here (after some gnashing of teeth) is to detect
3350 * the recursive calls and calls from the logger, and treat them specially.
3351 * Let's say we want to do getenv("foo"). We first find
3352 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3353 * variable, so no temporary is required. Then we do getenv(foo}, and in
3354 * the process of creating a temporary to save it, this function will be
3355 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
3356 * we detect that it is such a call and return our saved value instead of
3357 * locking and doing a new getenv(). This solves all of problems 1), 2),
3358 * and 3). Because all the getenv()s are done while the mutex is locked,
3359 * the state cannot have changed. To solve 4), we don't create a temporary
3360 * when this is called from the logging code. That code disposes of the
3361 * return value while the mutex is still locked.
3363 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3364 * digits and 3 particular letters are significant; the rest are ignored by
3365 * the memory logging code. Thus the per-interpreter variable only needs
3366 * to be large enough to save the significant information, the size of
3367 * which is known at compile time. The first byte is extra, reserved for
3368 * flags for our use. To protect against overflowing, only the reserved
3369 * byte, as many digits as don't overflow, and the three letters are
3372 * The reserved byte has two bits:
3373 * 0x1 if set indicates that if we get here, it is a recursive call of
3375 * 0x2 if set indicates that the call is from the logging code.
3377 * If the flag indicates this is a recursive call, just return the stored
3378 * value of PL_mem_log; An empty value gets turned into NULL. */
3379 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3380 if (PL_mem_log[1] == '\0') {
3383 return PL_mem_log + 1;
3393 /* Here we are in a critical section. As explained above, we do our own
3394 * getenv(PERL_MEM_LOG), saving the result safely. */
3395 ret = getenv("PERL_MEM_LOG");
3396 if (ret == NULL) { /* No logging active */
3398 /* Return that immediately if called from the logging code */
3399 if (PL_mem_log[0] & 0x2) {
3404 PL_mem_log[1] = '\0';
3407 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
3409 /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3410 * extremely long string. But we want only a few characters from it.
3411 * PL_mem_log has been made large enough to hold just the ones we need.
3412 * First the file descriptor. */
3413 if (isDIGIT(*ret)) {
3414 const char * s = ret;
3415 if (UNLIKELY(*s == '0')) {
3417 /* Reduce multiple leading zeros to a single one. This is to
3418 * allow the caller to change what to do with leading zeros. */
3419 *mem_log_meat++ = '0';
3426 /* If the input overflows, copy just enough for the result to also
3427 * overflow, plus 1 to make sure */
3428 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3429 *mem_log_meat++ = *s++;
3433 /* Then each of the four significant characters */
3434 if (strchr(ret, 'm')) {
3435 *mem_log_meat++ = 'm';
3437 if (strchr(ret, 's')) {
3438 *mem_log_meat++ = 's';
3440 if (strchr(ret, 't')) {
3441 *mem_log_meat++ = 't';
3443 if (strchr(ret, 'c')) {
3444 *mem_log_meat++ = 'c';
3446 *mem_log_meat = '\0';
3448 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3451 /* If we are being called from the logger, it only needs the significant
3452 * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3453 if (PL_mem_log[0] & 0x2) {
3454 assert(strEQ(str, "PERL_MEM_LOG"));
3456 return PL_mem_log + 1;
3459 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
3460 * is coming from other than the logging code, so it should be treated the
3461 * same as any other getenv(), returning the full value, not just the
3462 * significant part, and having its value saved. Set the flag that
3463 * indicates any call to this routine will be a recursion from here */
3464 PL_mem_log[0] = 0x1;
3468 /* Now get the value of the real desired variable, and save a copy */
3472 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3479 /* Clear the buffer */
3480 Zero(PL_mem_log, sizeof(PL_mem_log), char);
3487 PERL_STATIC_INLINE bool
3488 Perl_sv_isbool(pTHX_ const SV *sv)
3490 return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv);
3495 PERL_STATIC_INLINE AV *
3496 Perl_cop_file_avn(pTHX_ const COP *cop) {
3498 PERL_ARGS_ASSERT_COP_FILE_AVN;
3500 const char *file = CopFILE(cop);
3502 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3515 PERL_STATIC_INLINE PADNAME *
3516 Perl_padname_refcnt_inc(PADNAME *pn)
3518 PadnameREFCNT(pn)++;
3522 PERL_STATIC_INLINE PADNAMELIST *
3523 Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl)
3525 PadnamelistREFCNT(pnl)++;
3529 /* copy a string to a safe spot */
3532 =for apidoc_section $string
3535 Perl's version of C<strdup()>. Returns a pointer to a newly allocated
3536 string which is a duplicate of C<pv>. The size of the string is
3537 determined by C<strlen()>, which means it may not contain embedded C<NUL>
3538 characters and must have a trailing C<NUL>. To prevent memory leaks, the
3539 memory allocated for the new string needs to be freed when no longer needed.
3540 This can be done with the C<L</Safefree>> function, or
3541 L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>.
3543 On some platforms, Windows for example, all allocated memory owned by a thread
3544 is deallocated when that thread ends. So if you need that not to happen, you
3545 need to use the shared memory functions, such as C<L</savesharedpv>>.
3550 PERL_STATIC_INLINE char *
3551 Perl_savepv(pTHX_ const char *pv)
3553 PERL_UNUSED_CONTEXT;
3558 const STRLEN pvlen = strlen(pv)+1;
3559 Newx(newaddr, pvlen, char);
3560 return (char*)memcpy(newaddr, pv, pvlen);
3564 /* same thing but with a known length */
3569 Perl's version of what C<strndup()> would be if it existed. Returns a
3570 pointer to a newly allocated string which is a duplicate of the first
3571 C<len> bytes from C<pv>, plus a trailing
3572 C<NUL> byte. The memory allocated for
3573 the new string can be freed with the C<Safefree()> function.
3575 On some platforms, Windows for example, all allocated memory owned by a thread
3576 is deallocated when that thread ends. So if you need that not to happen, you
3577 need to use the shared memory functions, such as C<L</savesharedpvn>>.
3582 PERL_STATIC_INLINE char *
3583 Perl_savepvn(pTHX_ const char *pv, Size_t len)
3586 PERL_UNUSED_CONTEXT;
3588 Newx(newaddr,len+1,char);
3589 /* Give a meaning to NULL pointer mainly for the use in sv_magic() */
3591 /* might not be null terminated */
3592 newaddr[len] = '\0';
3593 return (char *) CopyD(pv,newaddr,len,char);
3596 return (char *) ZeroD(newaddr,len+1,char);
3601 =for apidoc savesvpv
3603 A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from
3604 the passed in SV using C<SvPV()>
3606 On some platforms, Windows for example, all allocated memory owned by a thread
3607 is deallocated when that thread ends. So if you need that not to happen, you
3608 need to use the shared memory functions, such as C<L</savesharedsvpv>>.
3613 PERL_STATIC_INLINE char *
3614 Perl_savesvpv(pTHX_ SV *sv)
3617 const char * const pv = SvPV_const(sv, len);
3620 PERL_ARGS_ASSERT_SAVESVPV;
3623 Newx(newaddr,len,char);
3624 return (char *) CopyD(pv,newaddr,len,char);
3628 =for apidoc savesharedsvpv
3630 A version of C<savesharedpv()> which allocates the duplicate string in
3631 memory which is shared between threads.
3636 PERL_STATIC_INLINE char *
3637 Perl_savesharedsvpv(pTHX_ SV *sv)
3640 const char * const pv = SvPV_const(sv, len);
3642 PERL_ARGS_ASSERT_SAVESHAREDSVPV;
3644 return savesharedpvn(pv, len);
3647 #ifndef PERL_GET_CONTEXT_DEFINED
3650 =for apidoc_section $embedding
3651 =for apidoc get_context
3653 Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead.
3658 PERL_STATIC_INLINE void *
3659 Perl_get_context(void)
3661 # if defined(USE_ITHREADS)
3662 # ifdef OLD_PTHREADS_API
3664 int error = pthread_getspecific(PL_thr_key, &t);
3666 Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error);
3668 # elif defined(I_MACH_CTHREADS)
3669 return (void*)cthread_data(cthread_self());
3671 return (void*)PTHREAD_GETSPECIFIC(PL_thr_key);
3680 PERL_STATIC_INLINE MGVTBL*
3681 Perl_get_vtbl(pTHX_ int vtbl_id)
3683 PERL_UNUSED_CONTEXT;
3685 return (vtbl_id < 0 || vtbl_id >= magic_vtable_max)
3686 ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id;
3690 =for apidoc my_strlcat
3692 The C library C<strlcat> if available, or a Perl implementation of it.
3693 This operates on C C<NUL>-terminated strings.
3695 C<my_strlcat()> appends string C<src> to the end of C<dst>. It will append at
3696 most S<C<size - strlen(dst) - 1>> characters. It will then C<NUL>-terminate,
3697 unless C<size> is 0 or the original C<dst> string was longer than C<size> (in
3698 practice this should not happen as it means that either C<size> is incorrect or
3699 that C<dst> is not a proper C<NUL>-terminated string).
3701 Note that C<size> is the full size of the destination buffer and
3702 the result is guaranteed to be C<NUL>-terminated if there is room. Note that
3703 room for the C<NUL> should be included in C<size>.
3705 The return value is the total length that C<dst> would have if C<size> is
3706 sufficiently large. Thus it is the initial length of C<dst> plus the length of
3707 C<src>. If C<size> is smaller than the return, the excess was not appended.
3711 Description stolen from http://man.openbsd.org/strlcat.3
3714 PERL_STATIC_INLINE Size_t
3715 Perl_my_strlcat(char *dst, const char *src, Size_t size)
3717 Size_t used, length, copy;
3720 length = strlen(src);
3721 if (size > 0 && used < size - 1) {
3722 copy = (length >= size - used) ? size - used - 1 : length;
3723 memcpy(dst + used, src, copy);
3724 dst[used + copy] = '\0';
3726 return used + length;
3732 =for apidoc my_strlcpy
3734 The C library C<strlcpy> if available, or a Perl implementation of it.
3735 This operates on C C<NUL>-terminated strings.
3737 C<my_strlcpy()> copies up to S<C<size - 1>> characters from the string C<src>
3738 to C<dst>, C<NUL>-terminating the result if C<size> is not 0.
3740 The return value is the total length C<src> would be if the copy completely
3741 succeeded. If it is larger than C<size>, the excess was not copied.
3745 Description stolen from http://man.openbsd.org/strlcpy.3
3748 PERL_STATIC_INLINE Size_t
3749 Perl_my_strlcpy(char *dst, const char *src, Size_t size)
3751 Size_t length, copy;
3753 length = strlen(src);
3755 copy = (length >= size) ? size - 1 : length;
3756 memcpy(dst, src, copy);
3764 * ex: set ts=8 sts=4 sw=4 et: