3 * Copyright (C) 2012 by Larry Wall and others
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
8 * This file contains tables and code adapted from
9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
37 * Each section names the header file that the functions "belong" to.
40 /* ------------------------------- av.h ------------------------------- */
43 =for apidoc_section $AV
45 Returns the number of elements in the array C<av>. This is the true length of
46 the array, including any undefined elements. It is always the same as
47 S<C<av_top_index(av) + 1>>.
51 PERL_STATIC_INLINE Size_t
52 Perl_av_count(pTHX_ AV *av)
54 PERL_ARGS_ASSERT_AV_COUNT;
55 assert(SvTYPE(av) == SVt_PVAV);
57 return AvFILL(av) + 1;
60 /* ------------------------------- av.c ------------------------------- */
63 =for apidoc av_store_simple
65 This is a cut-down version of av_store that assumes that the array is
66 very straightforward - no magic, not readonly, and AvREAL - and that
67 C<key> is not negative. This function MUST NOT be used in situations
68 where any of those assumptions may not hold.
70 Stores an SV in an array. The array index is specified as C<key>. It
71 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
73 Note that the caller is responsible for suitably incrementing the reference
74 count of C<val> before the call.
76 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81 PERL_STATIC_INLINE SV**
82 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
86 PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
87 assert(SvTYPE(av) == SVt_PVAV);
88 assert(!SvMAGICAL(av));
89 assert(!SvREADONLY(av));
95 if (AvFILLp(av) < key) {
96 if (key > AvMAX(av)) {
102 SvREFCNT_dec(ary[key]);
109 =for apidoc av_fetch_simple
111 This is a cut-down version of av_fetch that assumes that the array is
112 very straightforward - no magic, not readonly, and AvREAL - and that
113 C<key> is not negative. This function MUST NOT be used in situations
114 where any of those assumptions may not hold.
116 Returns the SV at the specified index in the array. The C<key> is the
117 index. If lval is true, you are guaranteed to get a real SV back (in case
118 it wasn't real before), which you can then modify. Check that the return
119 value is non-null before dereferencing it to a C<SV*>.
121 The rough perl equivalent is C<$myarray[$key]>.
126 PERL_STATIC_INLINE SV**
127 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
129 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
130 assert(SvTYPE(av) == SVt_PVAV);
131 assert(!SvMAGICAL(av));
132 assert(!SvREADONLY(av));
136 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
137 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
139 return &AvARRAY(av)[key];
144 =for apidoc av_push_simple
146 This is a cut-down version of av_push that assumes that the array is very
147 straightforward - no magic, not readonly, and AvREAL - and that C<key> is
148 not less than -1. This function MUST NOT be used in situations where any
149 of those assumptions may not hold.
151 Pushes an SV (transferring control of one reference count) onto the end of the
152 array. The array will grow automatically to accommodate the addition.
154 Perl equivalent: C<push @myarray, $val;>.
159 PERL_STATIC_INLINE void
160 Perl_av_push_simple(pTHX_ AV *av, SV *val)
162 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
163 assert(SvTYPE(av) == SVt_PVAV);
164 assert(!SvMAGICAL(av));
165 assert(!SvREADONLY(av));
167 assert(AvFILLp(av) > -2);
169 (void)av_store_simple(av,AvFILLp(av)+1,val);
172 /* ------------------------------- cv.h ------------------------------- */
175 =for apidoc_section $CV
177 Returns the GV associated with the CV C<sv>, reifying it if necessary.
181 PERL_STATIC_INLINE GV *
182 Perl_CvGV(pTHX_ CV *sv)
184 PERL_ARGS_ASSERT_CVGV;
187 ? Perl_cvgv_from_hek(aTHX_ sv)
188 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
193 Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a
198 PERL_STATIC_INLINE I32 *
199 Perl_CvDEPTH(const CV * const sv)
201 PERL_ARGS_ASSERT_CVDEPTH;
202 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
204 return &((XPVCV*)SvANY(sv))->xcv_depth;
208 CvPROTO returns the prototype as stored, which is not necessarily what
209 the interpreter should be using. Specifically, the interpreter assumes
210 that spaces have been stripped, which has been the case if the prototype
211 was added by toke.c, but is generally not the case if it was added elsewhere.
212 Since we can't enforce the spacelessness at assignment time, this routine
213 provides a temporary copy at parse time with spaces removed.
214 I<orig> is the start of the original buffer, I<len> is the length of the
215 prototype and will be updated when this returns.
219 PERL_STATIC_INLINE char *
220 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
224 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
232 *len = tmps - SvPVX(tmpsv);
237 /* ------------------------------- mg.h ------------------------------- */
239 #if defined(PERL_CORE) || defined(PERL_EXT)
240 /* assumes get-magic and stringification have already occurred */
241 PERL_STATIC_INLINE STRLEN
242 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
244 assert(mg->mg_type == PERL_MAGIC_regex_global);
245 assert(mg->mg_len != -1);
246 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
247 return (STRLEN)mg->mg_len;
249 const STRLEN pos = (STRLEN)mg->mg_len;
250 /* Without this check, we may read past the end of the buffer: */
251 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
252 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
257 /* ------------------------------- pad.h ------------------------------ */
259 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
260 PERL_STATIC_INLINE bool
261 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
263 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
265 /* is seq within the range _LOW to _HIGH ?
266 * This is complicated by the fact that PL_cop_seqmax
267 * may have wrapped around at some point */
268 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
269 return FALSE; /* not yet introduced */
271 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
272 /* in compiling scope */
274 (seq > COP_SEQ_RANGE_LOW(pn))
275 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
276 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
281 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
283 ( seq > COP_SEQ_RANGE_LOW(pn)
284 || seq <= COP_SEQ_RANGE_HIGH(pn))
286 : ( seq > COP_SEQ_RANGE_LOW(pn)
287 && seq <= COP_SEQ_RANGE_HIGH(pn))
294 /* ------------------------------- pp.h ------------------------------- */
296 PERL_STATIC_INLINE I32
299 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
300 "MARK top %p %" IVdf "\n",
302 (IV)*PL_markstack_ptr)));
303 return *PL_markstack_ptr;
306 PERL_STATIC_INLINE I32
309 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
310 "MARK pop %p %" IVdf "\n",
311 (PL_markstack_ptr-1),
312 (IV)*(PL_markstack_ptr-1))));
313 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
314 return *PL_markstack_ptr--;
317 /* ----------------------------- regexp.h ----------------------------- */
319 /* PVLVs need to act as a superset of all scalar types - they are basically
320 * PVMGs with a few extra fields.
321 * REGEXPs are first class scalars, but have many fields that can't be copied
324 * Hence we take a different approach - instead of a copy, PVLVs store a pointer
325 * back to the original body. To avoid increasing the size of PVLVs just for the
326 * rare case of REGEXP assignment, this pointer is stored in the memory usually
327 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
328 * read the pointer from the two possible locations. The macro SvLEN() wraps the
329 * access to the union's member xpvlenu_len, but there is no equivalent macro
330 * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
332 * See commit df6b4bd56551f2d3 for more details. */
334 PERL_STATIC_INLINE struct regexp *
335 Perl_ReANY(const REGEXP * const re)
337 XPV* const p = (XPV*)SvANY(re);
339 PERL_ARGS_ASSERT_REANY;
340 assert(isREGEXP(re));
342 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
343 : (struct regexp *)p;
346 /* ------------------------------- utf8.h ------------------------------- */
349 =for apidoc_section $unicode
352 PERL_STATIC_INLINE void
353 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
355 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
356 * encoded string at '*dest', updating '*dest' to include it */
358 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
360 if (NATIVE_BYTE_IS_INVARIANT(byte))
363 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
364 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
369 =for apidoc valid_utf8_to_uvchr
370 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
371 known that the next character in the input UTF-8 string C<s> is well-formed
372 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
373 points, and non-Unicode code points are allowed.
379 PERL_STATIC_INLINE UV
380 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
382 const UV expectlen = UTF8SKIP(s);
383 const U8* send = s + expectlen;
386 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
392 /* An invariant is trivially returned */
393 if (expectlen == 1) {
397 /* Remove the leading bits that indicate the number of bytes, leaving just
398 * the bits that are part of the value */
399 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
401 /* Now, loop through the remaining bytes, accumulating each into the
402 * working total as we go. (I khw tried unrolling the loop for up to 4
403 * bytes, but there was no performance improvement) */
404 for (++s; s < send; s++) {
405 uv = UTF8_ACCUMULATE(uv, *s);
408 return UNI_TO_NATIVE(uv);
413 =for apidoc is_utf8_invariant_string
415 Returns TRUE if the first C<len> bytes of the string C<s> are the same
416 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
417 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
418 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
419 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
420 characters are invariant, but so also are the C1 controls.
422 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
423 use this option, that C<s> can't have embedded C<NUL> characters and has to
424 have a terminating C<NUL> byte).
427 C<L</is_utf8_string>>,
428 C<L</is_utf8_string_flags>>,
429 C<L</is_utf8_string_loc>>,
430 C<L</is_utf8_string_loc_flags>>,
431 C<L</is_utf8_string_loclen>>,
432 C<L</is_utf8_string_loclen_flags>>,
433 C<L</is_utf8_fixed_width_buf_flags>>,
434 C<L</is_utf8_fixed_width_buf_loc_flags>>,
435 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
436 C<L</is_strict_utf8_string>>,
437 C<L</is_strict_utf8_string_loc>>,
438 C<L</is_strict_utf8_string_loclen>>,
439 C<L</is_c9strict_utf8_string>>,
440 C<L</is_c9strict_utf8_string_loc>>,
442 C<L</is_c9strict_utf8_string_loclen>>.
448 #define is_utf8_invariant_string(s, len) \
449 is_utf8_invariant_string_loc(s, len, NULL)
452 =for apidoc is_utf8_invariant_string_loc
454 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
455 the first UTF-8 variant character in the C<ep> pointer; if all characters are
456 UTF-8 invariant, this function does not change the contents of C<*ep>.
462 PERL_STATIC_INLINE bool
463 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
468 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
471 len = strlen((const char *)s);
476 /* This looks like 0x010101... */
477 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
479 /* This looks like 0x808080... */
480 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
481 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
482 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
484 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
485 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
486 * optimized out completely on a 32-bit system, and its mask gets optimized out
487 * on a 64-bit system */
488 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
489 | ( PTR2nat(x) >> 1) \
491 & PERL_WORD_BOUNDARY_MASK) >> 2))))
495 /* Do the word-at-a-time iff there is at least one usable full word. That
496 * means that after advancing to a word boundary, there still is at least a
497 * full word left. The number of bytes needed to advance is 'wordsize -
498 * offset' unless offset is 0. */
499 if ((STRLEN) (send - x) >= PERL_WORDSIZE
501 /* This term is wordsize if subword; 0 if not */
502 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
505 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
508 /* Process per-byte until reach word boundary. XXX This loop could be
509 * eliminated if we knew that this platform had fast unaligned reads */
510 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
511 if (! UTF8_IS_INVARIANT(*x)) {
521 /* Here, we know we have at least one full word to process. Process
522 * per-word as long as we have at least a full word left */
524 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
526 /* Found a variant. Just return if caller doesn't want its
532 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
533 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
535 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
536 assert(*ep >= s && *ep < send);
540 # else /* If weird byte order, drop into next loop to do byte-at-a-time
549 } while (x + PERL_WORDSIZE <= send);
552 #endif /* End of ! EBCDIC */
554 /* Process per-byte */
556 if (! UTF8_IS_INVARIANT(*x)) {
570 /* See if the platform has builtins for finding the most/least significant bit,
571 * and which one is right for using on 32 and 64 bit operands */
572 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
573 # if U32SIZE == INTSIZE
574 # define PERL_CLZ_32 __builtin_clz
576 # if defined(U64TYPE) && U64SIZE == INTSIZE
577 # define PERL_CLZ_64 __builtin_clz
580 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
581 # if U32SIZE == INTSIZE
582 # define PERL_CTZ_32 __builtin_ctz
584 # if defined(U64TYPE) && U64SIZE == INTSIZE
585 # define PERL_CTZ_64 __builtin_ctz
589 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
590 # if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
591 # define PERL_CLZ_32 __builtin_clzl
593 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
594 # define PERL_CLZ_64 __builtin_clzl
597 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
598 # if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
599 # define PERL_CTZ_32 __builtin_ctzl
601 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
602 # define PERL_CTZ_64 __builtin_ctzl
606 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
607 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
608 # define PERL_CLZ_32 __builtin_clzll
610 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
611 # define PERL_CLZ_64 __builtin_clzll
614 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
615 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
616 # define PERL_CTZ_32 __builtin_ctzll
618 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
619 # define PERL_CTZ_64 __builtin_ctzll
623 #if defined(_MSC_VER)
625 # pragma intrinsic(_BitScanForward)
626 # pragma intrinsic(_BitScanReverse)
628 # pragma intrinsic(_BitScanForward64)
629 # pragma intrinsic(_BitScanReverse64)
633 /* The reason there are not checks to see if ffs() and ffsl() are available for
634 * determining the lsb, is because these don't improve on the deBruijn method
635 * fallback, which is just a branchless integer multiply, array element
636 * retrieval, and shift. The others, even if the function call overhead is
637 * optimized out, have to cope with the possibility of the input being all
638 * zeroes, and almost certainly will have conditionals for this eventuality.
639 * khw, at the time of this commit, looked at the source for both gcc and clang
640 * to verify this. (gcc used a method inferior to deBruijn.) */
642 /* Below are functions to find the first, last, or only set bit in a word. On
643 * platforms with 64-bit capability, there is a pair for each operation; the
644 * first taking a 64 bit operand, and the second a 32 bit one. The logic is
645 * the same in each pair, so the second is stripped of most comments. */
647 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
649 PERL_STATIC_INLINE unsigned
650 Perl_lsbit_pos64(U64 word)
652 /* Find the position (0..63) of the least significant set bit in the input
657 /* If we can determine that the platform has a usable fast method to get
658 * this info, use that */
660 # if defined(PERL_CTZ_64)
661 # define PERL_HAS_FAST_GET_LSB_POS64
663 return (unsigned) PERL_CTZ_64(word);
665 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
666 # define PERL_HAS_FAST_GET_LSB_POS64
670 _BitScanForward64(&index, word);
671 return (unsigned)index;
676 /* Here, we didn't find a fast method for finding the lsb. Fall back to
677 * making the lsb the only set bit in the word, and use our function that
678 * works on words with a single bit set.
681 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
683 * The word will look like this, with a rightmost set bit in position 's':
684 * ('x's are don't cares, and 'y's are their complements)
687 * y..y011..11 Complement
689 * 0..0100..00 And with the original
691 * (Yes, complementing and adding 1 is just taking the negative on 2's
692 * complement machines, but not on 1's complement ones, and some compilers
693 * complain about negating an unsigned.)
695 return single_1bit_pos64(word & (~word + 1));
701 # define lsbit_pos_uintmax_(word) lsbit_pos64(word)
703 # define lsbit_pos_uintmax_(word) lsbit_pos32(word)
706 PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */
707 Perl_lsbit_pos32(U32 word)
709 /* Find the position (0..31) of the least significant set bit in the input
714 #if defined(PERL_CTZ_32)
715 # define PERL_HAS_FAST_GET_LSB_POS32
717 return (unsigned) PERL_CTZ_32(word);
719 #elif U32SIZE == 4 && defined(_MSC_VER)
720 # define PERL_HAS_FAST_GET_LSB_POS32
724 _BitScanForward(&index, word);
725 return (unsigned)index;
730 return single_1bit_pos32(word & (~word + 1));
737 /* Convert the leading zeros count to the bit position of the first set bit.
738 * This just subtracts from the highest position, 31 or 63. But some compilers
739 * don't optimize this optimally, and so a bit of bit twiddling encourages them
740 * to do the right thing. It turns out that subtracting a smaller non-negative
741 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
742 * the two numbers. To see why, first note that the sum of any number, x, and
743 * its complement, x', is all ones. So all ones minus x is x'. Then note that
744 * the xor of x and all ones is x'. */
745 #define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc))
747 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
749 PERL_STATIC_INLINE unsigned
750 Perl_msbit_pos64(U64 word)
752 /* Find the position (0..63) of the most significant set bit in the input
757 /* If we can determine that the platform has a usable fast method to get
760 # if defined(PERL_CLZ_64)
761 # define PERL_HAS_FAST_GET_MSB_POS64
763 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
765 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
766 # define PERL_HAS_FAST_GET_MSB_POS64
770 _BitScanReverse64(&index, word);
771 return (unsigned)index;
776 /* Here, we didn't find a fast method for finding the msb. Fall back to
777 * making the msb the only set bit in the word, and use our function that
778 * works on words with a single bit set.
780 * Isolate the msb; http://codeforces.com/blog/entry/10330
782 * Only the most significant set bit matters. Or'ing word with its right
783 * shift of 1 makes that bit and the next one to its right both 1.
784 * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
785 * ... We end with the msb and all to the right being 1. */
790 word |= (word >> 16);
791 word |= (word >> 32);
793 /* Then subtracting the right shift by 1 clears all but the left-most of
794 * the 1 bits, which is our desired result */
797 /* Now we have a single bit set */
798 return single_1bit_pos64(word);
804 # define msbit_pos_uintmax_(word) msbit_pos64(word)
806 # define msbit_pos_uintmax_(word) msbit_pos32(word)
809 PERL_STATIC_INLINE unsigned
810 Perl_msbit_pos32(U32 word)
812 /* Find the position (0..31) of the most significant set bit in the input
817 #if defined(PERL_CLZ_32)
818 # define PERL_HAS_FAST_GET_MSB_POS32
820 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
822 #elif U32SIZE == 4 && defined(_MSC_VER)
823 # define PERL_HAS_FAST_GET_MSB_POS32
827 _BitScanReverse(&index, word);
828 return (unsigned)index;
837 word |= (word >> 16);
839 return single_1bit_pos32(word);
845 #if UVSIZE == U64SIZE
846 # define msbit_pos(word) msbit_pos64(word)
847 # define lsbit_pos(word) lsbit_pos64(word)
848 #elif UVSIZE == U32SIZE
849 # define msbit_pos(word) msbit_pos32(word)
850 # define lsbit_pos(word) lsbit_pos32(word)
853 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
855 PERL_STATIC_INLINE unsigned
856 Perl_single_1bit_pos64(U64 word)
858 /* Given a 64-bit word known to contain all zero bits except one 1 bit,
859 * find and return the 1's position: 0..63 */
861 # ifdef PERL_CORE /* macro not exported */
862 ASSUME(isPOWER_OF_2(word));
864 ASSUME(word && (word & (word-1)) == 0);
867 /* The only set bit is both the most and least significant bit. If we have
868 * a fast way of finding either one, use that.
870 * It may appear at first glance that those functions call this one, but
871 * they don't if the corresponding #define is set */
873 # ifdef PERL_HAS_FAST_GET_MSB_POS64
875 return msbit_pos64(word);
877 # elif defined(PERL_HAS_FAST_GET_LSB_POS64)
879 return lsbit_pos64(word);
883 /* The position of the only set bit in a word can be quickly calculated
884 * using deBruijn sequences. See for example
885 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
886 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
887 >> PERL_deBruijnShift64_];
894 PERL_STATIC_INLINE unsigned
895 Perl_single_1bit_pos32(U32 word)
897 /* Given a 32-bit word known to contain all zero bits except one 1 bit,
898 * find and return the 1's position: 0..31 */
900 #ifdef PERL_CORE /* macro not exported */
901 ASSUME(isPOWER_OF_2(word));
903 ASSUME(word && (word & (word-1)) == 0);
905 #ifdef PERL_HAS_FAST_GET_MSB_POS32
907 return msbit_pos32(word);
909 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
911 return lsbit_pos32(word);
913 /* Unlikely, but possible for the platform to have a wider fast operation but
914 * not a narrower one. But easy enough to handle the case by widening the
915 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops
916 * would be slower than the deBruijn method.) */
917 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
919 return msbit_pos64(word);
921 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
923 return lsbit_pos64(word);
927 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
928 >> PERL_deBruijnShift32_];
935 PERL_STATIC_INLINE unsigned int
936 Perl_variant_byte_number(PERL_UINTMAX_T word)
938 /* This returns the position in a word (0..7) of the first variant byte in
939 * it. This is a helper function. Note that there are no branches */
941 /* Get just the msb bits of each byte */
942 word &= PERL_VARIANTS_WORD_MASK;
944 /* This should only be called if we know there is a variant byte in the
948 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
950 /* Bytes are stored like
951 * Byte8 ... Byte2 Byte1
952 * 63..56...15...8 7...0
953 * so getting the lsb of the whole modified word is getting the msb of the
954 * first byte that has its msb set */
955 word = lsbit_pos_uintmax_(word);
957 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert
959 return (unsigned int) ((word + 1) >> 3) - 1;
961 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
963 /* Bytes are stored like
964 * Byte1 Byte2 ... Byte8
965 * 63..56 55..47 ... 7...0
966 * so getting the msb of the whole modified word is getting the msb of the
967 * first byte that has its msb set */
968 word = msbit_pos_uintmax_(word);
970 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert
972 word = ((word + 1) >> 3) - 1;
974 /* And invert the result because of the reversed byte order on this
976 word = CHARBITS - word - 1;
978 return (unsigned int) word;
981 # error Unexpected byte order
987 #if defined(PERL_CORE) || defined(PERL_EXT)
990 =for apidoc variant_under_utf8_count
992 This function looks at the sequence of bytes between C<s> and C<e>, which are
993 assumed to be encoded in ASCII/Latin1, and returns how many of them would
994 change should the string be translated into UTF-8. Due to the nature of UTF-8,
995 each of these would occupy two bytes instead of the single one in the input
996 string. Thus, this function returns the precise number of bytes the string
997 would expand by when translated to UTF-8.
999 Unlike most of the other functions that have C<utf8> in their name, the input
1000 to this function is NOT a UTF-8-encoded string. The function name is slightly
1001 I<odd> to emphasize this.
1003 This function is internal to Perl because khw thinks that any XS code that
1004 would want this is probably operating too close to the internals. Presenting a
1005 valid use case could change that.
1008 C<L<perlapi/is_utf8_invariant_string>>
1010 C<L<perlapi/is_utf8_invariant_string_loc>>,
1016 PERL_STATIC_INLINE Size_t
1017 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1022 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1026 /* Test if the string is long enough to use word-at-a-time. (Logic is the
1027 * same as for is_utf8_invariant_string()) */
1028 if ((STRLEN) (e - x) >= PERL_WORDSIZE
1029 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1030 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1033 /* Process per-byte until reach word boundary. XXX This loop could be
1034 * eliminated if we knew that this platform had fast unaligned reads */
1035 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1036 count += ! UTF8_IS_INVARIANT(*x++);
1039 /* Process per-word as long as we have at least a full word left */
1040 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1041 explanation of how this works */
1042 PERL_UINTMAX_T increment
1043 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1044 * PERL_COUNT_MULTIPLIER)
1045 >> ((PERL_WORDSIZE - 1) * CHARBITS);
1046 count += (Size_t) increment;
1048 } while (x + PERL_WORDSIZE <= e);
1053 /* Process per-byte */
1055 if (! UTF8_IS_INVARIANT(*x)) {
1067 #ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
1068 # undef PERL_WORDSIZE
1069 # undef PERL_COUNT_MULTIPLIER
1070 # undef PERL_WORD_BOUNDARY_MASK
1071 # undef PERL_VARIANTS_WORD_MASK
1075 =for apidoc is_utf8_string
1077 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1078 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
1079 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1080 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1081 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1083 This function considers Perl's extended UTF-8 to be valid. That means that
1084 code points above Unicode, surrogates, and non-character code points are
1085 considered valid by this function. Use C<L</is_strict_utf8_string>>,
1086 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1087 code points are considered valid.
1090 C<L</is_utf8_invariant_string>>,
1091 C<L</is_utf8_invariant_string_loc>>,
1092 C<L</is_utf8_string_loc>>,
1093 C<L</is_utf8_string_loclen>>,
1094 C<L</is_utf8_fixed_width_buf_flags>>,
1095 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1096 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1101 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
1103 #if defined(PERL_CORE) || defined (PERL_EXT)
1106 =for apidoc is_utf8_non_invariant_string
1108 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1109 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1110 UTF-8; otherwise returns FALSE.
1112 A TRUE return means that at least one code point represented by the sequence
1113 either is a wide character not representable as a single byte, or the
1114 representation differs depending on whether the sequence is encoded in UTF-8 or
1118 C<L<perlapi/is_utf8_invariant_string>>,
1119 C<L<perlapi/is_utf8_string>>
1123 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1124 It generally needn't be if its string is entirely UTF-8 invariant, and it
1125 shouldn't be if it otherwise contains invalid UTF-8.
1127 It is an internal function because khw thinks that XS code shouldn't be working
1128 at this low a level. A valid use case could change that.
1132 PERL_STATIC_INLINE bool
1133 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1135 const U8 * first_variant;
1137 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1139 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1143 return is_utf8_string(first_variant, len - (first_variant - s));
1149 =for apidoc is_strict_utf8_string
1151 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1152 UTF-8-encoded string that is fully interchangeable by any application using
1153 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
1154 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1155 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1156 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1158 This function returns FALSE for strings containing any
1159 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1160 non-character code points.
1163 C<L</is_utf8_invariant_string>>,
1164 C<L</is_utf8_invariant_string_loc>>,
1165 C<L</is_utf8_string>>,
1166 C<L</is_utf8_string_flags>>,
1167 C<L</is_utf8_string_loc>>,
1168 C<L</is_utf8_string_loc_flags>>,
1169 C<L</is_utf8_string_loclen>>,
1170 C<L</is_utf8_string_loclen_flags>>,
1171 C<L</is_utf8_fixed_width_buf_flags>>,
1172 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1173 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1174 C<L</is_strict_utf8_string_loc>>,
1175 C<L</is_strict_utf8_string_loclen>>,
1176 C<L</is_c9strict_utf8_string>>,
1177 C<L</is_c9strict_utf8_string_loc>>,
1179 C<L</is_c9strict_utf8_string_loclen>>.
1184 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
1187 =for apidoc is_c9strict_utf8_string
1189 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1190 UTF-8-encoded string that conforms to
1191 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1192 otherwise it returns FALSE. If C<len> is 0, it will be calculated using
1193 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1194 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
1195 characters being ASCII constitute 'a valid UTF-8 string'.
1197 This function returns FALSE for strings containing any code points above the
1198 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1200 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1203 C<L</is_utf8_invariant_string>>,
1204 C<L</is_utf8_invariant_string_loc>>,
1205 C<L</is_utf8_string>>,
1206 C<L</is_utf8_string_flags>>,
1207 C<L</is_utf8_string_loc>>,
1208 C<L</is_utf8_string_loc_flags>>,
1209 C<L</is_utf8_string_loclen>>,
1210 C<L</is_utf8_string_loclen_flags>>,
1211 C<L</is_utf8_fixed_width_buf_flags>>,
1212 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1213 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1214 C<L</is_strict_utf8_string>>,
1215 C<L</is_strict_utf8_string_loc>>,
1216 C<L</is_strict_utf8_string_loclen>>,
1217 C<L</is_c9strict_utf8_string_loc>>,
1219 C<L</is_c9strict_utf8_string_loclen>>.
1224 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1227 =for apidoc is_utf8_string_flags
1229 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1230 UTF-8 string, subject to the restrictions imposed by C<flags>;
1231 returns FALSE otherwise. If C<len> is 0, it will be calculated
1232 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1233 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
1234 that all characters being ASCII constitute 'a valid UTF-8 string'.
1236 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1237 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1238 as C<L</is_strict_utf8_string>>; and if C<flags> is
1239 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1240 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
1241 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1242 C<L</utf8n_to_uvchr>>, with the same meanings.
1245 C<L</is_utf8_invariant_string>>,
1246 C<L</is_utf8_invariant_string_loc>>,
1247 C<L</is_utf8_string>>,
1248 C<L</is_utf8_string_loc>>,
1249 C<L</is_utf8_string_loc_flags>>,
1250 C<L</is_utf8_string_loclen>>,
1251 C<L</is_utf8_string_loclen_flags>>,
1252 C<L</is_utf8_fixed_width_buf_flags>>,
1253 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1254 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1255 C<L</is_strict_utf8_string>>,
1256 C<L</is_strict_utf8_string_loc>>,
1257 C<L</is_strict_utf8_string_loclen>>,
1258 C<L</is_c9strict_utf8_string>>,
1259 C<L</is_c9strict_utf8_string_loc>>,
1261 C<L</is_c9strict_utf8_string_loclen>>.
1266 PERL_STATIC_INLINE bool
1267 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1269 const U8 * first_variant;
1271 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1272 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1273 |UTF8_DISALLOW_PERL_EXTENDED)));
1276 len = strlen((const char *)s);
1280 return is_utf8_string(s, len);
1283 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1284 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1286 return is_strict_utf8_string(s, len);
1289 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1290 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1292 return is_c9strict_utf8_string(s, len);
1295 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1296 const U8* const send = s + len;
1297 const U8* x = first_variant;
1300 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1301 if (UNLIKELY(! cur_len)) {
1313 =for apidoc is_utf8_string_loc
1315 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1316 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1317 "utf8ness success") in the C<ep> pointer.
1319 See also C<L</is_utf8_string_loclen>>.
1324 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
1328 =for apidoc is_utf8_string_loclen
1330 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1331 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1332 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1333 encoded characters in the C<el> pointer.
1335 See also C<L</is_utf8_string_loc>>.
1340 PERL_STATIC_INLINE bool
1341 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1343 const U8 * first_variant;
1345 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1348 len = strlen((const char *) s);
1351 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1363 const U8* const send = s + len;
1364 const U8* x = first_variant;
1365 STRLEN outlen = first_variant - s;
1368 const STRLEN cur_len = isUTF8_CHAR(x, send);
1369 if (UNLIKELY(! cur_len)) {
1387 /* The perl core arranges to never call the DFA below without there being at
1388 * least one byte available to look at. This allows the DFA to use a do {}
1389 * while loop which means that calling it with a UTF-8 invariant has a single
1390 * conditional, same as the calling code checking for invariance ahead of time.
1391 * And having the calling code remove that conditional speeds up by that
1392 * conditional, the case where it wasn't invariant. So there's no reason to
1393 * check before caling this.
1395 * But we don't know this for non-core calls, so have to retain the check for
1398 # define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s))
1400 # define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE
1404 * DFA for checking input is valid UTF-8 syntax.
1406 * This uses adaptations of the table and algorithm given in
1407 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1408 * documentation of the original version. A copyright notice for the original
1409 * version is given at the beginning of this file. The Perl adapations are
1410 * documented at the definition of PL_extended_utf8_dfa_tab[].
1412 * This dfa is fast. There are three exit conditions:
1413 * 1) a well-formed code point, acceptable to the table
1414 * 2) the beginning bytes of an incomplete character, whose completion might
1415 * or might not be acceptable
1416 * 3) unacceptable to the table. Some of the adaptations have certain,
1417 * hopefully less likely to occur, legal inputs be unacceptable to the
1418 * table, so these must be sorted out afterwards.
1420 * This macro is a complete implementation of the code executing the DFA. It
1421 * is passed the input sequence bounds and the table to use, and what to do
1422 * for each of the exit conditions. There are three canned actions, likely to
1423 * be the ones you want:
1424 * DFA_RETURN_SUCCESS_
1425 * DFA_RETURN_FAILURE_
1426 * DFA_GOTO_TEASE_APART_FF_
1428 * You pass a parameter giving the action to take for each of the three
1429 * possible exit conditions:
1431 * 'accept_action' This is executed when the DFA accepts the input.
1432 * DFA_RETURN_SUCCESS_ is the most likely candidate.
1433 * 'reject_action' This is executed when the DFA rejects the input.
1434 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1435 * you have written code to distinguish the rejecting state
1436 * results. Because it happens in several places, and
1437 * involves #ifdefs, the special action
1438 * DFA_GOTO_TEASE_APART_FF_ is what you want with
1439 * PL_extended_utf8_dfa_tab. On platforms without
1440 * EXTRA_LONG_UTF8, there is no need to tease anything apart,
1441 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1442 * need to have a label 'tease_apart_FF' that it will transfer
1444 * 'incomplete_char_action' This is executed when the DFA ran off the end
1445 * before accepting or rejecting the input.
1446 * DFA_RETURN_FAILURE_ is the likely action, but you could
1447 * have a 'goto', or NOOP. In the latter case the DFA drops
1448 * off the end, and you place your code to handle this case
1449 * immediately after it.
1452 #define DFA_RETURN_SUCCESS_ return s - s0
1453 #define DFA_RETURN_FAILURE_ return 0
1454 #ifdef HAS_EXTRA_LONG_UTF8
1455 # define DFA_TEASE_APART_FF_ goto tease_apart_FF
1457 # define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_
1460 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \
1463 incomplete_char_action) \
1465 const U8 * s = s0; \
1468 PERL_NON_CORE_CHECK_EMPTY(s,e); \
1471 state = dfa_tab[256 + state + dfa_tab[*s]]; \
1474 if (state == 0) { /* Accepting state */ \
1478 if (UNLIKELY(state == 1)) { /* Rejecting state */ \
1483 /* Here, dropped out of loop before end-of-char */ \
1484 incomplete_char_action; \
1490 =for apidoc isUTF8_CHAR
1492 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1493 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1494 that represents some code point; otherwise it evaluates to 0. If non-zero, the
1495 value gives how many bytes starting at C<s> comprise the code point's
1496 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1497 form the first code point in C<s>, are not examined.
1499 The code point can be any that will fit in an IV on this machine, using Perl's
1500 extension to official UTF-8 to represent those higher than the Unicode maximum
1501 of 0x10FFFF. That means that this macro is used to efficiently decide if the
1502 next few bytes in C<s> is legal UTF-8 for a single character.
1504 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1505 defined by Unicode to be fully interchangeable across applications;
1506 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1507 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1508 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1510 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1511 C<L</is_utf8_string_loclen>> to check entire strings.
1513 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1514 machines) is a valid UTF-8 character.
1518 This uses an adaptation of the table and algorithm given in
1519 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1520 documentation of the original version. A copyright notice for the original
1521 version is given at the beginning of this file. The Perl adapation is
1522 documented at the definition of PL_extended_utf8_dfa_tab[].
1525 PERL_STATIC_INLINE Size_t
1526 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1528 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1530 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1531 DFA_RETURN_SUCCESS_,
1532 DFA_TEASE_APART_FF_,
1533 DFA_RETURN_FAILURE_);
1535 /* Here, we didn't return success, but dropped out of the loop. In the
1536 * case of PL_extended_utf8_dfa_tab, this means the input is either
1537 * malformed, or the start byte was FF on a platform that the dfa doesn't
1538 * handle FF's. Call a helper function. */
1540 #ifdef HAS_EXTRA_LONG_UTF8
1544 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1545 * either malformed, or was for the largest possible start byte, which we
1546 * now check, not inline */
1547 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1551 return is_utf8_FF_helper_(s0, e,
1552 FALSE /* require full, not partial char */
1560 =for apidoc isSTRICT_UTF8_CHAR
1562 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1563 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1564 Unicode code point completely acceptable for open interchange between all
1565 applications; otherwise it evaluates to 0. If non-zero, the value gives how
1566 many bytes starting at C<s> comprise the code point's representation. Any
1567 bytes remaining before C<e>, but beyond the ones needed to form the first code
1568 point in C<s>, are not examined.
1570 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1571 be a surrogate nor a non-character code point. Thus this excludes any code
1572 point from Perl's extended UTF-8.
1574 This is used to efficiently decide if the next few bytes in C<s> is
1575 legal Unicode-acceptable UTF-8 for a single character.
1577 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1578 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1579 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1580 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1582 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1583 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1587 This uses an adaptation of the tables and algorithm given in
1588 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1589 documentation of the original version. A copyright notice for the original
1590 version is given at the beginning of this file. The Perl adapation is
1591 documented at the definition of strict_extended_utf8_dfa_tab[].
1595 PERL_STATIC_INLINE Size_t
1596 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1598 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1600 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1601 DFA_RETURN_SUCCESS_,
1603 DFA_RETURN_FAILURE_);
1606 /* Here, we didn't return success, but dropped out of the loop. In the
1607 * case of PL_strict_utf8_dfa_tab, this means the input is either
1608 * malformed, or was for certain Hanguls; handle them specially */
1610 /* The dfa above drops out for incomplete or illegal inputs, and certain
1611 * legal Hanguls; check and return accordingly */
1612 return is_HANGUL_ED_utf8_safe(s0, e);
1617 =for apidoc isC9_STRICT_UTF8_CHAR
1619 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1620 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1621 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1622 the value gives how many bytes starting at C<s> comprise the code point's
1623 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1624 form the first code point in C<s>, are not examined.
1626 The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1627 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1628 code points. This corresponds to
1629 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1630 which said that non-character code points are merely discouraged rather than
1631 completely forbidden in open interchange. See
1632 L<perlunicode/Noncharacter code points>.
1634 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1635 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1637 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1638 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1642 This uses an adaptation of the tables and algorithm given in
1643 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1644 documentation of the original version. A copyright notice for the original
1645 version is given at the beginning of this file. The Perl adapation is
1646 documented at the definition of PL_c9_utf8_dfa_tab[].
1650 PERL_STATIC_INLINE Size_t
1651 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1653 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1655 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1656 DFA_RETURN_SUCCESS_,
1657 DFA_RETURN_FAILURE_,
1658 DFA_RETURN_FAILURE_);
1663 =for apidoc is_strict_utf8_string_loc
1665 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1666 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1667 "utf8ness success") in the C<ep> pointer.
1669 See also C<L</is_strict_utf8_string_loclen>>.
1674 #define is_strict_utf8_string_loc(s, len, ep) \
1675 is_strict_utf8_string_loclen(s, len, ep, 0)
1679 =for apidoc is_strict_utf8_string_loclen
1681 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1682 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1683 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1684 encoded characters in the C<el> pointer.
1686 See also C<L</is_strict_utf8_string_loc>>.
1691 PERL_STATIC_INLINE bool
1692 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1694 const U8 * first_variant;
1696 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1699 len = strlen((const char *) s);
1702 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1714 const U8* const send = s + len;
1715 const U8* x = first_variant;
1716 STRLEN outlen = first_variant - s;
1719 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1720 if (UNLIKELY(! cur_len)) {
1740 =for apidoc is_c9strict_utf8_string_loc
1742 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1743 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1744 "utf8ness success") in the C<ep> pointer.
1746 See also C<L</is_c9strict_utf8_string_loclen>>.
1751 #define is_c9strict_utf8_string_loc(s, len, ep) \
1752 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1756 =for apidoc is_c9strict_utf8_string_loclen
1758 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1759 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1760 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1761 characters in the C<el> pointer.
1763 See also C<L</is_c9strict_utf8_string_loc>>.
1768 PERL_STATIC_INLINE bool
1769 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1771 const U8 * first_variant;
1773 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1776 len = strlen((const char *) s);
1779 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1791 const U8* const send = s + len;
1792 const U8* x = first_variant;
1793 STRLEN outlen = first_variant - s;
1796 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1797 if (UNLIKELY(! cur_len)) {
1817 =for apidoc is_utf8_string_loc_flags
1819 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1820 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1821 "utf8ness success") in the C<ep> pointer.
1823 See also C<L</is_utf8_string_loclen_flags>>.
1828 #define is_utf8_string_loc_flags(s, len, ep, flags) \
1829 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1832 /* The above 3 actual functions could have been moved into the more general one
1833 * just below, and made #defines that call it with the right 'flags'. They are
1834 * currently kept separate to increase their chances of getting inlined */
1838 =for apidoc is_utf8_string_loclen_flags
1840 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1841 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1842 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1843 encoded characters in the C<el> pointer.
1845 See also C<L</is_utf8_string_loc_flags>>.
1850 PERL_STATIC_INLINE bool
1851 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1853 const U8 * first_variant;
1855 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1856 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1857 |UTF8_DISALLOW_PERL_EXTENDED)));
1860 len = strlen((const char *) s);
1864 return is_utf8_string_loclen(s, len, ep, el);
1867 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1868 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1870 return is_strict_utf8_string_loclen(s, len, ep, el);
1873 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1874 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1876 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1879 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1891 const U8* send = s + len;
1892 const U8* x = first_variant;
1893 STRLEN outlen = first_variant - s;
1896 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1897 if (UNLIKELY(! cur_len)) {
1916 =for apidoc utf8_distance
1918 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1921 WARNING: use only if you *know* that the pointers point inside the
1927 PERL_STATIC_INLINE IV
1928 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1930 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1932 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1936 =for apidoc utf8_hop
1938 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1939 forward or backward.
1941 WARNING: do not use the following unless you *know* C<off> is within
1942 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1943 on the first byte of character or just after the last byte of a character.
1948 PERL_STATIC_INLINE U8 *
1949 Perl_utf8_hop(const U8 *s, SSize_t off)
1951 PERL_ARGS_ASSERT_UTF8_HOP;
1953 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1954 * the bitops (especially ~) can create illegal UTF-8.
1955 * In other words: in Perl UTF-8 is not just for Unicode. */
1964 while (UTF8_IS_CONTINUATION(*s))
1968 GCC_DIAG_IGNORE(-Wcast-qual)
1974 =for apidoc utf8_hop_forward
1976 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1979 C<off> must be non-negative.
1981 C<s> must be before or equal to C<end>.
1983 When moving forward it will not move beyond C<end>.
1985 Will not exceed this limit even if the string is not valid "UTF-8".
1990 PERL_STATIC_INLINE U8 *
1991 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1993 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1995 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1996 * the bitops (especially ~) can create illegal UTF-8.
1997 * In other words: in Perl UTF-8 is not just for Unicode. */
2003 STRLEN skip = UTF8SKIP(s);
2004 if ((STRLEN)(end - s) <= skip) {
2005 GCC_DIAG_IGNORE(-Wcast-qual)
2012 GCC_DIAG_IGNORE(-Wcast-qual)
2018 =for apidoc utf8_hop_back
2020 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2023 C<off> must be non-positive.
2025 C<s> must be after or equal to C<start>.
2027 When moving backward it will not move before C<start>.
2029 Will not exceed this limit even if the string is not valid "UTF-8".
2034 PERL_STATIC_INLINE U8 *
2035 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2037 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2039 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2040 * the bitops (especially ~) can create illegal UTF-8.
2041 * In other words: in Perl UTF-8 is not just for Unicode. */
2046 while (off++ && s > start) {
2049 } while (UTF8_IS_CONTINUATION(*s) && s > start);
2052 GCC_DIAG_IGNORE(-Wcast-qual)
2058 =for apidoc utf8_hop_safe
2060 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2061 either forward or backward.
2063 When moving backward it will not move before C<start>.
2065 When moving forward it will not move beyond C<end>.
2067 Will not exceed those limits even if the string is not valid "UTF-8".
2072 PERL_STATIC_INLINE U8 *
2073 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2075 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2077 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2078 * the bitops (especially ~) can create illegal UTF-8.
2079 * In other words: in Perl UTF-8 is not just for Unicode. */
2081 assert(start <= s && s <= end);
2084 return utf8_hop_forward(s, off, end);
2087 return utf8_hop_back(s, off, start);
2093 =for apidoc isUTF8_CHAR_flags
2095 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2096 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2097 that represents some code point, subject to the restrictions given by C<flags>;
2098 otherwise it evaluates to 0. If non-zero, the value gives how many bytes
2099 starting at C<s> comprise the code point's representation. Any bytes remaining
2100 before C<e>, but beyond the ones needed to form the first code point in C<s>,
2103 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2104 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2105 as C<L</isSTRICT_UTF8_CHAR>>;
2106 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2107 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2108 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2109 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2111 The three alternative macros are for the most commonly needed validations; they
2112 are likely to run somewhat faster than this more general one, as they can be
2113 inlined into your code.
2115 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2116 L</is_utf8_string_loclen_flags> to check entire strings.
2121 PERL_STATIC_INLINE STRLEN
2122 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2124 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2125 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2126 |UTF8_DISALLOW_PERL_EXTENDED)));
2128 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2130 DFA_TEASE_APART_FF_,
2131 DFA_RETURN_FAILURE_);
2135 return is_utf8_char_helper_(s0, e, flags);
2137 #ifdef HAS_EXTRA_LONG_UTF8
2141 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2142 * either malformed, or was for the largest possible start byte, which
2143 * indicates perl extended UTF-8, well above the Unicode maximum */
2144 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2145 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2150 /* Otherwise examine the sequence not inline */
2151 return is_utf8_FF_helper_(s0, e,
2152 FALSE /* require full, not partial char */
2160 =for apidoc is_utf8_valid_partial_char
2162 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2163 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2164 points. Otherwise, it returns 1 if there exists at least one non-empty
2165 sequence of bytes that when appended to sequence C<s>, starting at position
2166 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2167 otherwise returns 0.
2169 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2172 This is useful when a fixed-length buffer is being tested for being well-formed
2173 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2174 it is split somewhere in the middle of the final code point's UTF-8
2175 representation. (Presumably when the buffer is refreshed with the next chunk
2176 of data, the new first bytes will complete the partial code point.) This
2177 function is used to verify that the final bytes in the current buffer are in
2178 fact the legal beginning of some code point, so that if they aren't, the
2179 failure can be signalled without having to wait for the next read.
2183 #define is_utf8_valid_partial_char(s, e) \
2184 is_utf8_valid_partial_char_flags(s, e, 0)
2188 =for apidoc is_utf8_valid_partial_char_flags
2190 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2191 or not the input is a valid UTF-8 encoded partial character, but it takes an
2192 extra parameter, C<flags>, which can further restrict which code points are
2195 If C<flags> is 0, this behaves identically to
2196 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
2197 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
2198 there is any sequence of bytes that can complete the input partial character in
2199 such a way that a non-prohibited character is formed, the function returns
2200 TRUE; otherwise FALSE. Non character code points cannot be determined based on
2201 partial character input. But many of the other possible excluded types can be
2202 determined from just the first one or two bytes.
2207 PERL_STATIC_INLINE bool
2208 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2210 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2211 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2212 |UTF8_DISALLOW_PERL_EXTENDED)));
2214 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2215 DFA_RETURN_FAILURE_,
2216 DFA_TEASE_APART_FF_,
2219 /* The NOOP above causes the DFA to drop down here iff the input was a
2220 * partial character. flags=0 => can return TRUE immediately; otherwise we
2221 * need to check (not inline) if the partial character is the beginning of
2222 * a disallowed one */
2227 return cBOOL(is_utf8_char_helper_(s0, e, flags));
2229 #ifdef HAS_EXTRA_LONG_UTF8
2233 /* Getting here means the input is either malformed, or, in the case of
2234 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
2235 * latter case has to be extended UTF-8, so can fail immediately if that is
2238 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2239 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2244 return is_utf8_FF_helper_(s0, e,
2245 TRUE /* Require to be a partial character */
2253 =for apidoc is_utf8_fixed_width_buf_flags
2255 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2256 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2257 otherwise it returns FALSE.
2259 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2260 without restriction. If the final few bytes of the buffer do not form a
2261 complete code point, this will return TRUE anyway, provided that
2262 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2264 If C<flags> in non-zero, it can be any combination of the
2265 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2268 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2269 returns FALSE if the final few bytes of the string don't form a complete code
2274 #define is_utf8_fixed_width_buf_flags(s, len, flags) \
2275 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2279 =for apidoc is_utf8_fixed_width_buf_loc_flags
2281 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2282 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
2283 to the beginning of any partial character at the end of the buffer; if there is
2284 no partial character C<*ep> will contain C<s>+C<len>.
2286 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2291 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
2292 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2296 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2298 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2299 complete, valid characters found in the C<el> pointer.
2304 PERL_STATIC_INLINE bool
2305 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2311 const U8 * maybe_partial;
2313 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2316 ep = &maybe_partial;
2319 /* If it's entirely valid, return that; otherwise see if the only error is
2320 * that the final few bytes are for a partial character */
2321 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
2322 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2325 PERL_STATIC_INLINE UV
2326 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2333 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
2334 * simple cases, and, if necessary calls a helper function to deal with the
2335 * more complex ones. Almost all well-formed non-problematic code points
2336 * are considered simple, so that it's unlikely that the helper function
2337 * will need to be called.
2339 * This is an adaptation of the tables and algorithm given in
2340 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2341 * comprehensive documentation of the original version. A copyright notice
2342 * for the original version is given at the beginning of this file. The
2343 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
2346 const U8 * const s0 = s;
2347 const U8 * send = s0 + curlen;
2351 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2353 /* This dfa is fast. If it accepts the input, it was for a well-formed,
2354 * non-problematic code point, which can be returned immediately.
2355 * Otherwise we call a helper function to figure out the more complicated
2358 /* No calls from core pass in an empty string; non-core need a check */
2362 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2363 flags, errors, msgs);
2366 type = PL_strict_utf8_dfa_tab[*s];
2368 /* The table is structured so that 'type' is 0 iff the input byte is
2369 * represented identically regardless of the UTF-8ness of the string */
2370 if (type == 0) { /* UTF-8 invariants are returned unchanged */
2374 UV state = PL_strict_utf8_dfa_tab[256 + type];
2375 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
2377 while (++s < send) {
2378 type = PL_strict_utf8_dfa_tab[*s];
2379 state = PL_strict_utf8_dfa_tab[256 + state + type];
2381 uv = UTF8_ACCUMULATE(uv, *s);
2385 uv = UNI_TO_NATIVE(uv);
2390 if (UNLIKELY(state == 1)) {
2395 /* Here is potentially problematic. Use the full mechanism */
2396 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2402 *retlen = s - s0 + 1;
2414 PERL_STATIC_INLINE UV
2415 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2417 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2421 if (! ckWARN_d(WARN_UTF8)) {
2423 /* EMPTY is not really allowed, and asserts on debugging builds. But
2424 * on non-debugging we have to deal with it, and this causes it to
2425 * return the REPLACEMENT CHARACTER, as the documentation indicates */
2426 return utf8n_to_uvchr(s, send - s, retlen,
2427 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2430 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2431 if (retlen && ret == 0 && (send <= s || *s != '\0')) {
2432 *retlen = (STRLEN) -1;
2439 /* ------------------------------- perl.h ----------------------------- */
2442 =for apidoc_section $utility
2444 =for apidoc is_safe_syscall
2446 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2448 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2449 category, and return FALSE.
2451 Return TRUE if the name is safe.
2453 C<what> and C<op_name> are used in any warning.
2455 Used by the C<IS_SAFE_SYSCALL()> macro.
2460 PERL_STATIC_INLINE bool
2461 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2463 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2464 * perl itself uses xce*() functions which accept 8-bit strings.
2467 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2471 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2472 SETERRNO(ENOENT, LIB_INVARG);
2473 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2474 "Invalid \\0 character in %s for %s: %s\\0%s",
2475 what, op_name, pv, null_at+1);
2485 Return true if the supplied filename has a newline character
2486 immediately before the first (hopefully only) NUL.
2488 My original look at this incorrectly used the len from SvPV(), but
2489 that's incorrect, since we allow for a NUL in pv[len-1].
2491 So instead, strlen() and work from there.
2493 This allow for the user reading a filename, forgetting to chomp it,
2496 open my $foo, "$file\0";
2502 PERL_STATIC_INLINE bool
2503 S_should_warn_nl(const char *pv)
2507 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2511 return len > 0 && pv[len-1] == '\n';
2516 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2518 PERL_STATIC_INLINE bool
2519 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2521 /* This function determines if the input NV 'nv' may be converted without
2522 * loss of data to an IV. If not, it returns FALSE taking no other action.
2523 * But if it is possible, it does the conversion, returning TRUE, and
2524 * storing the converted result in '*ivp' */
2526 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2528 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2529 /* Normally any comparison with a NaN returns false; if we can't rely
2530 * on that behaviour, check explicitly */
2531 if (UNLIKELY(Perl_isnan(nv))) {
2536 /* Written this way so that with an always-false NaN comparison we
2538 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2542 if ((IV) nv != nv) {
2552 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2554 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2556 #define MAX_CHARSET_NAME_LENGTH 2
2558 PERL_STATIC_INLINE const char *
2559 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2561 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2563 /* Returns a string that corresponds to the name of the regex character set
2564 * given by 'flags', and *lenp is set the length of that string, which
2565 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2568 switch (get_regex_charset(flags)) {
2569 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2570 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2571 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2572 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2573 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2575 return ASCII_MORE_RESTRICT_PAT_MODS;
2577 /* The NOT_REACHED; hides an assert() which has a rather complex
2578 * definition in perl.h. */
2579 NOT_REACHED; /* NOTREACHED */
2580 return "?"; /* Unknown */
2587 Return false if any get magic is on the SV other than taint magic.
2591 PERL_STATIC_INLINE bool
2592 Perl_sv_only_taint_gmagic(SV *sv)
2594 MAGIC *mg = SvMAGIC(sv);
2596 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2599 if (mg->mg_type != PERL_MAGIC_taint
2600 && !(mg->mg_flags & MGf_GSKIP)
2601 && mg->mg_virtual->svt_get) {
2604 mg = mg->mg_moremagic;
2610 /* ------------------ cop.h ------------------------------------------- */
2612 /* implement GIMME_V() macro */
2614 PERL_STATIC_INLINE U8
2618 U8 gimme = (PL_op->op_flags & OPf_WANT);
2622 cxix = PL_curstackinfo->si_cxsubix;
2624 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2625 assert(cxstack[cxix].blk_gimme & G_WANT);
2626 return (cxstack[cxix].blk_gimme & G_WANT);
2630 /* Enter a block. Push a new base context and return its address. */
2632 PERL_STATIC_INLINE PERL_CONTEXT *
2633 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2637 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2642 cx->blk_gimme = gimme;
2643 cx->blk_oldsaveix = saveix;
2644 cx->blk_oldsp = (I32)(sp - PL_stack_base);
2645 cx->blk_oldcop = PL_curcop;
2646 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
2647 cx->blk_oldscopesp = PL_scopestack_ix;
2648 cx->blk_oldpm = PL_curpm;
2649 cx->blk_old_tmpsfloor = PL_tmps_floor;
2651 PL_tmps_floor = PL_tmps_ix;
2652 CX_DEBUG(cx, "PUSH");
2657 /* Exit a block (RETURN and LAST). */
2659 PERL_STATIC_INLINE void
2660 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2662 PERL_ARGS_ASSERT_CX_POPBLOCK;
2664 CX_DEBUG(cx, "POP");
2665 /* these 3 are common to cx_popblock and cx_topblock */
2666 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2667 PL_scopestack_ix = cx->blk_oldscopesp;
2668 PL_curpm = cx->blk_oldpm;
2670 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2671 * and leaves a CX entry lying around for repeated use, so
2672 * skip for multicall */ \
2673 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2674 || PL_savestack_ix == cx->blk_oldsaveix);
2675 PL_curcop = cx->blk_oldcop;
2676 PL_tmps_floor = cx->blk_old_tmpsfloor;
2679 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2680 * Whereas cx_popblock() restores the state to the point just before
2681 * cx_pushblock() was called, cx_topblock() restores it to the point just
2682 * *after* cx_pushblock() was called. */
2684 PERL_STATIC_INLINE void
2685 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2687 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2689 CX_DEBUG(cx, "TOP");
2690 /* these 3 are common to cx_popblock and cx_topblock */
2691 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2692 PL_scopestack_ix = cx->blk_oldscopesp;
2693 PL_curpm = cx->blk_oldpm;
2695 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2699 PERL_STATIC_INLINE void
2700 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2702 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2704 PERL_ARGS_ASSERT_CX_PUSHSUB;
2706 PERL_DTRACE_PROBE_ENTRY(cv);
2707 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2708 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2709 cx->blk_sub.cv = cv;
2710 cx->blk_sub.olddepth = CvDEPTH(cv);
2711 cx->blk_sub.prevcomppad = PL_comppad;
2712 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2713 cx->blk_sub.retop = retop;
2714 SvREFCNT_inc_simple_void_NN(cv);
2715 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2719 /* subsets of cx_popsub() */
2721 PERL_STATIC_INLINE void
2722 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2726 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2727 assert(CxTYPE(cx) == CXt_SUB);
2729 PL_comppad = cx->blk_sub.prevcomppad;
2730 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2731 cv = cx->blk_sub.cv;
2732 CvDEPTH(cv) = cx->blk_sub.olddepth;
2733 cx->blk_sub.cv = NULL;
2735 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2739 /* handle the @_ part of leaving a sub */
2741 PERL_STATIC_INLINE void
2742 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2746 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2747 assert(CxTYPE(cx) == CXt_SUB);
2748 assert(AvARRAY(MUTABLE_AV(
2749 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2750 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2752 CX_POP_SAVEARRAY(cx);
2753 av = MUTABLE_AV(PAD_SVl(0));
2754 if (UNLIKELY(AvREAL(av)))
2755 /* abandon @_ if it got reified */
2756 clear_defarray(av, 0);
2763 PERL_STATIC_INLINE void
2764 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2766 PERL_ARGS_ASSERT_CX_POPSUB;
2767 assert(CxTYPE(cx) == CXt_SUB);
2769 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2773 cx_popsub_common(cx);
2777 PERL_STATIC_INLINE void
2778 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2780 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2782 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2783 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2784 cx->blk_format.cv = cv;
2785 cx->blk_format.retop = retop;
2786 cx->blk_format.gv = gv;
2787 cx->blk_format.dfoutgv = PL_defoutgv;
2788 cx->blk_format.prevcomppad = PL_comppad;
2791 SvREFCNT_inc_simple_void_NN(cv);
2793 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2797 PERL_STATIC_INLINE void
2798 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2803 PERL_ARGS_ASSERT_CX_POPFORMAT;
2804 assert(CxTYPE(cx) == CXt_FORMAT);
2806 dfout = cx->blk_format.dfoutgv;
2808 cx->blk_format.dfoutgv = NULL;
2809 SvREFCNT_dec_NN(dfout);
2811 PL_comppad = cx->blk_format.prevcomppad;
2812 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2813 cv = cx->blk_format.cv;
2814 cx->blk_format.cv = NULL;
2816 SvREFCNT_dec_NN(cv);
2817 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2821 PERL_STATIC_INLINE void
2822 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2824 cx->blk_eval.retop = retop;
2825 cx->blk_eval.old_namesv = namesv;
2826 cx->blk_eval.old_eval_root = PL_eval_root;
2827 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2828 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2829 cx->blk_eval.cur_top_env = PL_top_env;
2831 assert(!(PL_in_eval & ~ 0x3F));
2832 assert(!(PL_op->op_type & ~0x1FF));
2833 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2836 PERL_STATIC_INLINE void
2837 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2839 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2841 Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2843 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2844 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2847 PERL_STATIC_INLINE void
2848 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2850 PERL_ARGS_ASSERT_CX_PUSHTRY;
2852 Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2854 /* Don't actually change it, just store the current value so it's restored
2855 * by the common popeval */
2856 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2860 PERL_STATIC_INLINE void
2861 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2865 PERL_ARGS_ASSERT_CX_POPEVAL;
2866 assert(CxTYPE(cx) == CXt_EVAL);
2868 PL_in_eval = CxOLD_IN_EVAL(cx);
2869 assert(!(PL_in_eval & 0xc0));
2870 PL_eval_root = cx->blk_eval.old_eval_root;
2871 sv = cx->blk_eval.cur_text;
2872 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2873 cx->blk_eval.cur_text = NULL;
2874 SvREFCNT_dec_NN(sv);
2877 sv = cx->blk_eval.old_namesv;
2879 cx->blk_eval.old_namesv = NULL;
2880 SvREFCNT_dec_NN(sv);
2882 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2886 /* push a plain loop, i.e.
2888 * while (cond) { block }
2889 * for (init;cond;continue) { block }
2890 * This loop can be last/redo'ed etc.
2893 PERL_STATIC_INLINE void
2894 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2896 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2897 cx->blk_loop.my_op = cLOOP;
2901 /* push a true for loop, i.e.
2902 * for var (list) { block }
2905 PERL_STATIC_INLINE void
2906 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2908 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2910 /* this one line is common with cx_pushloop_plain */
2911 cx->blk_loop.my_op = cLOOP;
2913 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2914 cx->blk_loop.itersave = itersave;
2916 cx->blk_loop.oldcomppad = PL_comppad;
2921 /* pop all loop types, including plain */
2923 PERL_STATIC_INLINE void
2924 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2926 PERL_ARGS_ASSERT_CX_POPLOOP;
2928 assert(CxTYPE_is_LOOP(cx));
2929 if ( CxTYPE(cx) == CXt_LOOP_ARY
2930 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2932 /* Free ary or cur. This assumes that state_u.ary.ary
2933 * aligns with state_u.lazysv.cur. See cx_dup() */
2934 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2935 cx->blk_loop.state_u.lazysv.cur = NULL;
2936 SvREFCNT_dec_NN(sv);
2937 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2938 sv = cx->blk_loop.state_u.lazysv.end;
2939 cx->blk_loop.state_u.lazysv.end = NULL;
2940 SvREFCNT_dec_NN(sv);
2943 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2945 SV **svp = (cx)->blk_loop.itervar_u.svp;
2946 if ((cx->cx_type & CXp_FOR_GV))
2947 svp = &GvSV((GV*)svp);
2949 *svp = cx->blk_loop.itersave;
2950 cx->blk_loop.itersave = NULL;
2951 SvREFCNT_dec(cursv);
2956 PERL_STATIC_INLINE void
2957 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2959 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2961 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2965 PERL_STATIC_INLINE void
2966 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2968 PERL_ARGS_ASSERT_CX_POPWHEN;
2969 assert(CxTYPE(cx) == CXt_WHEN);
2971 PERL_UNUSED_ARG(cx);
2972 PERL_UNUSED_CONTEXT;
2973 /* currently NOOP */
2977 PERL_STATIC_INLINE void
2978 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2980 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2982 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2983 cx->blk_givwhen.defsv_save = orig_defsv;
2987 PERL_STATIC_INLINE void
2988 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2992 PERL_ARGS_ASSERT_CX_POPGIVEN;
2993 assert(CxTYPE(cx) == CXt_GIVEN);
2995 sv = GvSV(PL_defgv);
2996 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2997 cx->blk_givwhen.defsv_save = NULL;
3001 /* ------------------ util.h ------------------------------------------- */
3004 =for apidoc_section $string
3008 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3010 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
3011 match themselves and their opposite case counterparts. Non-cased and non-ASCII
3012 range bytes match only themselves.
3017 PERL_STATIC_INLINE I32
3018 Perl_foldEQ(const char *s1, const char *s2, I32 len)
3020 const U8 *a = (const U8 *)s1;
3021 const U8 *b = (const U8 *)s2;
3023 PERL_ARGS_ASSERT_FOLDEQ;
3028 if (*a != *b && *a != PL_fold[*b])
3035 PERL_STATIC_INLINE I32
3036 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
3038 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
3039 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3040 * does not check for this. Nor does it check that the strings each have
3041 * at least 'len' characters. */
3043 const U8 *a = (const U8 *)s1;
3044 const U8 *b = (const U8 *)s2;
3046 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3051 if (*a != *b && *a != PL_fold_latin1[*b]) {
3060 =for apidoc_section $locale
3061 =for apidoc foldEQ_locale
3063 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3064 same case-insensitively in the current locale; false otherwise.
3069 PERL_STATIC_INLINE I32
3070 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
3072 const U8 *a = (const U8 *)s1;
3073 const U8 *b = (const U8 *)s2;
3075 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3080 if (*a != *b && *a != PL_fold_locale[*b])
3088 =for apidoc_section $string
3089 =for apidoc my_strnlen
3091 The C library C<strnlen> if available, or a Perl implementation of it.
3093 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3094 characters. It will never attempt to address more than C<maxlen>
3095 characters, making it suitable for use with strings that are not
3096 guaranteed to be NUL-terminated.
3100 Description stolen from http://man.openbsd.org/strnlen.3,
3101 implementation stolen from PostgreSQL.
3105 PERL_STATIC_INLINE Size_t
3106 Perl_my_strnlen(const char *str, Size_t maxlen)
3108 const char *end = (char *) memchr(str, '\0', maxlen);
3110 PERL_ARGS_ASSERT_MY_STRNLEN;
3112 if (end == NULL) return maxlen;
3118 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3120 PERL_STATIC_INLINE void *
3121 S_my_memrchr(const char * s, const char c, const STRLEN len)
3123 /* memrchr(), since many platforms lack it */
3125 const char * t = s + len - 1;
3127 PERL_ARGS_ASSERT_MY_MEMRCHR;
3141 PERL_STATIC_INLINE char *
3142 Perl_mortal_getenv(const char * str)
3144 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3146 * It's (mostly) thread-safe because it uses a mutex to prevent other
3147 * threads (that look at this mutex) from destroying the result before this
3148 * routine has a chance to copy the result to a place that won't be
3149 * destroyed before the caller gets a chance to handle it. That place is a
3150 * mortal SV. khw chose this over SAVEFREEPV because he is under the
3151 * impression that the SV will hang around longer under more circumstances
3153 * The reason it isn't completely thread-safe is that other code could
3154 * simply not pay attention to the mutex. All of the Perl core uses the
3155 * mutex, but it is possible for code from, say XS, to not use this mutex,
3156 * defeating the safety.
3158 * getenv() returns, in some implementations, a pointer to a spot in the
3159 * **environ array, which could be invalidated at any time by this or
3160 * another thread changing the environment. Other implementations copy the
3161 * **environ value to a static buffer, returning a pointer to that. That
3162 * buffer might or might not be invalidated by a getenv() call in another
3163 * thread. If it does get zapped, we need an exclusive lock. Otherwise,
3164 * many getenv() calls can safely be running simultaneously, so a
3165 * many-reader (but no simultaneous writers) lock is ok. There is a
3166 * Configure probe to see if another thread destroys the buffer, and the
3167 * mutex is defined accordingly.
3169 * But in all cases, using the mutex prevents these problems, as long as
3170 * all code uses the same mutex.
3172 * A complication is that this can be called during phases where the
3173 * mortalization process isn't available. These are in interpreter
3174 * destruction or early in construction. khw believes that at these times
3175 * there shouldn't be anything else going on, so plain getenv is safe AS
3176 * LONG AS the caller acts on the return before calling it again. */
3181 PERL_ARGS_ASSERT_MORTAL_GETENV;
3183 /* Can't mortalize without stacks. khw believes that no other threads
3184 * should be running, so no need to lock things, and this may be during a
3185 * phase when locking isn't even available */
3186 if (UNLIKELY(PL_scopestack_ix == 0)) {
3192 /* A major complication arises under PERL_MEM_LOG. When that is active,
3193 * every memory allocation may result in logging, depending on the value of
3194 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
3195 * saving ENV{foo}'s value (but before saving it), the logging code will
3196 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
3197 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3198 * lock a boolean mutex recursively); 3) destroying the getenv() static
3199 * buffer; or 4) destroying the temporary created by this for the copy
3200 * causes a log entry to be made which could cause a new temporary to be
3201 * created, which will need to be destroyed at some point, leading to an
3204 * The solution adopted here (after some gnashing of teeth) is to detect
3205 * the recursive calls and calls from the logger, and treat them specially.
3206 * Let's say we want to do getenv("foo"). We first find
3207 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3208 * variable, so no temporary is required. Then we do getenv(foo}, and in
3209 * the process of creating a temporary to save it, this function will be
3210 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
3211 * we detect that it is such a call and return our saved value instead of
3212 * locking and doing a new getenv(). This solves all of problems 1), 2),
3213 * and 3). Because all the getenv()s are done while the mutex is locked,
3214 * the state cannot have changed. To solve 4), we don't create a temporary
3215 * when this is called from the logging code. That code disposes of the
3216 * return value while the mutex is still locked.
3218 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3219 * digits and 3 particular letters are significant; the rest are ignored by
3220 * the memory logging code. Thus the per-interpreter variable only needs
3221 * to be large enough to save the significant information, the size of
3222 * which is known at compile time. The first byte is extra, reserved for
3223 * flags for our use. To protect against overflowing, only the reserved
3224 * byte, as many digits as don't overflow, and the three letters are
3227 * The reserved byte has two bits:
3228 * 0x1 if set indicates that if we get here, it is a recursive call of
3230 * 0x2 if set indicates that the call is from the logging code.
3232 * If the flag indicates this is a recursive call, just return the stored
3233 * value of PL_mem_log; An empty value gets turned into NULL. */
3234 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3235 if (PL_mem_log[1] == '\0') {
3238 return PL_mem_log + 1;
3248 /* Here we are in a critical section. As explained above, we do our own
3249 * getenv(PERL_MEM_LOG), saving the result safely. */
3250 ret = getenv("PERL_MEM_LOG");
3251 if (ret == NULL) { /* No logging active */
3253 /* Return that immediately if called from the logging code */
3254 if (PL_mem_log[0] & 0x2) {
3259 PL_mem_log[1] = '\0';
3262 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
3264 /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3265 * extremely long string. But we want only a few characters from it.
3266 * PL_mem_log has been made large enough to hold just the ones we need.
3267 * First the file descriptor. */
3268 if (isDIGIT(*ret)) {
3269 const char * s = ret;
3270 if (UNLIKELY(*s == '0')) {
3272 /* Reduce multiple leading zeros to a single one. This is to
3273 * allow the caller to change what to do with leading zeros. */
3274 *mem_log_meat++ = '0';
3281 /* If the input overflows, copy just enough for the result to also
3282 * overflow, plus 1 to make sure */
3283 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3284 *mem_log_meat++ = *s++;
3288 /* Then each of the three significant characters */
3289 if (strchr(ret, 'm')) {
3290 *mem_log_meat++ = 'm';
3292 if (strchr(ret, 's')) {
3293 *mem_log_meat++ = 's';
3295 if (strchr(ret, 't')) {
3296 *mem_log_meat++ = 't';
3298 *mem_log_meat = '\0';
3300 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3303 /* If we are being called from the logger, it only needs the significant
3304 * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3305 if (PL_mem_log[0] & 0x2) {
3306 assert(strEQ(str, "PERL_MEM_LOG"));
3308 return PL_mem_log + 1;
3311 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
3312 * is coming from other than the logging code, so it should be treated the
3313 * same as any other getenv(), returning the full value, not just the
3314 * significant part, and having its value saved. Set the flag that
3315 * indicates any call to this routine will be a recursion from here */
3316 PL_mem_log[0] = 0x1;
3320 /* Now get the value of the real desired variable, and save a copy */
3324 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3331 /* Clear the buffer */
3332 Zero(PL_mem_log, sizeof(PL_mem_log), char);
3339 PERL_STATIC_INLINE bool
3340 Perl_sv_isbool(pTHX_ const SV *sv)
3342 /* change to the following in 5.37, logically the same but
3343 * more efficient and more future proof */
3345 return (SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv));
3347 return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) &&
3348 (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No);
3355 PERL_STATIC_INLINE AV *
3356 Perl_cop_file_avn(pTHX_ const COP *cop) {
3358 PERL_ARGS_ASSERT_COP_FILE_AVN;
3360 const char *file = CopFILE(cop);
3362 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3376 * ex: set ts=8 sts=4 sw=4 et: