3 * Copyright (C) 2012 by Larry Wall and others
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
8 * This file contains tables and code adapted from
9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
37 * Each section names the header file that the functions "belong" to.
40 /* ------------------------------- av.h ------------------------------- */
42 PERL_STATIC_INLINE SSize_t
43 Perl_av_top_index(pTHX_ AV *av)
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
51 /* ------------------------------- cv.h ------------------------------- */
53 PERL_STATIC_INLINE GV *
54 Perl_CvGV(pTHX_ CV *sv)
56 PERL_ARGS_ASSERT_CVGV;
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
63 PERL_STATIC_INLINE I32 *
64 Perl_CvDEPTH(const CV * const sv)
66 PERL_ARGS_ASSERT_CVDEPTH;
67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
69 return &((XPVCV*)SvANY(sv))->xcv_depth;
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
84 PERL_STATIC_INLINE char *
85 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
97 *len = tmps - SvPVX(tmpsv);
102 /* ------------------------------- mg.h ------------------------------- */
104 #if defined(PERL_CORE) || defined(PERL_EXT)
105 /* assumes get-magic and stringification have already occurred */
106 PERL_STATIC_INLINE STRLEN
107 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
122 /* ------------------------------- pad.h ------------------------------ */
124 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125 PERL_STATIC_INLINE bool
126 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
159 /* ------------------------------- pp.h ------------------------------- */
161 PERL_STATIC_INLINE I32
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
165 "MARK top %p %" IVdf "\n",
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
171 PERL_STATIC_INLINE I32
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
175 "MARK pop %p %" IVdf "\n",
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
182 /* ----------------------------- regexp.h ----------------------------- */
184 PERL_STATIC_INLINE struct regexp *
185 Perl_ReANY(const REGEXP * const re)
187 XPV* const p = (XPV*)SvANY(re);
189 PERL_ARGS_ASSERT_REANY;
190 assert(isREGEXP(re));
192 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
193 : (struct regexp *)p;
196 /* ------------------------------- sv.h ------------------------------- */
198 PERL_STATIC_INLINE SV *
199 Perl_SvREFCNT_inc(SV *sv)
201 if (LIKELY(sv != NULL))
205 PERL_STATIC_INLINE SV *
206 Perl_SvREFCNT_inc_NN(SV *sv)
208 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
213 PERL_STATIC_INLINE void
214 Perl_SvREFCNT_inc_void(SV *sv)
216 if (LIKELY(sv != NULL))
219 PERL_STATIC_INLINE void
220 Perl_SvREFCNT_dec(pTHX_ SV *sv)
222 if (LIKELY(sv != NULL)) {
223 U32 rc = SvREFCNT(sv);
225 SvREFCNT(sv) = rc - 1;
227 Perl_sv_free2(aTHX_ sv, rc);
231 PERL_STATIC_INLINE void
232 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
234 U32 rc = SvREFCNT(sv);
236 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
239 SvREFCNT(sv) = rc - 1;
241 Perl_sv_free2(aTHX_ sv, rc);
244 PERL_STATIC_INLINE void
245 Perl_SvAMAGIC_on(SV *sv)
247 PERL_ARGS_ASSERT_SVAMAGIC_ON;
250 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
252 PERL_STATIC_INLINE void
253 Perl_SvAMAGIC_off(SV *sv)
255 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
257 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
258 HvAMAGIC_off(SvSTASH(SvRV(sv)));
261 PERL_STATIC_INLINE U32
262 Perl_SvPADSTALE_on(SV *sv)
264 assert(!(SvFLAGS(sv) & SVs_PADTMP));
265 return SvFLAGS(sv) |= SVs_PADSTALE;
267 PERL_STATIC_INLINE U32
268 Perl_SvPADSTALE_off(SV *sv)
270 assert(!(SvFLAGS(sv) & SVs_PADTMP));
271 return SvFLAGS(sv) &= ~SVs_PADSTALE;
273 #if defined(PERL_CORE) || defined (PERL_EXT)
274 PERL_STATIC_INLINE STRLEN
275 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
277 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
279 U8 *hopped = utf8_hop((U8 *)pv, pos);
280 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
281 return (STRLEN)(hopped - (U8 *)pv);
283 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
287 /* ------------------------------- handy.h ------------------------------- */
289 /* saves machine code for a common noreturn idiom typically used in Newx*() */
290 GCC_DIAG_IGNORE_DECL(-Wunused-function);
292 Perl_croak_memory_wrap(void)
294 Perl_croak_nocontext("%s",PL_memory_wrap);
296 GCC_DIAG_RESTORE_DECL;
298 /* ------------------------------- utf8.h ------------------------------- */
301 =head1 Unicode Support
304 PERL_STATIC_INLINE void
305 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
307 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
308 * encoded string at '*dest', updating '*dest' to include it */
310 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
312 if (NATIVE_BYTE_IS_INVARIANT(byte))
315 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
316 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
321 =for apidoc valid_utf8_to_uvchr
322 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
323 known that the next character in the input UTF-8 string C<s> is well-formed
324 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
325 points, and non-Unicode code points are allowed.
331 PERL_STATIC_INLINE UV
332 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
334 const UV expectlen = UTF8SKIP(s);
335 const U8* send = s + expectlen;
338 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
344 /* An invariant is trivially returned */
345 if (expectlen == 1) {
349 /* Remove the leading bits that indicate the number of bytes, leaving just
350 * the bits that are part of the value */
351 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
353 /* Now, loop through the remaining bytes, accumulating each into the
354 * working total as we go. (I khw tried unrolling the loop for up to 4
355 * bytes, but there was no performance improvement) */
356 for (++s; s < send; s++) {
357 uv = UTF8_ACCUMULATE(uv, *s);
360 return UNI_TO_NATIVE(uv);
365 =for apidoc is_utf8_invariant_string
367 Returns TRUE if the first C<len> bytes of the string C<s> are the same
368 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
369 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
370 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
371 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
372 characters are invariant, but so also are the C1 controls.
374 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
375 use this option, that C<s> can't have embedded C<NUL> characters and has to
376 have a terminating C<NUL> byte).
379 C<L</is_utf8_string>>,
380 C<L</is_utf8_string_flags>>,
381 C<L</is_utf8_string_loc>>,
382 C<L</is_utf8_string_loc_flags>>,
383 C<L</is_utf8_string_loclen>>,
384 C<L</is_utf8_string_loclen_flags>>,
385 C<L</is_utf8_fixed_width_buf_flags>>,
386 C<L</is_utf8_fixed_width_buf_loc_flags>>,
387 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
388 C<L</is_strict_utf8_string>>,
389 C<L</is_strict_utf8_string_loc>>,
390 C<L</is_strict_utf8_string_loclen>>,
391 C<L</is_c9strict_utf8_string>>,
392 C<L</is_c9strict_utf8_string_loc>>,
394 C<L</is_c9strict_utf8_string_loclen>>.
400 #define is_utf8_invariant_string(s, len) \
401 is_utf8_invariant_string_loc(s, len, NULL)
404 =for apidoc is_utf8_invariant_string_loc
406 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
407 the first UTF-8 variant character in the C<ep> pointer; if all characters are
408 UTF-8 invariant, this function does not change the contents of C<*ep>.
414 PERL_STATIC_INLINE bool
415 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
420 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
423 len = strlen((const char *)s);
428 /* This looks like 0x010101... */
429 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
431 /* This looks like 0x808080... */
432 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
433 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
434 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
436 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
437 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
438 * optimized out completely on a 32-bit system, and its mask gets optimized out
439 * on a 64-bit system */
440 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
441 | ( PTR2nat(x) >> 1) \
443 & PERL_WORD_BOUNDARY_MASK) >> 2))))
447 /* Do the word-at-a-time iff there is at least one usable full word. That
448 * means that after advancing to a word boundary, there still is at least a
449 * full word left. The number of bytes needed to advance is 'wordsize -
450 * offset' unless offset is 0. */
451 if ((STRLEN) (send - x) >= PERL_WORDSIZE
453 /* This term is wordsize if subword; 0 if not */
454 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
457 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
460 /* Process per-byte until reach word boundary. XXX This loop could be
461 * eliminated if we knew that this platform had fast unaligned reads */
462 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
463 if (! UTF8_IS_INVARIANT(*x)) {
473 /* Here, we know we have at least one full word to process. Process
474 * per-word as long as we have at least a full word left */
476 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
478 /* Found a variant. Just return if caller doesn't want its
484 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
485 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
487 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
488 assert(*ep >= s && *ep < send);
492 # else /* If weird byte order, drop into next loop to do byte-at-a-time
501 } while (x + PERL_WORDSIZE <= send);
504 #endif /* End of ! EBCDIC */
506 /* Process per-byte */
508 if (! UTF8_IS_INVARIANT(*x)) {
524 PERL_STATIC_INLINE unsigned int
525 Perl_variant_byte_number(PERL_UINTMAX_T word)
528 /* This returns the position in a word (0..7) of the first variant byte in
529 * it. This is a helper function. Note that there are no branches */
533 /* Get just the msb bits of each byte */
534 word &= PERL_VARIANTS_WORD_MASK;
536 # ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
537 easiest thing is to hide that from the callers */
540 const U8 * s = (U8 *) &word;
543 for (i = 0; i < sizeof(word); i++ ) {
549 Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
553 # elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
555 /* Bytes are stored like
556 * Byte8 ... Byte2 Byte1
557 * 63..56...15...8 7...0
560 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
562 * The word will look this this, with a rightmost set bit in position 's':
563 * ('x's are don't cares)
566 * x..xx10..0 Right shift (rightmost 0 is shifted off)
567 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
568 * the 1 just to their left into a 0; the remainder is
570 * 0..0011..1 The xor with the original, x..xx10..0, clears that
571 * remainder, sets the bottom to all 1
572 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
574 * Another method is to do 'word &= -word'; but it generates a compiler
575 * message on some platforms about taking the negative of an unsigned */
578 word = 1 + (word ^ (word - 1));
580 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
582 /* Bytes are stored like
583 * Byte1 Byte2 ... Byte8
584 * 63..56 55..47 ... 7...0
586 * Isolate the msb; http://codeforces.com/blog/entry/10330
588 * Only the most significant set bit matters. Or'ing word with its right
589 * shift of 1 makes that bit and the next one to its right both 1. Then
590 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
591 * msb and all to the right being 1. */
597 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
599 /* Then subtracting the right shift by 1 clears all but the left-most of
600 * the 1 bits, which is our desired result */
604 # error Unexpected byte order
607 /* Here 'word' has a single bit set: the msb of the first byte in which it
608 * is set. Calculate that position in the word. We can use this
609 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
610 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
611 * just get shifted off at compile time) */
612 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
613 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
614 | (39 << 24) | (47 << 16)
615 | (55 << 8) | (63 << 0));
616 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
618 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
619 word = ((word + 1) >> 3) - 1;
621 # if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
623 /* And invert the result */
624 word = CHARBITS - word - 1;
628 return (unsigned int) word;
632 #if defined(PERL_CORE) || defined(PERL_EXT)
635 =for apidoc variant_under_utf8_count
637 This function looks at the sequence of bytes between C<s> and C<e>, which are
638 assumed to be encoded in ASCII/Latin1, and returns how many of them would
639 change should the string be translated into UTF-8. Due to the nature of UTF-8,
640 each of these would occupy two bytes instead of the single one in the input
641 string. Thus, this function returns the precise number of bytes the string
642 would expand by when translated to UTF-8.
644 Unlike most of the other functions that have C<utf8> in their name, the input
645 to this function is NOT a UTF-8-encoded string. The function name is slightly
646 I<odd> to emphasize this.
648 This function is internal to Perl because khw thinks that any XS code that
649 would want this is probably operating too close to the internals. Presenting a
650 valid use case could change that.
653 C<L<perlapi/is_utf8_invariant_string>>
655 C<L<perlapi/is_utf8_invariant_string_loc>>,
661 PERL_STATIC_INLINE Size_t
662 S_variant_under_utf8_count(const U8* const s, const U8* const e)
667 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
671 /* Test if the string is long enough to use word-at-a-time. (Logic is the
672 * same as for is_utf8_invariant_string()) */
673 if ((STRLEN) (e - x) >= PERL_WORDSIZE
674 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
675 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
678 /* Process per-byte until reach word boundary. XXX This loop could be
679 * eliminated if we knew that this platform had fast unaligned reads */
680 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
681 count += ! UTF8_IS_INVARIANT(*x++);
684 /* Process per-word as long as we have at least a full word left */
685 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
686 explanation of how this works */
687 PERL_UINTMAX_T increment
688 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
689 * PERL_COUNT_MULTIPLIER)
690 >> ((PERL_WORDSIZE - 1) * CHARBITS);
691 count += (Size_t) increment;
693 } while (x + PERL_WORDSIZE <= e);
698 /* Process per-byte */
700 if (! UTF8_IS_INVARIANT(*x)) {
712 #ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
713 # undef PERL_WORDSIZE
714 # undef PERL_COUNT_MULTIPLIER
715 # undef PERL_WORD_BOUNDARY_MASK
716 # undef PERL_VARIANTS_WORD_MASK
720 =for apidoc is_utf8_string
722 Returns TRUE if the first C<len> bytes of string C<s> form a valid
723 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
724 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
725 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
726 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
728 This function considers Perl's extended UTF-8 to be valid. That means that
729 code points above Unicode, surrogates, and non-character code points are
730 considered valid by this function. Use C<L</is_strict_utf8_string>>,
731 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
732 code points are considered valid.
735 C<L</is_utf8_invariant_string>>,
736 C<L</is_utf8_invariant_string_loc>>,
737 C<L</is_utf8_string_loc>>,
738 C<L</is_utf8_string_loclen>>,
739 C<L</is_utf8_fixed_width_buf_flags>>,
740 C<L</is_utf8_fixed_width_buf_loc_flags>>,
741 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
746 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
748 #if defined(PERL_CORE) || defined (PERL_EXT)
751 =for apidoc is_utf8_non_invariant_string
753 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
754 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
755 UTF-8; otherwise returns FALSE.
757 A TRUE return means that at least one code point represented by the sequence
758 either is a wide character not representable as a single byte, or the
759 representation differs depending on whether the sequence is encoded in UTF-8 or
763 C<L<perlapi/is_utf8_invariant_string>>,
764 C<L<perlapi/is_utf8_string>>
768 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
769 It generally needn't be if its string is entirely UTF-8 invariant, and it
770 shouldn't be if it otherwise contains invalid UTF-8.
772 It is an internal function because khw thinks that XS code shouldn't be working
773 at this low a level. A valid use case could change that.
777 PERL_STATIC_INLINE bool
778 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
780 const U8 * first_variant;
782 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
784 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
788 return is_utf8_string(first_variant, len - (first_variant - s));
794 =for apidoc is_strict_utf8_string
796 Returns TRUE if the first C<len> bytes of string C<s> form a valid
797 UTF-8-encoded string that is fully interchangeable by any application using
798 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
799 calculated using C<strlen(s)> (which means if you use this option, that C<s>
800 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
801 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
803 This function returns FALSE for strings containing any
804 code points above the Unicode max of 0x10FFFF, surrogate code points, or
805 non-character code points.
808 C<L</is_utf8_invariant_string>>,
809 C<L</is_utf8_invariant_string_loc>>,
810 C<L</is_utf8_string>>,
811 C<L</is_utf8_string_flags>>,
812 C<L</is_utf8_string_loc>>,
813 C<L</is_utf8_string_loc_flags>>,
814 C<L</is_utf8_string_loclen>>,
815 C<L</is_utf8_string_loclen_flags>>,
816 C<L</is_utf8_fixed_width_buf_flags>>,
817 C<L</is_utf8_fixed_width_buf_loc_flags>>,
818 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
819 C<L</is_strict_utf8_string_loc>>,
820 C<L</is_strict_utf8_string_loclen>>,
821 C<L</is_c9strict_utf8_string>>,
822 C<L</is_c9strict_utf8_string_loc>>,
824 C<L</is_c9strict_utf8_string_loclen>>.
829 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
832 =for apidoc is_c9strict_utf8_string
834 Returns TRUE if the first C<len> bytes of string C<s> form a valid
835 UTF-8-encoded string that conforms to
836 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
837 otherwise it returns FALSE. If C<len> is 0, it will be calculated using
838 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
839 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
840 characters being ASCII constitute 'a valid UTF-8 string'.
842 This function returns FALSE for strings containing any code points above the
843 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
845 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
848 C<L</is_utf8_invariant_string>>,
849 C<L</is_utf8_invariant_string_loc>>,
850 C<L</is_utf8_string>>,
851 C<L</is_utf8_string_flags>>,
852 C<L</is_utf8_string_loc>>,
853 C<L</is_utf8_string_loc_flags>>,
854 C<L</is_utf8_string_loclen>>,
855 C<L</is_utf8_string_loclen_flags>>,
856 C<L</is_utf8_fixed_width_buf_flags>>,
857 C<L</is_utf8_fixed_width_buf_loc_flags>>,
858 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
859 C<L</is_strict_utf8_string>>,
860 C<L</is_strict_utf8_string_loc>>,
861 C<L</is_strict_utf8_string_loclen>>,
862 C<L</is_c9strict_utf8_string_loc>>,
864 C<L</is_c9strict_utf8_string_loclen>>.
869 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
872 =for apidoc is_utf8_string_flags
874 Returns TRUE if the first C<len> bytes of string C<s> form a valid
875 UTF-8 string, subject to the restrictions imposed by C<flags>;
876 returns FALSE otherwise. If C<len> is 0, it will be calculated
877 using C<strlen(s)> (which means if you use this option, that C<s> can't have
878 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
879 that all characters being ASCII constitute 'a valid UTF-8 string'.
881 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
882 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
883 as C<L</is_strict_utf8_string>>; and if C<flags> is
884 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
885 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
886 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
887 C<L</utf8n_to_uvchr>>, with the same meanings.
890 C<L</is_utf8_invariant_string>>,
891 C<L</is_utf8_invariant_string_loc>>,
892 C<L</is_utf8_string>>,
893 C<L</is_utf8_string_loc>>,
894 C<L</is_utf8_string_loc_flags>>,
895 C<L</is_utf8_string_loclen>>,
896 C<L</is_utf8_string_loclen_flags>>,
897 C<L</is_utf8_fixed_width_buf_flags>>,
898 C<L</is_utf8_fixed_width_buf_loc_flags>>,
899 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
900 C<L</is_strict_utf8_string>>,
901 C<L</is_strict_utf8_string_loc>>,
902 C<L</is_strict_utf8_string_loclen>>,
903 C<L</is_c9strict_utf8_string>>,
904 C<L</is_c9strict_utf8_string_loc>>,
906 C<L</is_c9strict_utf8_string_loclen>>.
911 PERL_STATIC_INLINE bool
912 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
914 const U8 * first_variant;
916 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
917 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
918 |UTF8_DISALLOW_PERL_EXTENDED)));
921 len = strlen((const char *)s);
925 return is_utf8_string(s, len);
928 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
929 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
931 return is_strict_utf8_string(s, len);
934 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
935 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
937 return is_c9strict_utf8_string(s, len);
940 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
941 const U8* const send = s + len;
942 const U8* x = first_variant;
945 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
946 if (UNLIKELY(! cur_len)) {
958 =for apidoc is_utf8_string_loc
960 Like C<L</is_utf8_string>> but stores the location of the failure (in the
961 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
962 "utf8ness success") in the C<ep> pointer.
964 See also C<L</is_utf8_string_loclen>>.
969 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
973 =for apidoc is_utf8_string_loclen
975 Like C<L</is_utf8_string>> but stores the location of the failure (in the
976 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
977 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
978 encoded characters in the C<el> pointer.
980 See also C<L</is_utf8_string_loc>>.
985 PERL_STATIC_INLINE bool
986 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
988 const U8 * first_variant;
990 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
993 len = strlen((const char *) s);
996 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1008 const U8* const send = s + len;
1009 const U8* x = first_variant;
1010 STRLEN outlen = first_variant - s;
1013 const STRLEN cur_len = isUTF8_CHAR(x, send);
1014 if (UNLIKELY(! cur_len)) {
1034 =for apidoc isUTF8_CHAR
1036 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1037 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1038 that represents some code point; otherwise it evaluates to 0. If non-zero, the
1039 value gives how many bytes starting at C<s> comprise the code point's
1040 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1041 form the first code point in C<s>, are not examined.
1043 The code point can be any that will fit in an IV on this machine, using Perl's
1044 extension to official UTF-8 to represent those higher than the Unicode maximum
1045 of 0x10FFFF. That means that this macro is used to efficiently decide if the
1046 next few bytes in C<s> is legal UTF-8 for a single character.
1048 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1049 defined by Unicode to be fully interchangeable across applications;
1050 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1051 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1052 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1054 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1055 C<L</is_utf8_string_loclen>> to check entire strings.
1057 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1058 machines) is a valid UTF-8 character.
1062 This uses an adaptation of the table and algorithm given in
1063 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1064 documentation of the original version. A copyright notice for the original
1065 version is given at the beginning of this file. The Perl adapation is
1066 documented at the definition of PL_extended_utf8_dfa_tab[].
1070 PERL_STATIC_INLINE Size_t
1071 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1076 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1078 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1079 * code point, which can be returned immediately. Otherwise, it is either
1080 * malformed, or for the start byte FF which the dfa doesn't handle (except
1081 * on 32-bit ASCII platforms where it trivially is an error). Call a
1082 * helper function for the other platforms. */
1084 while (s < e && LIKELY(state != 1)) {
1085 state = PL_extended_utf8_dfa_tab[256
1087 + PL_extended_utf8_dfa_tab[*s]];
1096 #if defined(UV_IS_QUAD) || defined(EBCDIC)
1098 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1099 return is_utf8_char_helper(s0, e, 0);
1109 =for apidoc isSTRICT_UTF8_CHAR
1111 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1112 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1113 Unicode code point completely acceptable for open interchange between all
1114 applications; otherwise it evaluates to 0. If non-zero, the value gives how
1115 many bytes starting at C<s> comprise the code point's representation. Any
1116 bytes remaining before C<e>, but beyond the ones needed to form the first code
1117 point in C<s>, are not examined.
1119 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1120 be a surrogate nor a non-character code point. Thus this excludes any code
1121 point from Perl's extended UTF-8.
1123 This is used to efficiently decide if the next few bytes in C<s> is
1124 legal Unicode-acceptable UTF-8 for a single character.
1126 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1127 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1128 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1129 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1131 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1132 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1136 This uses an adaptation of the tables and algorithm given in
1137 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1138 documentation of the original version. A copyright notice for the original
1139 version is given at the beginning of this file. The Perl adapation is
1140 documented at the definition of strict_extended_utf8_dfa_tab[].
1144 PERL_STATIC_INLINE Size_t
1145 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1150 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1152 while (s < e && LIKELY(state != 1)) {
1153 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
1165 /* The dfa above drops out for certain Hanguls; handle them specially */
1166 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1177 =for apidoc isC9_STRICT_UTF8_CHAR
1179 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1180 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1181 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1182 the value gives how many bytes starting at C<s> comprise the code point's
1183 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1184 form the first code point in C<s>, are not examined.
1186 The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1187 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1188 code points. This corresponds to
1189 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1190 which said that non-character code points are merely discouraged rather than
1191 completely forbidden in open interchange. See
1192 L<perlunicode/Noncharacter code points>.
1194 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1195 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1197 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1198 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1202 This uses an adaptation of the tables and algorithm given in
1203 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1204 documentation of the original version. A copyright notice for the original
1205 version is given at the beginning of this file. The Perl adapation is
1206 documented at the definition of PL_c9_utf8_dfa_tab[].
1210 PERL_STATIC_INLINE Size_t
1211 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1216 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1218 while (s < e && LIKELY(state != 1)) {
1219 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
1234 =for apidoc is_strict_utf8_string_loc
1236 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1237 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1238 "utf8ness success") in the C<ep> pointer.
1240 See also C<L</is_strict_utf8_string_loclen>>.
1245 #define is_strict_utf8_string_loc(s, len, ep) \
1246 is_strict_utf8_string_loclen(s, len, ep, 0)
1250 =for apidoc is_strict_utf8_string_loclen
1252 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1253 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1254 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1255 encoded characters in the C<el> pointer.
1257 See also C<L</is_strict_utf8_string_loc>>.
1262 PERL_STATIC_INLINE bool
1263 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1265 const U8 * first_variant;
1267 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1270 len = strlen((const char *) s);
1273 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1285 const U8* const send = s + len;
1286 const U8* x = first_variant;
1287 STRLEN outlen = first_variant - s;
1290 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1291 if (UNLIKELY(! cur_len)) {
1311 =for apidoc is_c9strict_utf8_string_loc
1313 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1314 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1315 "utf8ness success") in the C<ep> pointer.
1317 See also C<L</is_c9strict_utf8_string_loclen>>.
1322 #define is_c9strict_utf8_string_loc(s, len, ep) \
1323 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1327 =for apidoc is_c9strict_utf8_string_loclen
1329 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1330 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1331 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1332 characters in the C<el> pointer.
1334 See also C<L</is_c9strict_utf8_string_loc>>.
1339 PERL_STATIC_INLINE bool
1340 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1342 const U8 * first_variant;
1344 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1347 len = strlen((const char *) s);
1350 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1362 const U8* const send = s + len;
1363 const U8* x = first_variant;
1364 STRLEN outlen = first_variant - s;
1367 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1368 if (UNLIKELY(! cur_len)) {
1388 =for apidoc is_utf8_string_loc_flags
1390 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1391 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1392 "utf8ness success") in the C<ep> pointer.
1394 See also C<L</is_utf8_string_loclen_flags>>.
1399 #define is_utf8_string_loc_flags(s, len, ep, flags) \
1400 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1403 /* The above 3 actual functions could have been moved into the more general one
1404 * just below, and made #defines that call it with the right 'flags'. They are
1405 * currently kept separate to increase their chances of getting inlined */
1409 =for apidoc is_utf8_string_loclen_flags
1411 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1412 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1413 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1414 encoded characters in the C<el> pointer.
1416 See also C<L</is_utf8_string_loc_flags>>.
1421 PERL_STATIC_INLINE bool
1422 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1424 const U8 * first_variant;
1426 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1427 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1428 |UTF8_DISALLOW_PERL_EXTENDED)));
1431 len = strlen((const char *) s);
1435 return is_utf8_string_loclen(s, len, ep, el);
1438 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1439 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1441 return is_strict_utf8_string_loclen(s, len, ep, el);
1444 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1445 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1447 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1450 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1462 const U8* send = s + len;
1463 const U8* x = first_variant;
1464 STRLEN outlen = first_variant - s;
1467 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1468 if (UNLIKELY(! cur_len)) {
1487 =for apidoc utf8_distance
1489 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1492 WARNING: use only if you *know* that the pointers point inside the
1498 PERL_STATIC_INLINE IV
1499 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1501 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1503 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1507 =for apidoc utf8_hop
1509 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1510 forward or backward.
1512 WARNING: do not use the following unless you *know* C<off> is within
1513 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1514 on the first byte of character or just after the last byte of a character.
1519 PERL_STATIC_INLINE U8 *
1520 Perl_utf8_hop(const U8 *s, SSize_t off)
1522 PERL_ARGS_ASSERT_UTF8_HOP;
1524 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1525 * the bitops (especially ~) can create illegal UTF-8.
1526 * In other words: in Perl UTF-8 is not just for Unicode. */
1535 while (UTF8_IS_CONTINUATION(*s))
1539 GCC_DIAG_IGNORE(-Wcast-qual)
1545 =for apidoc utf8_hop_forward
1547 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1550 C<off> must be non-negative.
1552 C<s> must be before or equal to C<end>.
1554 When moving forward it will not move beyond C<end>.
1556 Will not exceed this limit even if the string is not valid "UTF-8".
1561 PERL_STATIC_INLINE U8 *
1562 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1564 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1566 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1567 * the bitops (especially ~) can create illegal UTF-8.
1568 * In other words: in Perl UTF-8 is not just for Unicode. */
1574 STRLEN skip = UTF8SKIP(s);
1575 if ((STRLEN)(end - s) <= skip) {
1576 GCC_DIAG_IGNORE(-Wcast-qual)
1583 GCC_DIAG_IGNORE(-Wcast-qual)
1589 =for apidoc utf8_hop_back
1591 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1594 C<off> must be non-positive.
1596 C<s> must be after or equal to C<start>.
1598 When moving backward it will not move before C<start>.
1600 Will not exceed this limit even if the string is not valid "UTF-8".
1605 PERL_STATIC_INLINE U8 *
1606 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1608 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1610 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1611 * the bitops (especially ~) can create illegal UTF-8.
1612 * In other words: in Perl UTF-8 is not just for Unicode. */
1617 while (off++ && s > start) {
1620 } while (UTF8_IS_CONTINUATION(*s) && s > start);
1623 GCC_DIAG_IGNORE(-Wcast-qual)
1629 =for apidoc utf8_hop_safe
1631 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1632 either forward or backward.
1634 When moving backward it will not move before C<start>.
1636 When moving forward it will not move beyond C<end>.
1638 Will not exceed those limits even if the string is not valid "UTF-8".
1643 PERL_STATIC_INLINE U8 *
1644 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1646 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1648 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1649 * the bitops (especially ~) can create illegal UTF-8.
1650 * In other words: in Perl UTF-8 is not just for Unicode. */
1652 assert(start <= s && s <= end);
1655 return utf8_hop_forward(s, off, end);
1658 return utf8_hop_back(s, off, start);
1664 =for apidoc is_utf8_valid_partial_char
1666 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1667 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1668 points. Otherwise, it returns 1 if there exists at least one non-empty
1669 sequence of bytes that when appended to sequence C<s>, starting at position
1670 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1671 otherwise returns 0.
1673 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1676 This is useful when a fixed-length buffer is being tested for being well-formed
1677 UTF-8, but the final few bytes in it don't comprise a full character; that is,
1678 it is split somewhere in the middle of the final code point's UTF-8
1679 representation. (Presumably when the buffer is refreshed with the next chunk
1680 of data, the new first bytes will complete the partial code point.) This
1681 function is used to verify that the final bytes in the current buffer are in
1682 fact the legal beginning of some code point, so that if they aren't, the
1683 failure can be signalled without having to wait for the next read.
1687 #define is_utf8_valid_partial_char(s, e) \
1688 is_utf8_valid_partial_char_flags(s, e, 0)
1692 =for apidoc is_utf8_valid_partial_char_flags
1694 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1695 or not the input is a valid UTF-8 encoded partial character, but it takes an
1696 extra parameter, C<flags>, which can further restrict which code points are
1699 If C<flags> is 0, this behaves identically to
1700 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1701 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1702 there is any sequence of bytes that can complete the input partial character in
1703 such a way that a non-prohibited character is formed, the function returns
1704 TRUE; otherwise FALSE. Non character code points cannot be determined based on
1705 partial character input. But many of the other possible excluded types can be
1706 determined from just the first one or two bytes.
1711 PERL_STATIC_INLINE bool
1712 Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1714 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
1716 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1717 |UTF8_DISALLOW_PERL_EXTENDED)));
1719 if (s >= e || s + UTF8SKIP(s) <= e) {
1723 return cBOOL(is_utf8_char_helper(s, e, flags));
1728 =for apidoc is_utf8_fixed_width_buf_flags
1730 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1731 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1732 otherwise it returns FALSE.
1734 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1735 without restriction. If the final few bytes of the buffer do not form a
1736 complete code point, this will return TRUE anyway, provided that
1737 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1739 If C<flags> in non-zero, it can be any combination of the
1740 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1743 This function differs from C<L</is_utf8_string_flags>> only in that the latter
1744 returns FALSE if the final few bytes of the string don't form a complete code
1749 #define is_utf8_fixed_width_buf_flags(s, len, flags) \
1750 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1754 =for apidoc is_utf8_fixed_width_buf_loc_flags
1756 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1757 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1758 to the beginning of any partial character at the end of the buffer; if there is
1759 no partial character C<*ep> will contain C<s>+C<len>.
1761 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1766 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1767 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1771 =for apidoc is_utf8_fixed_width_buf_loclen_flags
1773 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1774 complete, valid characters found in the C<el> pointer.
1779 PERL_STATIC_INLINE bool
1780 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1786 const U8 * maybe_partial;
1788 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1791 ep = &maybe_partial;
1794 /* If it's entirely valid, return that; otherwise see if the only error is
1795 * that the final few bytes are for a partial character */
1796 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1797 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1800 PERL_STATIC_INLINE UV
1801 Perl_utf8n_to_uvchr_msgs(const U8 *s,
1808 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1809 * simple cases, and, if necessary calls a helper function to deal with the
1810 * more complex ones. Almost all well-formed non-problematic code points
1811 * are considered simple, so that it's unlikely that the helper function
1812 * will need to be called.
1814 * This is an adaptation of the tables and algorithm given in
1815 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1816 * comprehensive documentation of the original version. A copyright notice
1817 * for the original version is given at the beginning of this file. The
1818 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
1821 const U8 * const s0 = s;
1822 const U8 * send = s0 + curlen;
1823 UV uv = 0; /* The 0 silences some stupid compilers */
1826 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1828 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1829 * non-problematic code point, which can be returned immediately.
1830 * Otherwise we call a helper function to figure out the more complicated
1833 while (s < send && LIKELY(state != 1)) {
1834 UV type = PL_strict_utf8_dfa_tab[*s];
1837 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1838 : UTF8_ACCUMULATE(uv, *s);
1839 state = PL_strict_utf8_dfa_tab[256 + state + type];
1847 *retlen = s - s0 + 1;
1856 return UNI_TO_NATIVE(uv);
1859 /* Here is potentially problematic. Use the full mechanism */
1860 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1863 PERL_STATIC_INLINE UV
1864 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1866 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
1870 if (! ckWARN_d(WARN_UTF8)) {
1872 /* EMPTY is not really allowed, and asserts on debugging builds. But
1873 * on non-debugging we have to deal with it, and this causes it to
1874 * return the REPLACEMENT CHARACTER, as the documentation indicates */
1875 return utf8n_to_uvchr(s, send - s, retlen,
1876 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
1879 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1880 if (retlen && ret == 0 && *s != '\0') {
1881 *retlen = (STRLEN) -1;
1888 /* ------------------------------- perl.h ----------------------------- */
1891 =head1 Miscellaneous Functions
1893 =for apidoc is_safe_syscall
1895 Test that the given C<pv> (with length C<len>) doesn't contain any internal
1897 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1898 category, and return FALSE.
1900 Return TRUE if the name is safe.
1902 C<what> and C<op_name> are used in any warning.
1904 Used by the C<IS_SAFE_SYSCALL()> macro.
1909 PERL_STATIC_INLINE bool
1910 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1912 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1913 * perl itself uses xce*() functions which accept 8-bit strings.
1916 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1920 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
1921 SETERRNO(ENOENT, LIB_INVARG);
1922 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
1923 "Invalid \\0 character in %s for %s: %s\\0%s",
1924 what, op_name, pv, null_at+1);
1934 Return true if the supplied filename has a newline character
1935 immediately before the first (hopefully only) NUL.
1937 My original look at this incorrectly used the len from SvPV(), but
1938 that's incorrect, since we allow for a NUL in pv[len-1].
1940 So instead, strlen() and work from there.
1942 This allow for the user reading a filename, forgetting to chomp it,
1945 open my $foo, "$file\0";
1951 PERL_STATIC_INLINE bool
1952 S_should_warn_nl(const char *pv)
1956 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1960 return len > 0 && pv[len-1] == '\n';
1965 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1967 PERL_STATIC_INLINE bool
1968 S_lossless_NV_to_IV(const NV nv, IV *ivp)
1970 /* This function determines if the input NV 'nv' may be converted without
1971 * loss of data to an IV. If not, it returns FALSE taking no other action.
1972 * But if it is possible, it does the conversion, returning TRUE, and
1973 * storing the converted result in '*ivp' */
1975 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1977 # if defined(Perl_isnan)
1979 if (UNLIKELY(Perl_isnan(nv))) {
1985 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1989 if ((IV) nv != nv) {
1999 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2001 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2003 #define MAX_CHARSET_NAME_LENGTH 2
2005 PERL_STATIC_INLINE const char *
2006 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2008 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2010 /* Returns a string that corresponds to the name of the regex character set
2011 * given by 'flags', and *lenp is set the length of that string, which
2012 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2015 switch (get_regex_charset(flags)) {
2016 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2017 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2018 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2019 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2020 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2022 return ASCII_MORE_RESTRICT_PAT_MODS;
2024 /* The NOT_REACHED; hides an assert() which has a rather complex
2025 * definition in perl.h. */
2026 NOT_REACHED; /* NOTREACHED */
2027 return "?"; /* Unknown */
2034 Return false if any get magic is on the SV other than taint magic.
2038 PERL_STATIC_INLINE bool
2039 Perl_sv_only_taint_gmagic(SV *sv)
2041 MAGIC *mg = SvMAGIC(sv);
2043 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2046 if (mg->mg_type != PERL_MAGIC_taint
2047 && !(mg->mg_flags & MGf_GSKIP)
2048 && mg->mg_virtual->svt_get) {
2051 mg = mg->mg_moremagic;
2057 /* ------------------ cop.h ------------------------------------------- */
2059 /* implement GIMME_V() macro */
2061 PERL_STATIC_INLINE U8
2065 U8 gimme = (PL_op->op_flags & OPf_WANT);
2069 cxix = PL_curstackinfo->si_cxsubix;
2072 assert(cxstack[cxix].blk_gimme & G_WANT);
2073 return (cxstack[cxix].blk_gimme & G_WANT);
2077 /* Enter a block. Push a new base context and return its address. */
2079 PERL_STATIC_INLINE PERL_CONTEXT *
2080 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2084 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2089 cx->blk_gimme = gimme;
2090 cx->blk_oldsaveix = saveix;
2091 cx->blk_oldsp = (I32)(sp - PL_stack_base);
2092 cx->blk_oldcop = PL_curcop;
2093 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
2094 cx->blk_oldscopesp = PL_scopestack_ix;
2095 cx->blk_oldpm = PL_curpm;
2096 cx->blk_old_tmpsfloor = PL_tmps_floor;
2098 PL_tmps_floor = PL_tmps_ix;
2099 CX_DEBUG(cx, "PUSH");
2104 /* Exit a block (RETURN and LAST). */
2106 PERL_STATIC_INLINE void
2107 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2109 PERL_ARGS_ASSERT_CX_POPBLOCK;
2111 CX_DEBUG(cx, "POP");
2112 /* these 3 are common to cx_popblock and cx_topblock */
2113 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2114 PL_scopestack_ix = cx->blk_oldscopesp;
2115 PL_curpm = cx->blk_oldpm;
2117 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2118 * and leaves a CX entry lying around for repeated use, so
2119 * skip for multicall */ \
2120 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2121 || PL_savestack_ix == cx->blk_oldsaveix);
2122 PL_curcop = cx->blk_oldcop;
2123 PL_tmps_floor = cx->blk_old_tmpsfloor;
2126 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2127 * Whereas cx_popblock() restores the state to the point just before
2128 * cx_pushblock() was called, cx_topblock() restores it to the point just
2129 * *after* cx_pushblock() was called. */
2131 PERL_STATIC_INLINE void
2132 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2134 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2136 CX_DEBUG(cx, "TOP");
2137 /* these 3 are common to cx_popblock and cx_topblock */
2138 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2139 PL_scopestack_ix = cx->blk_oldscopesp;
2140 PL_curpm = cx->blk_oldpm;
2142 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2146 PERL_STATIC_INLINE void
2147 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2149 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2151 PERL_ARGS_ASSERT_CX_PUSHSUB;
2153 PERL_DTRACE_PROBE_ENTRY(cv);
2154 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2155 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2156 cx->blk_sub.cv = cv;
2157 cx->blk_sub.olddepth = CvDEPTH(cv);
2158 cx->blk_sub.prevcomppad = PL_comppad;
2159 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2160 cx->blk_sub.retop = retop;
2161 SvREFCNT_inc_simple_void_NN(cv);
2162 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2166 /* subsets of cx_popsub() */
2168 PERL_STATIC_INLINE void
2169 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2173 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2174 assert(CxTYPE(cx) == CXt_SUB);
2176 PL_comppad = cx->blk_sub.prevcomppad;
2177 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2178 cv = cx->blk_sub.cv;
2179 CvDEPTH(cv) = cx->blk_sub.olddepth;
2180 cx->blk_sub.cv = NULL;
2182 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2186 /* handle the @_ part of leaving a sub */
2188 PERL_STATIC_INLINE void
2189 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2193 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2194 assert(CxTYPE(cx) == CXt_SUB);
2195 assert(AvARRAY(MUTABLE_AV(
2196 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2197 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2199 CX_POP_SAVEARRAY(cx);
2200 av = MUTABLE_AV(PAD_SVl(0));
2201 if (UNLIKELY(AvREAL(av)))
2202 /* abandon @_ if it got reified */
2203 clear_defarray(av, 0);
2210 PERL_STATIC_INLINE void
2211 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2213 PERL_ARGS_ASSERT_CX_POPSUB;
2214 assert(CxTYPE(cx) == CXt_SUB);
2216 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2220 cx_popsub_common(cx);
2224 PERL_STATIC_INLINE void
2225 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2227 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2229 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2230 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2231 cx->blk_format.cv = cv;
2232 cx->blk_format.retop = retop;
2233 cx->blk_format.gv = gv;
2234 cx->blk_format.dfoutgv = PL_defoutgv;
2235 cx->blk_format.prevcomppad = PL_comppad;
2238 SvREFCNT_inc_simple_void_NN(cv);
2240 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2244 PERL_STATIC_INLINE void
2245 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2250 PERL_ARGS_ASSERT_CX_POPFORMAT;
2251 assert(CxTYPE(cx) == CXt_FORMAT);
2253 dfout = cx->blk_format.dfoutgv;
2255 cx->blk_format.dfoutgv = NULL;
2256 SvREFCNT_dec_NN(dfout);
2258 PL_comppad = cx->blk_format.prevcomppad;
2259 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2260 cv = cx->blk_format.cv;
2261 cx->blk_format.cv = NULL;
2263 SvREFCNT_dec_NN(cv);
2264 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2268 PERL_STATIC_INLINE void
2269 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2271 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2273 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2274 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2275 cx->blk_eval.retop = retop;
2276 cx->blk_eval.old_namesv = namesv;
2277 cx->blk_eval.old_eval_root = PL_eval_root;
2278 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2279 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2280 cx->blk_eval.cur_top_env = PL_top_env;
2282 assert(!(PL_in_eval & ~ 0x3F));
2283 assert(!(PL_op->op_type & ~0x1FF));
2284 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2288 PERL_STATIC_INLINE void
2289 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2293 PERL_ARGS_ASSERT_CX_POPEVAL;
2294 assert(CxTYPE(cx) == CXt_EVAL);
2296 PL_in_eval = CxOLD_IN_EVAL(cx);
2297 assert(!(PL_in_eval & 0xc0));
2298 PL_eval_root = cx->blk_eval.old_eval_root;
2299 sv = cx->blk_eval.cur_text;
2300 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2301 cx->blk_eval.cur_text = NULL;
2302 SvREFCNT_dec_NN(sv);
2305 sv = cx->blk_eval.old_namesv;
2307 cx->blk_eval.old_namesv = NULL;
2308 SvREFCNT_dec_NN(sv);
2310 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2314 /* push a plain loop, i.e.
2316 * while (cond) { block }
2317 * for (init;cond;continue) { block }
2318 * This loop can be last/redo'ed etc.
2321 PERL_STATIC_INLINE void
2322 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2324 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2325 cx->blk_loop.my_op = cLOOP;
2329 /* push a true for loop, i.e.
2330 * for var (list) { block }
2333 PERL_STATIC_INLINE void
2334 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2336 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2338 /* this one line is common with cx_pushloop_plain */
2339 cx->blk_loop.my_op = cLOOP;
2341 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2342 cx->blk_loop.itersave = itersave;
2344 cx->blk_loop.oldcomppad = PL_comppad;
2349 /* pop all loop types, including plain */
2351 PERL_STATIC_INLINE void
2352 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2354 PERL_ARGS_ASSERT_CX_POPLOOP;
2356 assert(CxTYPE_is_LOOP(cx));
2357 if ( CxTYPE(cx) == CXt_LOOP_ARY
2358 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2360 /* Free ary or cur. This assumes that state_u.ary.ary
2361 * aligns with state_u.lazysv.cur. See cx_dup() */
2362 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2363 cx->blk_loop.state_u.lazysv.cur = NULL;
2364 SvREFCNT_dec_NN(sv);
2365 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2366 sv = cx->blk_loop.state_u.lazysv.end;
2367 cx->blk_loop.state_u.lazysv.end = NULL;
2368 SvREFCNT_dec_NN(sv);
2371 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2373 SV **svp = (cx)->blk_loop.itervar_u.svp;
2374 if ((cx->cx_type & CXp_FOR_GV))
2375 svp = &GvSV((GV*)svp);
2377 *svp = cx->blk_loop.itersave;
2378 cx->blk_loop.itersave = NULL;
2379 SvREFCNT_dec(cursv);
2384 PERL_STATIC_INLINE void
2385 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2387 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2389 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2393 PERL_STATIC_INLINE void
2394 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2396 PERL_ARGS_ASSERT_CX_POPWHEN;
2397 assert(CxTYPE(cx) == CXt_WHEN);
2399 PERL_UNUSED_ARG(cx);
2400 PERL_UNUSED_CONTEXT;
2401 /* currently NOOP */
2405 PERL_STATIC_INLINE void
2406 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2408 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2410 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2411 cx->blk_givwhen.defsv_save = orig_defsv;
2415 PERL_STATIC_INLINE void
2416 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2420 PERL_ARGS_ASSERT_CX_POPGIVEN;
2421 assert(CxTYPE(cx) == CXt_GIVEN);
2423 sv = GvSV(PL_defgv);
2424 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2425 cx->blk_givwhen.defsv_save = NULL;
2429 /* ------------------ util.h ------------------------------------------- */
2432 =head1 Miscellaneous Functions
2436 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2438 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2439 match themselves and their opposite case counterparts. Non-cased and non-ASCII
2440 range bytes match only themselves.
2445 PERL_STATIC_INLINE I32
2446 Perl_foldEQ(const char *s1, const char *s2, I32 len)
2448 const U8 *a = (const U8 *)s1;
2449 const U8 *b = (const U8 *)s2;
2451 PERL_ARGS_ASSERT_FOLDEQ;
2456 if (*a != *b && *a != PL_fold[*b])
2463 PERL_STATIC_INLINE I32
2464 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2466 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2467 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2468 * does not check for this. Nor does it check that the strings each have
2469 * at least 'len' characters. */
2471 const U8 *a = (const U8 *)s1;
2472 const U8 *b = (const U8 *)s2;
2474 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2479 if (*a != *b && *a != PL_fold_latin1[*b]) {
2488 =for apidoc foldEQ_locale
2490 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2491 same case-insensitively in the current locale; false otherwise.
2496 PERL_STATIC_INLINE I32
2497 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2500 const U8 *a = (const U8 *)s1;
2501 const U8 *b = (const U8 *)s2;
2503 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2508 if (*a != *b && *a != PL_fold_locale[*b])
2515 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2517 PERL_STATIC_INLINE void *
2518 S_my_memrchr(const char * s, const char c, const STRLEN len)
2520 /* memrchr(), since many platforms lack it */
2522 const char * t = s + len - 1;
2524 PERL_ARGS_ASSERT_MY_MEMRCHR;
2539 * ex: set ts=8 sts=4 sw=4 et: