3 * Copyright (C) 2012 by Larry Wall and others
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
8 * This file contains tables and code adapted from
9 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
37 * Each section names the header file that the functions "belong" to.
40 /* ------------------------------- av.h ------------------------------- */
42 PERL_STATIC_INLINE SSize_t
43 Perl_av_top_index(pTHX_ AV *av)
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
51 /* ------------------------------- cv.h ------------------------------- */
53 PERL_STATIC_INLINE GV *
54 Perl_CvGV(pTHX_ CV *sv)
56 PERL_ARGS_ASSERT_CVGV;
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
63 PERL_STATIC_INLINE I32 *
64 Perl_CvDEPTH(const CV * const sv)
66 PERL_ARGS_ASSERT_CVDEPTH;
67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
69 return &((XPVCV*)SvANY(sv))->xcv_depth;
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
84 PERL_STATIC_INLINE char *
85 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
97 *len = tmps - SvPVX(tmpsv);
102 /* ------------------------------- mg.h ------------------------------- */
104 #if defined(PERL_CORE) || defined(PERL_EXT)
105 /* assumes get-magic and stringification have already occurred */
106 PERL_STATIC_INLINE STRLEN
107 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
122 /* ------------------------------- pad.h ------------------------------ */
124 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125 PERL_STATIC_INLINE bool
126 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
159 /* ------------------------------- pp.h ------------------------------- */
161 PERL_STATIC_INLINE I32
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
165 "MARK top %p %" IVdf "\n",
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
171 PERL_STATIC_INLINE I32
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
175 "MARK pop %p %" IVdf "\n",
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
182 /* ----------------------------- regexp.h ----------------------------- */
184 PERL_STATIC_INLINE struct regexp *
185 Perl_ReANY(const REGEXP * const re)
187 XPV* const p = (XPV*)SvANY(re);
188 assert(isREGEXP(re));
189 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
190 : (struct regexp *)p;
193 /* ------------------------------- sv.h ------------------------------- */
195 PERL_STATIC_INLINE SV *
196 Perl_SvREFCNT_inc(SV *sv)
198 if (LIKELY(sv != NULL))
202 PERL_STATIC_INLINE SV *
203 Perl_SvREFCNT_inc_NN(SV *sv)
205 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
210 PERL_STATIC_INLINE void
211 Perl_SvREFCNT_inc_void(SV *sv)
213 if (LIKELY(sv != NULL))
216 PERL_STATIC_INLINE void
217 Perl_SvREFCNT_dec(pTHX_ SV *sv)
219 if (LIKELY(sv != NULL)) {
220 U32 rc = SvREFCNT(sv);
222 SvREFCNT(sv) = rc - 1;
224 Perl_sv_free2(aTHX_ sv, rc);
228 PERL_STATIC_INLINE void
229 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
231 U32 rc = SvREFCNT(sv);
233 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
236 SvREFCNT(sv) = rc - 1;
238 Perl_sv_free2(aTHX_ sv, rc);
241 PERL_STATIC_INLINE void
242 Perl_SvAMAGIC_on(SV *sv)
244 PERL_ARGS_ASSERT_SVAMAGIC_ON;
247 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
249 PERL_STATIC_INLINE void
250 Perl_SvAMAGIC_off(SV *sv)
252 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
254 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
255 HvAMAGIC_off(SvSTASH(SvRV(sv)));
258 PERL_STATIC_INLINE U32
259 Perl_SvPADSTALE_on(SV *sv)
261 assert(!(SvFLAGS(sv) & SVs_PADTMP));
262 return SvFLAGS(sv) |= SVs_PADSTALE;
264 PERL_STATIC_INLINE U32
265 Perl_SvPADSTALE_off(SV *sv)
267 assert(!(SvFLAGS(sv) & SVs_PADTMP));
268 return SvFLAGS(sv) &= ~SVs_PADSTALE;
270 #if defined(PERL_CORE) || defined (PERL_EXT)
271 PERL_STATIC_INLINE STRLEN
272 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
274 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
276 U8 *hopped = utf8_hop((U8 *)pv, pos);
277 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
278 return (STRLEN)(hopped - (U8 *)pv);
280 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
284 /* ------------------------------- handy.h ------------------------------- */
286 /* saves machine code for a common noreturn idiom typically used in Newx*() */
287 GCC_DIAG_IGNORE_DECL(-Wunused-function);
289 Perl_croak_memory_wrap(void)
291 Perl_croak_nocontext("%s",PL_memory_wrap);
293 GCC_DIAG_RESTORE_DECL;
295 /* ------------------------------- utf8.h ------------------------------- */
298 =head1 Unicode Support
301 PERL_STATIC_INLINE void
302 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
304 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
305 * encoded string at '*dest', updating '*dest' to include it */
307 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
309 if (NATIVE_BYTE_IS_INVARIANT(byte))
312 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
313 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
318 =for apidoc valid_utf8_to_uvchr
319 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
320 known that the next character in the input UTF-8 string C<s> is well-formed
321 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
322 points, and non-Unicode code points are allowed.
328 PERL_STATIC_INLINE UV
329 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
331 const UV expectlen = UTF8SKIP(s);
332 const U8* send = s + expectlen;
335 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
341 /* An invariant is trivially returned */
342 if (expectlen == 1) {
346 /* Remove the leading bits that indicate the number of bytes, leaving just
347 * the bits that are part of the value */
348 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
350 /* Now, loop through the remaining bytes, accumulating each into the
351 * working total as we go. (I khw tried unrolling the loop for up to 4
352 * bytes, but there was no performance improvement) */
353 for (++s; s < send; s++) {
354 uv = UTF8_ACCUMULATE(uv, *s);
357 return UNI_TO_NATIVE(uv);
362 =for apidoc is_utf8_invariant_string
364 Returns TRUE if the first C<len> bytes of the string C<s> are the same
365 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
366 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
367 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
368 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
369 characters are invariant, but so also are the C1 controls.
371 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
372 use this option, that C<s> can't have embedded C<NUL> characters and has to
373 have a terminating C<NUL> byte).
376 C<L</is_utf8_string>>,
377 C<L</is_utf8_string_flags>>,
378 C<L</is_utf8_string_loc>>,
379 C<L</is_utf8_string_loc_flags>>,
380 C<L</is_utf8_string_loclen>>,
381 C<L</is_utf8_string_loclen_flags>>,
382 C<L</is_utf8_fixed_width_buf_flags>>,
383 C<L</is_utf8_fixed_width_buf_loc_flags>>,
384 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
385 C<L</is_strict_utf8_string>>,
386 C<L</is_strict_utf8_string_loc>>,
387 C<L</is_strict_utf8_string_loclen>>,
388 C<L</is_c9strict_utf8_string>>,
389 C<L</is_c9strict_utf8_string_loc>>,
391 C<L</is_c9strict_utf8_string_loclen>>.
397 #define is_utf8_invariant_string(s, len) \
398 is_utf8_invariant_string_loc(s, len, NULL)
401 =for apidoc is_utf8_invariant_string_loc
403 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
404 the first UTF-8 variant character in the C<ep> pointer; if all characters are
405 UTF-8 invariant, this function does not change the contents of C<*ep>.
411 PERL_STATIC_INLINE bool
412 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
417 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
420 len = strlen((const char *)s);
425 /* This looks like 0x010101... */
426 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
428 /* This looks like 0x808080... */
429 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
430 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
431 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
433 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
434 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
435 * optimized out completely on a 32-bit system, and its mask gets optimized out
436 * on a 64-bit system */
437 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
438 | ( PTR2nat(x) >> 1) \
440 & PERL_WORD_BOUNDARY_MASK) >> 2))))
444 /* Do the word-at-a-time iff there is at least one usable full word. That
445 * means that after advancing to a word boundary, there still is at least a
446 * full word left. The number of bytes needed to advance is 'wordsize -
447 * offset' unless offset is 0. */
448 if ((STRLEN) (send - x) >= PERL_WORDSIZE
450 /* This term is wordsize if subword; 0 if not */
451 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
454 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
457 /* Process per-byte until reach word boundary. XXX This loop could be
458 * eliminated if we knew that this platform had fast unaligned reads */
459 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
460 if (! UTF8_IS_INVARIANT(*x)) {
470 /* Here, we know we have at least one full word to process. Process
471 * per-word as long as we have at least a full word left */
473 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
475 /* Found a variant. Just return if caller doesn't want its
481 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
482 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
484 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
485 assert(*ep >= s && *ep < send);
489 # else /* If weird byte order, drop into next loop to do byte-at-a-time
498 } while (x + PERL_WORDSIZE <= send);
501 #endif /* End of ! EBCDIC */
503 /* Process per-byte */
505 if (! UTF8_IS_INVARIANT(*x)) {
521 PERL_STATIC_INLINE unsigned int
522 Perl_variant_byte_number(PERL_UINTMAX_T word)
525 /* This returns the position in a word (0..7) of the first variant byte in
526 * it. This is a helper function. Note that there are no branches */
530 /* Get just the msb bits of each byte */
531 word &= PERL_VARIANTS_WORD_MASK;
533 # ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
534 easiest thing is to hide that from the callers */
537 const U8 * s = (U8 *) &word;
540 for (i = 0; i < sizeof(word); i++ ) {
546 Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
550 # elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
552 /* Bytes are stored like
553 * Byte8 ... Byte2 Byte1
554 * 63..56...15...8 7...0
557 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
559 * The word will look this this, with a rightmost set bit in position 's':
560 * ('x's are don't cares)
563 * x..xx10..0 Right shift (rightmost 0 is shifted off)
564 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
565 * the 1 just to their left into a 0; the remainder is
567 * 0..0011..1 The xor with the original, x..xx10..0, clears that
568 * remainder, sets the bottom to all 1
569 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
571 * Another method is to do 'word &= -word'; but it generates a compiler
572 * message on some platforms about taking the negative of an unsigned */
575 word = 1 + (word ^ (word - 1));
577 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
579 /* Bytes are stored like
580 * Byte1 Byte2 ... Byte8
581 * 63..56 55..47 ... 7...0
583 * Isolate the msb; http://codeforces.com/blog/entry/10330
585 * Only the most significant set bit matters. Or'ing word with its right
586 * shift of 1 makes that bit and the next one to its right both 1. Then
587 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
588 * msb and all to the right being 1. */
594 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
596 /* Then subtracting the right shift by 1 clears all but the left-most of
597 * the 1 bits, which is our desired result */
601 # error Unexpected byte order
604 /* Here 'word' has a single bit set: the msb of the first byte in which it
605 * is set. Calculate that position in the word. We can use this
606 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
607 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
608 * just get shifted off at compile time) */
609 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
610 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
611 | (39 << 24) | (47 << 16)
612 | (55 << 8) | (63 << 0));
613 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
615 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
616 word = ((word + 1) >> 3) - 1;
618 # if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
620 /* And invert the result */
621 word = CHARBITS - word - 1;
625 return (unsigned int) word;
629 #if defined(PERL_CORE) || defined(PERL_EXT)
632 =for apidoc variant_under_utf8_count
634 This function looks at the sequence of bytes between C<s> and C<e>, which are
635 assumed to be encoded in ASCII/Latin1, and returns how many of them would
636 change should the string be translated into UTF-8. Due to the nature of UTF-8,
637 each of these would occupy two bytes instead of the single one in the input
638 string. Thus, this function returns the precise number of bytes the string
639 would expand by when translated to UTF-8.
641 Unlike most of the other functions that have C<utf8> in their name, the input
642 to this function is NOT a UTF-8-encoded string. The function name is slightly
643 I<odd> to emphasize this.
645 This function is internal to Perl because khw thinks that any XS code that
646 would want this is probably operating too close to the internals. Presenting a
647 valid use case could change that.
650 C<L<perlapi/is_utf8_invariant_string>>
652 C<L<perlapi/is_utf8_invariant_string_loc>>,
658 PERL_STATIC_INLINE Size_t
659 S_variant_under_utf8_count(const U8* const s, const U8* const e)
664 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
668 /* Test if the string is long enough to use word-at-a-time. (Logic is the
669 * same as for is_utf8_invariant_string()) */
670 if ((STRLEN) (e - x) >= PERL_WORDSIZE
671 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
672 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
675 /* Process per-byte until reach word boundary. XXX This loop could be
676 * eliminated if we knew that this platform had fast unaligned reads */
677 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
678 count += ! UTF8_IS_INVARIANT(*x++);
681 /* Process per-word as long as we have at least a full word left */
682 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
683 explanation of how this works */
684 PERL_UINTMAX_T increment
685 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
686 * PERL_COUNT_MULTIPLIER)
687 >> ((PERL_WORDSIZE - 1) * CHARBITS);
688 count += (Size_t) increment;
690 } while (x + PERL_WORDSIZE <= e);
695 /* Process per-byte */
697 if (! UTF8_IS_INVARIANT(*x)) {
709 #ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
710 # undef PERL_WORDSIZE
711 # undef PERL_COUNT_MULTIPLIER
712 # undef PERL_WORD_BOUNDARY_MASK
713 # undef PERL_VARIANTS_WORD_MASK
717 =for apidoc is_utf8_string
719 Returns TRUE if the first C<len> bytes of string C<s> form a valid
720 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
721 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
722 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
723 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
725 This function considers Perl's extended UTF-8 to be valid. That means that
726 code points above Unicode, surrogates, and non-character code points are
727 considered valid by this function. Use C<L</is_strict_utf8_string>>,
728 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
729 code points are considered valid.
732 C<L</is_utf8_invariant_string>>,
733 C<L</is_utf8_invariant_string_loc>>,
734 C<L</is_utf8_string_loc>>,
735 C<L</is_utf8_string_loclen>>,
736 C<L</is_utf8_fixed_width_buf_flags>>,
737 C<L</is_utf8_fixed_width_buf_loc_flags>>,
738 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
743 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
745 #if defined(PERL_CORE) || defined (PERL_EXT)
748 =for apidoc is_utf8_non_invariant_string
750 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
751 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
752 UTF-8; otherwise returns FALSE.
754 A TRUE return means that at least one code point represented by the sequence
755 either is a wide character not representable as a single byte, or the
756 representation differs depending on whether the sequence is encoded in UTF-8 or
760 C<L<perlapi/is_utf8_invariant_string>>,
761 C<L<perlapi/is_utf8_string>>
765 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
766 It generally needn't be if its string is entirely UTF-8 invariant, and it
767 shouldn't be if it otherwise contains invalid UTF-8.
769 It is an internal function because khw thinks that XS code shouldn't be working
770 at this low a level. A valid use case could change that.
774 PERL_STATIC_INLINE bool
775 S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
777 const U8 * first_variant;
779 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
781 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
785 return is_utf8_string(first_variant, len - (first_variant - s));
791 =for apidoc is_strict_utf8_string
793 Returns TRUE if the first C<len> bytes of string C<s> form a valid
794 UTF-8-encoded string that is fully interchangeable by any application using
795 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
796 calculated using C<strlen(s)> (which means if you use this option, that C<s>
797 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
798 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
800 This function returns FALSE for strings containing any
801 code points above the Unicode max of 0x10FFFF, surrogate code points, or
802 non-character code points.
805 C<L</is_utf8_invariant_string>>,
806 C<L</is_utf8_invariant_string_loc>>,
807 C<L</is_utf8_string>>,
808 C<L</is_utf8_string_flags>>,
809 C<L</is_utf8_string_loc>>,
810 C<L</is_utf8_string_loc_flags>>,
811 C<L</is_utf8_string_loclen>>,
812 C<L</is_utf8_string_loclen_flags>>,
813 C<L</is_utf8_fixed_width_buf_flags>>,
814 C<L</is_utf8_fixed_width_buf_loc_flags>>,
815 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
816 C<L</is_strict_utf8_string_loc>>,
817 C<L</is_strict_utf8_string_loclen>>,
818 C<L</is_c9strict_utf8_string>>,
819 C<L</is_c9strict_utf8_string_loc>>,
821 C<L</is_c9strict_utf8_string_loclen>>.
826 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
829 =for apidoc is_c9strict_utf8_string
831 Returns TRUE if the first C<len> bytes of string C<s> form a valid
832 UTF-8-encoded string that conforms to
833 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
834 otherwise it returns FALSE. If C<len> is 0, it will be calculated using
835 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
836 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
837 characters being ASCII constitute 'a valid UTF-8 string'.
839 This function returns FALSE for strings containing any code points above the
840 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
842 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
845 C<L</is_utf8_invariant_string>>,
846 C<L</is_utf8_invariant_string_loc>>,
847 C<L</is_utf8_string>>,
848 C<L</is_utf8_string_flags>>,
849 C<L</is_utf8_string_loc>>,
850 C<L</is_utf8_string_loc_flags>>,
851 C<L</is_utf8_string_loclen>>,
852 C<L</is_utf8_string_loclen_flags>>,
853 C<L</is_utf8_fixed_width_buf_flags>>,
854 C<L</is_utf8_fixed_width_buf_loc_flags>>,
855 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
856 C<L</is_strict_utf8_string>>,
857 C<L</is_strict_utf8_string_loc>>,
858 C<L</is_strict_utf8_string_loclen>>,
859 C<L</is_c9strict_utf8_string_loc>>,
861 C<L</is_c9strict_utf8_string_loclen>>.
866 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
869 =for apidoc is_utf8_string_flags
871 Returns TRUE if the first C<len> bytes of string C<s> form a valid
872 UTF-8 string, subject to the restrictions imposed by C<flags>;
873 returns FALSE otherwise. If C<len> is 0, it will be calculated
874 using C<strlen(s)> (which means if you use this option, that C<s> can't have
875 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
876 that all characters being ASCII constitute 'a valid UTF-8 string'.
878 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
879 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
880 as C<L</is_strict_utf8_string>>; and if C<flags> is
881 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
882 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
883 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
884 C<L</utf8n_to_uvchr>>, with the same meanings.
887 C<L</is_utf8_invariant_string>>,
888 C<L</is_utf8_invariant_string_loc>>,
889 C<L</is_utf8_string>>,
890 C<L</is_utf8_string_loc>>,
891 C<L</is_utf8_string_loc_flags>>,
892 C<L</is_utf8_string_loclen>>,
893 C<L</is_utf8_string_loclen_flags>>,
894 C<L</is_utf8_fixed_width_buf_flags>>,
895 C<L</is_utf8_fixed_width_buf_loc_flags>>,
896 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
897 C<L</is_strict_utf8_string>>,
898 C<L</is_strict_utf8_string_loc>>,
899 C<L</is_strict_utf8_string_loclen>>,
900 C<L</is_c9strict_utf8_string>>,
901 C<L</is_c9strict_utf8_string_loc>>,
903 C<L</is_c9strict_utf8_string_loclen>>.
908 PERL_STATIC_INLINE bool
909 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
911 const U8 * first_variant;
913 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
914 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
915 |UTF8_DISALLOW_PERL_EXTENDED)));
918 len = strlen((const char *)s);
922 return is_utf8_string(s, len);
925 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
926 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
928 return is_strict_utf8_string(s, len);
931 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
932 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
934 return is_c9strict_utf8_string(s, len);
937 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
938 const U8* const send = s + len;
939 const U8* x = first_variant;
942 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
943 if (UNLIKELY(! cur_len)) {
955 =for apidoc is_utf8_string_loc
957 Like C<L</is_utf8_string>> but stores the location of the failure (in the
958 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
959 "utf8ness success") in the C<ep> pointer.
961 See also C<L</is_utf8_string_loclen>>.
966 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
970 =for apidoc is_utf8_string_loclen
972 Like C<L</is_utf8_string>> but stores the location of the failure (in the
973 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
974 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
975 encoded characters in the C<el> pointer.
977 See also C<L</is_utf8_string_loc>>.
982 PERL_STATIC_INLINE bool
983 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
985 const U8 * first_variant;
987 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
990 len = strlen((const char *) s);
993 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1005 const U8* const send = s + len;
1006 const U8* x = first_variant;
1007 STRLEN outlen = first_variant - s;
1010 const STRLEN cur_len = isUTF8_CHAR(x, send);
1011 if (UNLIKELY(! cur_len)) {
1031 =for apidoc isUTF8_CHAR
1033 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1034 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1035 that represents some code point; otherwise it evaluates to 0. If non-zero, the
1036 value gives how many bytes starting at C<s> comprise the code point's
1037 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1038 form the first code point in C<s>, are not examined.
1040 The code point can be any that will fit in an IV on this machine, using Perl's
1041 extension to official UTF-8 to represent those higher than the Unicode maximum
1042 of 0x10FFFF. That means that this macro is used to efficiently decide if the
1043 next few bytes in C<s> is legal UTF-8 for a single character.
1045 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1046 defined by Unicode to be fully interchangeable across applications;
1047 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1048 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1049 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1051 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1052 C<L</is_utf8_string_loclen>> to check entire strings.
1054 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1055 machines) is a valid UTF-8 character.
1059 This uses an adaptation of the table and algorithm given in
1060 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1061 documentation of the original version. A copyright notice for the original
1062 version is given at the beginning of this file. The Perl adapation is
1063 documented at the definition of PL_extended_utf8_dfa_tab[].
1067 PERL_STATIC_INLINE Size_t
1068 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1073 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1075 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1076 * code point, which can be returned immediately. Otherwise, it is either
1077 * malformed, or for the start byte FF which the dfa doesn't handle (except
1078 * on 32-bit ASCII platforms where it trivially is an error). Call a
1079 * helper function for the other platforms. */
1081 while (s < e && LIKELY(state != 1)) {
1082 state = PL_extended_utf8_dfa_tab[256
1084 + PL_extended_utf8_dfa_tab[*s]];
1093 #if defined(UV_IS_QUAD) || defined(EBCDIC)
1095 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1096 return is_utf8_char_helper(s0, e, 0);
1106 =for apidoc isSTRICT_UTF8_CHAR
1108 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1109 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1110 Unicode code point completely acceptable for open interchange between all
1111 applications; otherwise it evaluates to 0. If non-zero, the value gives how
1112 many bytes starting at C<s> comprise the code point's representation. Any
1113 bytes remaining before C<e>, but beyond the ones needed to form the first code
1114 point in C<s>, are not examined.
1116 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1117 be a surrogate nor a non-character code point. Thus this excludes any code
1118 point from Perl's extended UTF-8.
1120 This is used to efficiently decide if the next few bytes in C<s> is
1121 legal Unicode-acceptable UTF-8 for a single character.
1123 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1124 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1125 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1126 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1128 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1129 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1133 This uses an adaptation of the tables and algorithm given in
1134 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1135 documentation of the original version. A copyright notice for the original
1136 version is given at the beginning of this file. The Perl adapation is
1137 documented at the definition of strict_extended_utf8_dfa_tab[].
1141 PERL_STATIC_INLINE Size_t
1142 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1147 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1149 while (s < e && LIKELY(state != 1)) {
1150 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
1162 /* The dfa above drops out for certain Hanguls; handle them specially */
1163 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1174 =for apidoc isC9_STRICT_UTF8_CHAR
1176 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1177 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1178 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1179 the value gives how many bytes starting at C<s> comprise the code point's
1180 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1181 form the first code point in C<s>, are not examined.
1183 The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1184 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1185 code points. This corresponds to
1186 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1187 which said that non-character code points are merely discouraged rather than
1188 completely forbidden in open interchange. See
1189 L<perlunicode/Noncharacter code points>.
1191 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1192 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1194 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1195 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1199 This uses an adaptation of the tables and algorithm given in
1200 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1201 documentation of the original version. A copyright notice for the original
1202 version is given at the beginning of this file. The Perl adapation is
1203 documented at the definition of PL_c9_utf8_dfa_tab[].
1207 PERL_STATIC_INLINE Size_t
1208 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1213 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1215 while (s < e && LIKELY(state != 1)) {
1216 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
1231 =for apidoc is_strict_utf8_string_loc
1233 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1234 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1235 "utf8ness success") in the C<ep> pointer.
1237 See also C<L</is_strict_utf8_string_loclen>>.
1242 #define is_strict_utf8_string_loc(s, len, ep) \
1243 is_strict_utf8_string_loclen(s, len, ep, 0)
1247 =for apidoc is_strict_utf8_string_loclen
1249 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1250 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1251 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1252 encoded characters in the C<el> pointer.
1254 See also C<L</is_strict_utf8_string_loc>>.
1259 PERL_STATIC_INLINE bool
1260 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1262 const U8 * first_variant;
1264 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1267 len = strlen((const char *) s);
1270 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1282 const U8* const send = s + len;
1283 const U8* x = first_variant;
1284 STRLEN outlen = first_variant - s;
1287 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1288 if (UNLIKELY(! cur_len)) {
1308 =for apidoc is_c9strict_utf8_string_loc
1310 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1311 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1312 "utf8ness success") in the C<ep> pointer.
1314 See also C<L</is_c9strict_utf8_string_loclen>>.
1319 #define is_c9strict_utf8_string_loc(s, len, ep) \
1320 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1324 =for apidoc is_c9strict_utf8_string_loclen
1326 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1327 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1328 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1329 characters in the C<el> pointer.
1331 See also C<L</is_c9strict_utf8_string_loc>>.
1336 PERL_STATIC_INLINE bool
1337 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1339 const U8 * first_variant;
1341 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1344 len = strlen((const char *) s);
1347 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1359 const U8* const send = s + len;
1360 const U8* x = first_variant;
1361 STRLEN outlen = first_variant - s;
1364 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1365 if (UNLIKELY(! cur_len)) {
1385 =for apidoc is_utf8_string_loc_flags
1387 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1388 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1389 "utf8ness success") in the C<ep> pointer.
1391 See also C<L</is_utf8_string_loclen_flags>>.
1396 #define is_utf8_string_loc_flags(s, len, ep, flags) \
1397 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1400 /* The above 3 actual functions could have been moved into the more general one
1401 * just below, and made #defines that call it with the right 'flags'. They are
1402 * currently kept separate to increase their chances of getting inlined */
1406 =for apidoc is_utf8_string_loclen_flags
1408 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1409 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1410 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1411 encoded characters in the C<el> pointer.
1413 See also C<L</is_utf8_string_loc_flags>>.
1418 PERL_STATIC_INLINE bool
1419 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1421 const U8 * first_variant;
1423 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1424 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1425 |UTF8_DISALLOW_PERL_EXTENDED)));
1428 len = strlen((const char *) s);
1432 return is_utf8_string_loclen(s, len, ep, el);
1435 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1436 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1438 return is_strict_utf8_string_loclen(s, len, ep, el);
1441 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1442 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1444 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1447 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1459 const U8* send = s + len;
1460 const U8* x = first_variant;
1461 STRLEN outlen = first_variant - s;
1464 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1465 if (UNLIKELY(! cur_len)) {
1484 =for apidoc utf8_distance
1486 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1489 WARNING: use only if you *know* that the pointers point inside the
1495 PERL_STATIC_INLINE IV
1496 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1498 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1500 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1504 =for apidoc utf8_hop
1506 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1507 forward or backward.
1509 WARNING: do not use the following unless you *know* C<off> is within
1510 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1511 on the first byte of character or just after the last byte of a character.
1516 PERL_STATIC_INLINE U8 *
1517 Perl_utf8_hop(const U8 *s, SSize_t off)
1519 PERL_ARGS_ASSERT_UTF8_HOP;
1521 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1522 * the bitops (especially ~) can create illegal UTF-8.
1523 * In other words: in Perl UTF-8 is not just for Unicode. */
1532 while (UTF8_IS_CONTINUATION(*s))
1536 GCC_DIAG_IGNORE(-Wcast-qual)
1542 =for apidoc utf8_hop_forward
1544 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1547 C<off> must be non-negative.
1549 C<s> must be before or equal to C<end>.
1551 When moving forward it will not move beyond C<end>.
1553 Will not exceed this limit even if the string is not valid "UTF-8".
1558 PERL_STATIC_INLINE U8 *
1559 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1561 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1563 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1564 * the bitops (especially ~) can create illegal UTF-8.
1565 * In other words: in Perl UTF-8 is not just for Unicode. */
1571 STRLEN skip = UTF8SKIP(s);
1572 if ((STRLEN)(end - s) <= skip) {
1573 GCC_DIAG_IGNORE(-Wcast-qual)
1580 GCC_DIAG_IGNORE(-Wcast-qual)
1586 =for apidoc utf8_hop_back
1588 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1591 C<off> must be non-positive.
1593 C<s> must be after or equal to C<start>.
1595 When moving backward it will not move before C<start>.
1597 Will not exceed this limit even if the string is not valid "UTF-8".
1602 PERL_STATIC_INLINE U8 *
1603 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1605 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1607 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1608 * the bitops (especially ~) can create illegal UTF-8.
1609 * In other words: in Perl UTF-8 is not just for Unicode. */
1614 while (off++ && s > start) {
1617 } while (UTF8_IS_CONTINUATION(*s) && s > start);
1620 GCC_DIAG_IGNORE(-Wcast-qual)
1626 =for apidoc utf8_hop_safe
1628 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1629 either forward or backward.
1631 When moving backward it will not move before C<start>.
1633 When moving forward it will not move beyond C<end>.
1635 Will not exceed those limits even if the string is not valid "UTF-8".
1640 PERL_STATIC_INLINE U8 *
1641 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1643 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1645 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1646 * the bitops (especially ~) can create illegal UTF-8.
1647 * In other words: in Perl UTF-8 is not just for Unicode. */
1649 assert(start <= s && s <= end);
1652 return utf8_hop_forward(s, off, end);
1655 return utf8_hop_back(s, off, start);
1661 =for apidoc is_utf8_valid_partial_char
1663 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1664 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1665 points. Otherwise, it returns 1 if there exists at least one non-empty
1666 sequence of bytes that when appended to sequence C<s>, starting at position
1667 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1668 otherwise returns 0.
1670 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1673 This is useful when a fixed-length buffer is being tested for being well-formed
1674 UTF-8, but the final few bytes in it don't comprise a full character; that is,
1675 it is split somewhere in the middle of the final code point's UTF-8
1676 representation. (Presumably when the buffer is refreshed with the next chunk
1677 of data, the new first bytes will complete the partial code point.) This
1678 function is used to verify that the final bytes in the current buffer are in
1679 fact the legal beginning of some code point, so that if they aren't, the
1680 failure can be signalled without having to wait for the next read.
1684 #define is_utf8_valid_partial_char(s, e) \
1685 is_utf8_valid_partial_char_flags(s, e, 0)
1689 =for apidoc is_utf8_valid_partial_char_flags
1691 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1692 or not the input is a valid UTF-8 encoded partial character, but it takes an
1693 extra parameter, C<flags>, which can further restrict which code points are
1696 If C<flags> is 0, this behaves identically to
1697 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1698 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1699 there is any sequence of bytes that can complete the input partial character in
1700 such a way that a non-prohibited character is formed, the function returns
1701 TRUE; otherwise FALSE. Non character code points cannot be determined based on
1702 partial character input. But many of the other possible excluded types can be
1703 determined from just the first one or two bytes.
1708 PERL_STATIC_INLINE bool
1709 Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1711 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
1713 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1714 |UTF8_DISALLOW_PERL_EXTENDED)));
1716 if (s >= e || s + UTF8SKIP(s) <= e) {
1720 return cBOOL(is_utf8_char_helper(s, e, flags));
1725 =for apidoc is_utf8_fixed_width_buf_flags
1727 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1728 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1729 otherwise it returns FALSE.
1731 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1732 without restriction. If the final few bytes of the buffer do not form a
1733 complete code point, this will return TRUE anyway, provided that
1734 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1736 If C<flags> in non-zero, it can be any combination of the
1737 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1740 This function differs from C<L</is_utf8_string_flags>> only in that the latter
1741 returns FALSE if the final few bytes of the string don't form a complete code
1746 #define is_utf8_fixed_width_buf_flags(s, len, flags) \
1747 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1751 =for apidoc is_utf8_fixed_width_buf_loc_flags
1753 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1754 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1755 to the beginning of any partial character at the end of the buffer; if there is
1756 no partial character C<*ep> will contain C<s>+C<len>.
1758 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1763 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1764 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1768 =for apidoc is_utf8_fixed_width_buf_loclen_flags
1770 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1771 complete, valid characters found in the C<el> pointer.
1776 PERL_STATIC_INLINE bool
1777 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1783 const U8 * maybe_partial;
1785 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1788 ep = &maybe_partial;
1791 /* If it's entirely valid, return that; otherwise see if the only error is
1792 * that the final few bytes are for a partial character */
1793 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1794 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1797 PERL_STATIC_INLINE UV
1798 Perl_utf8n_to_uvchr_msgs(const U8 *s,
1805 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1806 * simple cases, and, if necessary calls a helper function to deal with the
1807 * more complex ones. Almost all well-formed non-problematic code points
1808 * are considered simple, so that it's unlikely that the helper function
1809 * will need to be called.
1811 * This is an adaptation of the tables and algorithm given in
1812 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1813 * comprehensive documentation of the original version. A copyright notice
1814 * for the original version is given at the beginning of this file. The
1815 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
1818 const U8 * const s0 = s;
1819 const U8 * send = s0 + curlen;
1820 UV uv = 0; /* The 0 silences some stupid compilers */
1823 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1825 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1826 * non-problematic code point, which can be returned immediately.
1827 * Otherwise we call a helper function to figure out the more complicated
1830 while (s < send && LIKELY(state != 1)) {
1831 UV type = PL_strict_utf8_dfa_tab[*s];
1834 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1835 : UTF8_ACCUMULATE(uv, *s);
1836 state = PL_strict_utf8_dfa_tab[256 + state + type];
1844 *retlen = s - s0 + 1;
1853 return UNI_TO_NATIVE(uv);
1856 /* Here is potentially problematic. Use the full mechanism */
1857 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1860 PERL_STATIC_INLINE UV
1861 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1863 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
1867 if (! ckWARN_d(WARN_UTF8)) {
1869 /* EMPTY is not really allowed, and asserts on debugging builds. But
1870 * on non-debugging we have to deal with it, and this causes it to
1871 * return the REPLACEMENT CHARACTER, as the documentation indicates */
1872 return utf8n_to_uvchr(s, send - s, retlen,
1873 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
1876 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1877 if (retlen && ret == 0 && *s != '\0') {
1878 *retlen = (STRLEN) -1;
1885 /* ------------------------------- perl.h ----------------------------- */
1888 =head1 Miscellaneous Functions
1890 =for apidoc is_safe_syscall
1892 Test that the given C<pv> doesn't contain any internal C<NUL> characters.
1893 If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
1895 Return TRUE if the name is safe.
1897 Used by the C<IS_SAFE_SYSCALL()> macro.
1902 PERL_STATIC_INLINE bool
1903 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1905 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1906 * perl itself uses xce*() functions which accept 8-bit strings.
1909 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1913 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
1914 SETERRNO(ENOENT, LIB_INVARG);
1915 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
1916 "Invalid \\0 character in %s for %s: %s\\0%s",
1917 what, op_name, pv, null_at+1);
1927 Return true if the supplied filename has a newline character
1928 immediately before the first (hopefully only) NUL.
1930 My original look at this incorrectly used the len from SvPV(), but
1931 that's incorrect, since we allow for a NUL in pv[len-1].
1933 So instead, strlen() and work from there.
1935 This allow for the user reading a filename, forgetting to chomp it,
1938 open my $foo, "$file\0";
1944 PERL_STATIC_INLINE bool
1945 S_should_warn_nl(const char *pv)
1949 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1953 return len > 0 && pv[len-1] == '\n';
1958 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1960 PERL_STATIC_INLINE bool
1961 S_lossless_NV_to_IV(const NV nv, IV *ivp)
1963 /* This function determines if the input NV 'nv' may be converted without
1964 * loss of data to an IV. If not, it returns FALSE taking no other action.
1965 * But if it is possible, it does the conversion, returning TRUE, and
1966 * storing the converted result in '*ivp' */
1968 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1970 # if defined(Perl_isnan)
1972 if (UNLIKELY(Perl_isnan(nv))) {
1978 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1982 if ((IV) nv != nv) {
1992 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1994 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
1996 #define MAX_CHARSET_NAME_LENGTH 2
1998 PERL_STATIC_INLINE const char *
1999 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2001 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2003 /* Returns a string that corresponds to the name of the regex character set
2004 * given by 'flags', and *lenp is set the length of that string, which
2005 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2008 switch (get_regex_charset(flags)) {
2009 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2010 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2011 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2012 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2013 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2015 return ASCII_MORE_RESTRICT_PAT_MODS;
2017 /* The NOT_REACHED; hides an assert() which has a rather complex
2018 * definition in perl.h. */
2019 NOT_REACHED; /* NOTREACHED */
2020 return "?"; /* Unknown */
2027 Return false if any get magic is on the SV other than taint magic.
2031 PERL_STATIC_INLINE bool
2032 Perl_sv_only_taint_gmagic(SV *sv)
2034 MAGIC *mg = SvMAGIC(sv);
2036 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2039 if (mg->mg_type != PERL_MAGIC_taint
2040 && !(mg->mg_flags & MGf_GSKIP)
2041 && mg->mg_virtual->svt_get) {
2044 mg = mg->mg_moremagic;
2050 /* ------------------ cop.h ------------------------------------------- */
2053 /* Enter a block. Push a new base context and return its address. */
2055 PERL_STATIC_INLINE PERL_CONTEXT *
2056 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2060 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2065 cx->blk_gimme = gimme;
2066 cx->blk_oldsaveix = saveix;
2067 cx->blk_oldsp = (I32)(sp - PL_stack_base);
2068 cx->blk_oldcop = PL_curcop;
2069 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
2070 cx->blk_oldscopesp = PL_scopestack_ix;
2071 cx->blk_oldpm = PL_curpm;
2072 cx->blk_old_tmpsfloor = PL_tmps_floor;
2074 PL_tmps_floor = PL_tmps_ix;
2075 CX_DEBUG(cx, "PUSH");
2080 /* Exit a block (RETURN and LAST). */
2082 PERL_STATIC_INLINE void
2083 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2085 PERL_ARGS_ASSERT_CX_POPBLOCK;
2087 CX_DEBUG(cx, "POP");
2088 /* these 3 are common to cx_popblock and cx_topblock */
2089 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2090 PL_scopestack_ix = cx->blk_oldscopesp;
2091 PL_curpm = cx->blk_oldpm;
2093 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2094 * and leaves a CX entry lying around for repeated use, so
2095 * skip for multicall */ \
2096 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2097 || PL_savestack_ix == cx->blk_oldsaveix);
2098 PL_curcop = cx->blk_oldcop;
2099 PL_tmps_floor = cx->blk_old_tmpsfloor;
2102 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2103 * Whereas cx_popblock() restores the state to the point just before
2104 * cx_pushblock() was called, cx_topblock() restores it to the point just
2105 * *after* cx_pushblock() was called. */
2107 PERL_STATIC_INLINE void
2108 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2110 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2112 CX_DEBUG(cx, "TOP");
2113 /* these 3 are common to cx_popblock and cx_topblock */
2114 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2115 PL_scopestack_ix = cx->blk_oldscopesp;
2116 PL_curpm = cx->blk_oldpm;
2118 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2122 PERL_STATIC_INLINE void
2123 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2125 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2127 PERL_ARGS_ASSERT_CX_PUSHSUB;
2129 PERL_DTRACE_PROBE_ENTRY(cv);
2130 cx->blk_sub.cv = cv;
2131 cx->blk_sub.olddepth = CvDEPTH(cv);
2132 cx->blk_sub.prevcomppad = PL_comppad;
2133 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2134 cx->blk_sub.retop = retop;
2135 SvREFCNT_inc_simple_void_NN(cv);
2136 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2140 /* subsets of cx_popsub() */
2142 PERL_STATIC_INLINE void
2143 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2147 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2148 assert(CxTYPE(cx) == CXt_SUB);
2150 PL_comppad = cx->blk_sub.prevcomppad;
2151 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2152 cv = cx->blk_sub.cv;
2153 CvDEPTH(cv) = cx->blk_sub.olddepth;
2154 cx->blk_sub.cv = NULL;
2159 /* handle the @_ part of leaving a sub */
2161 PERL_STATIC_INLINE void
2162 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2166 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2167 assert(CxTYPE(cx) == CXt_SUB);
2168 assert(AvARRAY(MUTABLE_AV(
2169 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2170 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2172 CX_POP_SAVEARRAY(cx);
2173 av = MUTABLE_AV(PAD_SVl(0));
2174 if (UNLIKELY(AvREAL(av)))
2175 /* abandon @_ if it got reified */
2176 clear_defarray(av, 0);
2183 PERL_STATIC_INLINE void
2184 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2186 PERL_ARGS_ASSERT_CX_POPSUB;
2187 assert(CxTYPE(cx) == CXt_SUB);
2189 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2193 cx_popsub_common(cx);
2197 PERL_STATIC_INLINE void
2198 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2200 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2202 cx->blk_format.cv = cv;
2203 cx->blk_format.retop = retop;
2204 cx->blk_format.gv = gv;
2205 cx->blk_format.dfoutgv = PL_defoutgv;
2206 cx->blk_format.prevcomppad = PL_comppad;
2209 SvREFCNT_inc_simple_void_NN(cv);
2211 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2215 PERL_STATIC_INLINE void
2216 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2221 PERL_ARGS_ASSERT_CX_POPFORMAT;
2222 assert(CxTYPE(cx) == CXt_FORMAT);
2224 dfout = cx->blk_format.dfoutgv;
2226 cx->blk_format.dfoutgv = NULL;
2227 SvREFCNT_dec_NN(dfout);
2229 PL_comppad = cx->blk_format.prevcomppad;
2230 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2231 cv = cx->blk_format.cv;
2232 cx->blk_format.cv = NULL;
2234 SvREFCNT_dec_NN(cv);
2238 PERL_STATIC_INLINE void
2239 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2241 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2243 cx->blk_eval.retop = retop;
2244 cx->blk_eval.old_namesv = namesv;
2245 cx->blk_eval.old_eval_root = PL_eval_root;
2246 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2247 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2248 cx->blk_eval.cur_top_env = PL_top_env;
2250 assert(!(PL_in_eval & ~ 0x3F));
2251 assert(!(PL_op->op_type & ~0x1FF));
2252 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2256 PERL_STATIC_INLINE void
2257 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2261 PERL_ARGS_ASSERT_CX_POPEVAL;
2262 assert(CxTYPE(cx) == CXt_EVAL);
2264 PL_in_eval = CxOLD_IN_EVAL(cx);
2265 assert(!(PL_in_eval & 0xc0));
2266 PL_eval_root = cx->blk_eval.old_eval_root;
2267 sv = cx->blk_eval.cur_text;
2268 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2269 cx->blk_eval.cur_text = NULL;
2270 SvREFCNT_dec_NN(sv);
2273 sv = cx->blk_eval.old_namesv;
2275 cx->blk_eval.old_namesv = NULL;
2276 SvREFCNT_dec_NN(sv);
2281 /* push a plain loop, i.e.
2283 * while (cond) { block }
2284 * for (init;cond;continue) { block }
2285 * This loop can be last/redo'ed etc.
2288 PERL_STATIC_INLINE void
2289 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2291 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2292 cx->blk_loop.my_op = cLOOP;
2296 /* push a true for loop, i.e.
2297 * for var (list) { block }
2300 PERL_STATIC_INLINE void
2301 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2303 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2305 /* this one line is common with cx_pushloop_plain */
2306 cx->blk_loop.my_op = cLOOP;
2308 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2309 cx->blk_loop.itersave = itersave;
2311 cx->blk_loop.oldcomppad = PL_comppad;
2316 /* pop all loop types, including plain */
2318 PERL_STATIC_INLINE void
2319 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2321 PERL_ARGS_ASSERT_CX_POPLOOP;
2323 assert(CxTYPE_is_LOOP(cx));
2324 if ( CxTYPE(cx) == CXt_LOOP_ARY
2325 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2327 /* Free ary or cur. This assumes that state_u.ary.ary
2328 * aligns with state_u.lazysv.cur. See cx_dup() */
2329 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2330 cx->blk_loop.state_u.lazysv.cur = NULL;
2331 SvREFCNT_dec_NN(sv);
2332 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2333 sv = cx->blk_loop.state_u.lazysv.end;
2334 cx->blk_loop.state_u.lazysv.end = NULL;
2335 SvREFCNT_dec_NN(sv);
2338 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2340 SV **svp = (cx)->blk_loop.itervar_u.svp;
2341 if ((cx->cx_type & CXp_FOR_GV))
2342 svp = &GvSV((GV*)svp);
2344 *svp = cx->blk_loop.itersave;
2345 cx->blk_loop.itersave = NULL;
2346 SvREFCNT_dec(cursv);
2351 PERL_STATIC_INLINE void
2352 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2354 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2356 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2360 PERL_STATIC_INLINE void
2361 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2363 PERL_ARGS_ASSERT_CX_POPWHEN;
2364 assert(CxTYPE(cx) == CXt_WHEN);
2366 PERL_UNUSED_ARG(cx);
2367 PERL_UNUSED_CONTEXT;
2368 /* currently NOOP */
2372 PERL_STATIC_INLINE void
2373 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2375 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2377 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2378 cx->blk_givwhen.defsv_save = orig_defsv;
2382 PERL_STATIC_INLINE void
2383 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2387 PERL_ARGS_ASSERT_CX_POPGIVEN;
2388 assert(CxTYPE(cx) == CXt_GIVEN);
2390 sv = GvSV(PL_defgv);
2391 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2392 cx->blk_givwhen.defsv_save = NULL;
2396 /* ------------------ util.h ------------------------------------------- */
2399 =head1 Miscellaneous Functions
2403 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2405 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2406 match themselves and their opposite case counterparts. Non-cased and non-ASCII
2407 range bytes match only themselves.
2412 PERL_STATIC_INLINE I32
2413 Perl_foldEQ(const char *s1, const char *s2, I32 len)
2415 const U8 *a = (const U8 *)s1;
2416 const U8 *b = (const U8 *)s2;
2418 PERL_ARGS_ASSERT_FOLDEQ;
2423 if (*a != *b && *a != PL_fold[*b])
2430 PERL_STATIC_INLINE I32
2431 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2433 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2434 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2435 * does not check for this. Nor does it check that the strings each have
2436 * at least 'len' characters. */
2438 const U8 *a = (const U8 *)s1;
2439 const U8 *b = (const U8 *)s2;
2441 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2446 if (*a != *b && *a != PL_fold_latin1[*b]) {
2455 =for apidoc foldEQ_locale
2457 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2458 same case-insensitively in the current locale; false otherwise.
2463 PERL_STATIC_INLINE I32
2464 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2467 const U8 *a = (const U8 *)s1;
2468 const U8 *b = (const U8 *)s2;
2470 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2475 if (*a != *b && *a != PL_fold_locale[*b])
2482 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2484 PERL_STATIC_INLINE void *
2485 S_my_memrchr(const char * s, const char c, const STRLEN len)
2487 /* memrchr(), since many platforms lack it */
2489 const char * t = s + len - 1;
2491 PERL_ARGS_ASSERT_MY_MEMRCHR;
2506 * ex: set ts=8 sts=4 sw=4 et: