3 * Copyright (C) 2012 by Larry Wall and others
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
8 * This file contains tables and code adapted from
9 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
37 * Each section names the header file that the functions "belong" to.
40 /* ------------------------------- av.h ------------------------------- */
42 PERL_STATIC_INLINE SSize_t
43 Perl_av_top_index(pTHX_ AV *av)
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
51 /* ------------------------------- cv.h ------------------------------- */
53 PERL_STATIC_INLINE GV *
54 Perl_CvGV(pTHX_ CV *sv)
56 PERL_ARGS_ASSERT_CVGV;
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
63 PERL_STATIC_INLINE I32 *
64 Perl_CvDEPTH(const CV * const sv)
66 PERL_ARGS_ASSERT_CVDEPTH;
67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
69 return &((XPVCV*)SvANY(sv))->xcv_depth;
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
84 PERL_STATIC_INLINE char *
85 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
97 *len = tmps - SvPVX(tmpsv);
102 /* ------------------------------- mg.h ------------------------------- */
104 #if defined(PERL_CORE) || defined(PERL_EXT)
105 /* assumes get-magic and stringification have already occurred */
106 PERL_STATIC_INLINE STRLEN
107 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
122 /* ------------------------------- pad.h ------------------------------ */
124 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125 PERL_STATIC_INLINE bool
126 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
159 /* ------------------------------- pp.h ------------------------------- */
161 PERL_STATIC_INLINE I32
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
165 "MARK top %p %" IVdf "\n",
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
171 PERL_STATIC_INLINE I32
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
175 "MARK pop %p %" IVdf "\n",
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
182 /* ----------------------------- regexp.h ----------------------------- */
184 PERL_STATIC_INLINE struct regexp *
185 Perl_ReANY(const REGEXP * const re)
187 XPV* const p = (XPV*)SvANY(re);
188 assert(isREGEXP(re));
189 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
190 : (struct regexp *)p;
193 /* ------------------------------- sv.h ------------------------------- */
195 PERL_STATIC_INLINE SV *
196 Perl_SvREFCNT_inc(SV *sv)
198 if (LIKELY(sv != NULL))
202 PERL_STATIC_INLINE SV *
203 Perl_SvREFCNT_inc_NN(SV *sv)
205 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
210 PERL_STATIC_INLINE void
211 Perl_SvREFCNT_inc_void(SV *sv)
213 if (LIKELY(sv != NULL))
216 PERL_STATIC_INLINE void
217 Perl_SvREFCNT_dec(pTHX_ SV *sv)
219 if (LIKELY(sv != NULL)) {
220 U32 rc = SvREFCNT(sv);
222 SvREFCNT(sv) = rc - 1;
224 Perl_sv_free2(aTHX_ sv, rc);
228 PERL_STATIC_INLINE void
229 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
231 U32 rc = SvREFCNT(sv);
233 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
236 SvREFCNT(sv) = rc - 1;
238 Perl_sv_free2(aTHX_ sv, rc);
241 PERL_STATIC_INLINE void
245 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
247 PERL_STATIC_INLINE void
250 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
251 HvAMAGIC_off(SvSTASH(SvRV(sv)));
254 PERL_STATIC_INLINE U32
255 Perl_SvPADSTALE_on(SV *sv)
257 assert(!(SvFLAGS(sv) & SVs_PADTMP));
258 return SvFLAGS(sv) |= SVs_PADSTALE;
260 PERL_STATIC_INLINE U32
261 Perl_SvPADSTALE_off(SV *sv)
263 assert(!(SvFLAGS(sv) & SVs_PADTMP));
264 return SvFLAGS(sv) &= ~SVs_PADSTALE;
266 #if defined(PERL_CORE) || defined (PERL_EXT)
267 PERL_STATIC_INLINE STRLEN
268 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
270 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
272 U8 *hopped = utf8_hop((U8 *)pv, pos);
273 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
274 return (STRLEN)(hopped - (U8 *)pv);
276 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
280 /* ------------------------------- handy.h ------------------------------- */
282 /* saves machine code for a common noreturn idiom typically used in Newx*() */
283 GCC_DIAG_IGNORE_DECL(-Wunused-function);
285 Perl_croak_memory_wrap(void)
287 Perl_croak_nocontext("%s",PL_memory_wrap);
289 GCC_DIAG_RESTORE_DECL;
291 /* ------------------------------- utf8.h ------------------------------- */
294 =head1 Unicode Support
297 PERL_STATIC_INLINE void
298 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
300 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
301 * encoded string at '*dest', updating '*dest' to include it */
303 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
305 if (NATIVE_BYTE_IS_INVARIANT(byte))
308 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
309 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
314 =for apidoc valid_utf8_to_uvchr
315 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
316 known that the next character in the input UTF-8 string C<s> is well-formed
317 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
318 points, and non-Unicode code points are allowed.
324 PERL_STATIC_INLINE UV
325 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
327 const UV expectlen = UTF8SKIP(s);
328 const U8* send = s + expectlen;
331 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
337 /* An invariant is trivially returned */
338 if (expectlen == 1) {
342 /* Remove the leading bits that indicate the number of bytes, leaving just
343 * the bits that are part of the value */
344 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
346 /* Now, loop through the remaining bytes, accumulating each into the
347 * working total as we go. (I khw tried unrolling the loop for up to 4
348 * bytes, but there was no performance improvement) */
349 for (++s; s < send; s++) {
350 uv = UTF8_ACCUMULATE(uv, *s);
353 return UNI_TO_NATIVE(uv);
358 =for apidoc is_utf8_invariant_string
360 Returns TRUE if the first C<len> bytes of the string C<s> are the same
361 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
362 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
363 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
364 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
365 characters are invariant, but so also are the C1 controls.
367 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
368 use this option, that C<s> can't have embedded C<NUL> characters and has to
369 have a terminating C<NUL> byte).
372 C<L</is_utf8_string>>,
373 C<L</is_utf8_string_flags>>,
374 C<L</is_utf8_string_loc>>,
375 C<L</is_utf8_string_loc_flags>>,
376 C<L</is_utf8_string_loclen>>,
377 C<L</is_utf8_string_loclen_flags>>,
378 C<L</is_utf8_fixed_width_buf_flags>>,
379 C<L</is_utf8_fixed_width_buf_loc_flags>>,
380 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
381 C<L</is_strict_utf8_string>>,
382 C<L</is_strict_utf8_string_loc>>,
383 C<L</is_strict_utf8_string_loclen>>,
384 C<L</is_c9strict_utf8_string>>,
385 C<L</is_c9strict_utf8_string_loc>>,
387 C<L</is_c9strict_utf8_string_loclen>>.
393 #define is_utf8_invariant_string(s, len) \
394 is_utf8_invariant_string_loc(s, len, NULL)
397 =for apidoc is_utf8_invariant_string_loc
399 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
400 the first UTF-8 variant character in the C<ep> pointer; if all characters are
401 UTF-8 invariant, this function does not change the contents of C<*ep>.
407 PERL_STATIC_INLINE bool
408 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
413 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
416 len = strlen((const char *)s);
421 /* This looks like 0x010101... */
422 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
424 /* This looks like 0x808080... */
425 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
426 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
427 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
429 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
430 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
431 * optimized out completely on a 32-bit system, and its mask gets optimized out
432 * on a 64-bit system */
433 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
434 | ( PTR2nat(x) >> 1) \
436 & PERL_WORD_BOUNDARY_MASK) >> 2))))
440 /* Do the word-at-a-time iff there is at least one usable full word. That
441 * means that after advancing to a word boundary, there still is at least a
442 * full word left. The number of bytes needed to advance is 'wordsize -
443 * offset' unless offset is 0. */
444 if ((STRLEN) (send - x) >= PERL_WORDSIZE
446 /* This term is wordsize if subword; 0 if not */
447 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
450 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
453 /* Process per-byte until reach word boundary. XXX This loop could be
454 * eliminated if we knew that this platform had fast unaligned reads */
455 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
456 if (! UTF8_IS_INVARIANT(*x)) {
466 /* Here, we know we have at least one full word to process. Process
467 * per-word as long as we have at least a full word left */
469 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
471 /* Found a variant. Just return if caller doesn't want its
477 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
478 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
480 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
481 assert(*ep >= s && *ep < send);
485 # else /* If weird byte order, drop into next loop to do byte-at-a-time
494 } while (x + PERL_WORDSIZE <= send);
497 #endif /* End of ! EBCDIC */
499 /* Process per-byte */
501 if (! UTF8_IS_INVARIANT(*x)) {
517 PERL_STATIC_INLINE unsigned int
518 Perl_variant_byte_number(PERL_UINTMAX_T word)
521 /* This returns the position in a word (0..7) of the first variant byte in
522 * it. This is a helper function. Note that there are no branches */
526 /* Get just the msb bits of each byte */
527 word &= PERL_VARIANTS_WORD_MASK;
529 # ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
530 easiest thing is to hide that from the callers */
533 const U8 * s = (U8 *) &word;
536 for (i = 0; i < sizeof(word); i++ ) {
542 Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
546 # elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
548 /* Bytes are stored like
549 * Byte8 ... Byte2 Byte1
550 * 63..56...15...8 7...0
553 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
555 * The word will look this this, with a rightmost set bit in position 's':
556 * ('x's are don't cares)
559 * x..xx10..0 Right shift (rightmost 0 is shifted off)
560 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
561 * the 1 just to their left into a 0; the remainder is
563 * 0..0011..1 The xor with the original, x..xx10..0, clears that
564 * remainder, sets the bottom to all 1
565 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
567 * Another method is to do 'word &= -word'; but it generates a compiler
568 * message on some platforms about taking the negative of an unsigned */
571 word = 1 + (word ^ (word - 1));
573 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
575 /* Bytes are stored like
576 * Byte1 Byte2 ... Byte8
577 * 63..56 55..47 ... 7...0
579 * Isolate the msb; http://codeforces.com/blog/entry/10330
581 * Only the most significant set bit matters. Or'ing word with its right
582 * shift of 1 makes that bit and the next one to its right both 1. Then
583 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
584 * msb and all to the right being 1. */
590 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
592 /* Then subtracting the right shift by 1 clears all but the left-most of
593 * the 1 bits, which is our desired result */
597 # error Unexpected byte order
600 /* Here 'word' has a single bit set: the msb of the first byte in which it
601 * is set. Calculate that position in the word. We can use this
602 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
603 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
604 * just get shifted off at compile time) */
605 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
606 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
607 | (39 << 24) | (47 << 16)
608 | (55 << 8) | (63 << 0));
609 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
611 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
612 word = ((word + 1) >> 3) - 1;
614 # if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
616 /* And invert the result */
617 word = CHARBITS - word - 1;
621 return (unsigned int) word;
625 #if defined(PERL_CORE) || defined(PERL_EXT)
628 =for apidoc variant_under_utf8_count
630 This function looks at the sequence of bytes between C<s> and C<e>, which are
631 assumed to be encoded in ASCII/Latin1, and returns how many of them would
632 change should the string be translated into UTF-8. Due to the nature of UTF-8,
633 each of these would occupy two bytes instead of the single one in the input
634 string. Thus, this function returns the precise number of bytes the string
635 would expand by when translated to UTF-8.
637 Unlike most of the other functions that have C<utf8> in their name, the input
638 to this function is NOT a UTF-8-encoded string. The function name is slightly
639 I<odd> to emphasize this.
641 This function is internal to Perl because khw thinks that any XS code that
642 would want this is probably operating too close to the internals. Presenting a
643 valid use case could change that.
646 C<L<perlapi/is_utf8_invariant_string>>
648 C<L<perlapi/is_utf8_invariant_string_loc>>,
654 PERL_STATIC_INLINE Size_t
655 S_variant_under_utf8_count(const U8* const s, const U8* const e)
660 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
664 /* Test if the string is long enough to use word-at-a-time. (Logic is the
665 * same as for is_utf8_invariant_string()) */
666 if ((STRLEN) (e - x) >= PERL_WORDSIZE
667 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
668 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
671 /* Process per-byte until reach word boundary. XXX This loop could be
672 * eliminated if we knew that this platform had fast unaligned reads */
673 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
674 count += ! UTF8_IS_INVARIANT(*x++);
677 /* Process per-word as long as we have at least a full word left */
678 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
679 explanation of how this works */
680 PERL_UINTMAX_T increment
681 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
682 * PERL_COUNT_MULTIPLIER)
683 >> ((PERL_WORDSIZE - 1) * CHARBITS);
684 count += (Size_t) increment;
686 } while (x + PERL_WORDSIZE <= e);
691 /* Process per-byte */
693 if (! UTF8_IS_INVARIANT(*x)) {
705 #ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
706 # undef PERL_WORDSIZE
707 # undef PERL_COUNT_MULTIPLIER
708 # undef PERL_WORD_BOUNDARY_MASK
709 # undef PERL_VARIANTS_WORD_MASK
713 =for apidoc is_utf8_string
715 Returns TRUE if the first C<len> bytes of string C<s> form a valid
716 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
717 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
718 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
719 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
721 This function considers Perl's extended UTF-8 to be valid. That means that
722 code points above Unicode, surrogates, and non-character code points are
723 considered valid by this function. Use C<L</is_strict_utf8_string>>,
724 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
725 code points are considered valid.
728 C<L</is_utf8_invariant_string>>,
729 C<L</is_utf8_invariant_string_loc>>,
730 C<L</is_utf8_string_loc>>,
731 C<L</is_utf8_string_loclen>>,
732 C<L</is_utf8_fixed_width_buf_flags>>,
733 C<L</is_utf8_fixed_width_buf_loc_flags>>,
734 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
739 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
741 #if defined(PERL_CORE) || defined (PERL_EXT)
744 =for apidoc is_utf8_non_invariant_string
746 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
747 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
748 UTF-8; otherwise returns FALSE.
750 A TRUE return means that at least one code point represented by the sequence
751 either is a wide character not representable as a single byte, or the
752 representation differs depending on whether the sequence is encoded in UTF-8 or
756 C<L<perlapi/is_utf8_invariant_string>>,
757 C<L<perlapi/is_utf8_string>>
761 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
762 It generally needn't be if its string is entirely UTF-8 invariant, and it
763 shouldn't be if it otherwise contains invalid UTF-8.
765 It is an internal function because khw thinks that XS code shouldn't be working
766 at this low a level. A valid use case could change that.
770 PERL_STATIC_INLINE bool
771 S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
773 const U8 * first_variant;
775 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
777 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
781 return is_utf8_string(first_variant, len - (first_variant - s));
787 =for apidoc is_strict_utf8_string
789 Returns TRUE if the first C<len> bytes of string C<s> form a valid
790 UTF-8-encoded string that is fully interchangeable by any application using
791 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
792 calculated using C<strlen(s)> (which means if you use this option, that C<s>
793 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
794 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
796 This function returns FALSE for strings containing any
797 code points above the Unicode max of 0x10FFFF, surrogate code points, or
798 non-character code points.
801 C<L</is_utf8_invariant_string>>,
802 C<L</is_utf8_invariant_string_loc>>,
803 C<L</is_utf8_string>>,
804 C<L</is_utf8_string_flags>>,
805 C<L</is_utf8_string_loc>>,
806 C<L</is_utf8_string_loc_flags>>,
807 C<L</is_utf8_string_loclen>>,
808 C<L</is_utf8_string_loclen_flags>>,
809 C<L</is_utf8_fixed_width_buf_flags>>,
810 C<L</is_utf8_fixed_width_buf_loc_flags>>,
811 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
812 C<L</is_strict_utf8_string_loc>>,
813 C<L</is_strict_utf8_string_loclen>>,
814 C<L</is_c9strict_utf8_string>>,
815 C<L</is_c9strict_utf8_string_loc>>,
817 C<L</is_c9strict_utf8_string_loclen>>.
822 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
825 =for apidoc is_c9strict_utf8_string
827 Returns TRUE if the first C<len> bytes of string C<s> form a valid
828 UTF-8-encoded string that conforms to
829 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
830 otherwise it returns FALSE. If C<len> is 0, it will be calculated using
831 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
832 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
833 characters being ASCII constitute 'a valid UTF-8 string'.
835 This function returns FALSE for strings containing any code points above the
836 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
838 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
841 C<L</is_utf8_invariant_string>>,
842 C<L</is_utf8_invariant_string_loc>>,
843 C<L</is_utf8_string>>,
844 C<L</is_utf8_string_flags>>,
845 C<L</is_utf8_string_loc>>,
846 C<L</is_utf8_string_loc_flags>>,
847 C<L</is_utf8_string_loclen>>,
848 C<L</is_utf8_string_loclen_flags>>,
849 C<L</is_utf8_fixed_width_buf_flags>>,
850 C<L</is_utf8_fixed_width_buf_loc_flags>>,
851 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
852 C<L</is_strict_utf8_string>>,
853 C<L</is_strict_utf8_string_loc>>,
854 C<L</is_strict_utf8_string_loclen>>,
855 C<L</is_c9strict_utf8_string_loc>>,
857 C<L</is_c9strict_utf8_string_loclen>>.
862 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
865 =for apidoc is_utf8_string_flags
867 Returns TRUE if the first C<len> bytes of string C<s> form a valid
868 UTF-8 string, subject to the restrictions imposed by C<flags>;
869 returns FALSE otherwise. If C<len> is 0, it will be calculated
870 using C<strlen(s)> (which means if you use this option, that C<s> can't have
871 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
872 that all characters being ASCII constitute 'a valid UTF-8 string'.
874 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
875 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
876 as C<L</is_strict_utf8_string>>; and if C<flags> is
877 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
878 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
879 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
880 C<L</utf8n_to_uvchr>>, with the same meanings.
883 C<L</is_utf8_invariant_string>>,
884 C<L</is_utf8_invariant_string_loc>>,
885 C<L</is_utf8_string>>,
886 C<L</is_utf8_string_loc>>,
887 C<L</is_utf8_string_loc_flags>>,
888 C<L</is_utf8_string_loclen>>,
889 C<L</is_utf8_string_loclen_flags>>,
890 C<L</is_utf8_fixed_width_buf_flags>>,
891 C<L</is_utf8_fixed_width_buf_loc_flags>>,
892 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
893 C<L</is_strict_utf8_string>>,
894 C<L</is_strict_utf8_string_loc>>,
895 C<L</is_strict_utf8_string_loclen>>,
896 C<L</is_c9strict_utf8_string>>,
897 C<L</is_c9strict_utf8_string_loc>>,
899 C<L</is_c9strict_utf8_string_loclen>>.
904 PERL_STATIC_INLINE bool
905 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
907 const U8 * first_variant;
909 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
910 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
911 |UTF8_DISALLOW_PERL_EXTENDED)));
914 len = strlen((const char *)s);
918 return is_utf8_string(s, len);
921 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
922 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
924 return is_strict_utf8_string(s, len);
927 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
928 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
930 return is_c9strict_utf8_string(s, len);
933 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
934 const U8* const send = s + len;
935 const U8* x = first_variant;
938 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
939 if (UNLIKELY(! cur_len)) {
951 =for apidoc is_utf8_string_loc
953 Like C<L</is_utf8_string>> but stores the location of the failure (in the
954 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
955 "utf8ness success") in the C<ep> pointer.
957 See also C<L</is_utf8_string_loclen>>.
962 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
966 =for apidoc is_utf8_string_loclen
968 Like C<L</is_utf8_string>> but stores the location of the failure (in the
969 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
970 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
971 encoded characters in the C<el> pointer.
973 See also C<L</is_utf8_string_loc>>.
978 PERL_STATIC_INLINE bool
979 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
981 const U8 * first_variant;
983 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
986 len = strlen((const char *) s);
989 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1001 const U8* const send = s + len;
1002 const U8* x = first_variant;
1003 STRLEN outlen = first_variant - s;
1006 const STRLEN cur_len = isUTF8_CHAR(x, send);
1007 if (UNLIKELY(! cur_len)) {
1027 =for apidoc isUTF8_CHAR
1029 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1030 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1031 that represents some code point; otherwise it evaluates to 0. If non-zero, the
1032 value gives how many bytes starting at C<s> comprise the code point's
1033 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1034 form the first code point in C<s>, are not examined.
1036 The code point can be any that will fit in an IV on this machine, using Perl's
1037 extension to official UTF-8 to represent those higher than the Unicode maximum
1038 of 0x10FFFF. That means that this macro is used to efficiently decide if the
1039 next few bytes in C<s> is legal UTF-8 for a single character.
1041 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1042 defined by Unicode to be fully interchangeable across applications;
1043 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1044 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1045 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1047 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1048 C<L</is_utf8_string_loclen>> to check entire strings.
1050 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1051 machines) is a valid UTF-8 character.
1055 This uses an adaptation of the table and algorithm given in
1056 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1057 documentation of the original version. A copyright notice for the original
1058 version is given at the beginning of this file. The Perl adapation is
1059 documented at the definition of PL_extended_utf8_dfa_tab[].
1063 PERL_STATIC_INLINE Size_t
1064 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1069 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1071 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1072 * code point, which can be returned immediately. Otherwise, it is either
1073 * malformed, or for the start byte FF which the dfa doesn't handle (except
1074 * on 32-bit ASCII platforms where it trivially is an error). Call a
1075 * helper function for the other platforms. */
1077 while (s < e && LIKELY(state != 1)) {
1078 state = PL_extended_utf8_dfa_tab[256
1080 + PL_extended_utf8_dfa_tab[*s]];
1089 #if defined(UV_IS_QUAD) || defined(EBCDIC)
1091 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1092 return is_utf8_char_helper(s0, e, 0);
1102 =for apidoc isSTRICT_UTF8_CHAR
1104 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1105 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1106 Unicode code point completely acceptable for open interchange between all
1107 applications; otherwise it evaluates to 0. If non-zero, the value gives how
1108 many bytes starting at C<s> comprise the code point's representation. Any
1109 bytes remaining before C<e>, but beyond the ones needed to form the first code
1110 point in C<s>, are not examined.
1112 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1113 be a surrogate nor a non-character code point. Thus this excludes any code
1114 point from Perl's extended UTF-8.
1116 This is used to efficiently decide if the next few bytes in C<s> is
1117 legal Unicode-acceptable UTF-8 for a single character.
1119 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1120 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1121 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1122 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1124 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1125 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1129 This uses an adaptation of the tables and algorithm given in
1130 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1131 documentation of the original version. A copyright notice for the original
1132 version is given at the beginning of this file. The Perl adapation is
1133 documented at the definition of strict_extended_utf8_dfa_tab[].
1137 PERL_STATIC_INLINE Size_t
1138 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1143 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1145 while (s < e && LIKELY(state != 1)) {
1146 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
1158 /* The dfa above drops out for certain Hanguls; handle them specially */
1159 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1170 =for apidoc isC9_STRICT_UTF8_CHAR
1172 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1173 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1174 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1175 the value gives how many bytes starting at C<s> comprise the code point's
1176 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1177 form the first code point in C<s>, are not examined.
1179 The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1180 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1181 code points. This corresponds to
1182 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1183 which said that non-character code points are merely discouraged rather than
1184 completely forbidden in open interchange. See
1185 L<perlunicode/Noncharacter code points>.
1187 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1188 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1190 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1191 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1195 This uses an adaptation of the tables and algorithm given in
1196 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1197 documentation of the original version. A copyright notice for the original
1198 version is given at the beginning of this file. The Perl adapation is
1199 documented at the definition of PL_c9_utf8_dfa_tab[].
1203 PERL_STATIC_INLINE Size_t
1204 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1209 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1211 while (s < e && LIKELY(state != 1)) {
1212 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
1227 =for apidoc is_strict_utf8_string_loc
1229 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1230 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1231 "utf8ness success") in the C<ep> pointer.
1233 See also C<L</is_strict_utf8_string_loclen>>.
1238 #define is_strict_utf8_string_loc(s, len, ep) \
1239 is_strict_utf8_string_loclen(s, len, ep, 0)
1243 =for apidoc is_strict_utf8_string_loclen
1245 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1246 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1247 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1248 encoded characters in the C<el> pointer.
1250 See also C<L</is_strict_utf8_string_loc>>.
1255 PERL_STATIC_INLINE bool
1256 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1258 const U8 * first_variant;
1260 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1263 len = strlen((const char *) s);
1266 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1278 const U8* const send = s + len;
1279 const U8* x = first_variant;
1280 STRLEN outlen = first_variant - s;
1283 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1284 if (UNLIKELY(! cur_len)) {
1304 =for apidoc is_c9strict_utf8_string_loc
1306 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1307 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1308 "utf8ness success") in the C<ep> pointer.
1310 See also C<L</is_c9strict_utf8_string_loclen>>.
1315 #define is_c9strict_utf8_string_loc(s, len, ep) \
1316 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1320 =for apidoc is_c9strict_utf8_string_loclen
1322 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1323 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1324 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1325 characters in the C<el> pointer.
1327 See also C<L</is_c9strict_utf8_string_loc>>.
1332 PERL_STATIC_INLINE bool
1333 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1335 const U8 * first_variant;
1337 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1340 len = strlen((const char *) s);
1343 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1355 const U8* const send = s + len;
1356 const U8* x = first_variant;
1357 STRLEN outlen = first_variant - s;
1360 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1361 if (UNLIKELY(! cur_len)) {
1381 =for apidoc is_utf8_string_loc_flags
1383 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1384 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1385 "utf8ness success") in the C<ep> pointer.
1387 See also C<L</is_utf8_string_loclen_flags>>.
1392 #define is_utf8_string_loc_flags(s, len, ep, flags) \
1393 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1396 /* The above 3 actual functions could have been moved into the more general one
1397 * just below, and made #defines that call it with the right 'flags'. They are
1398 * currently kept separate to increase their chances of getting inlined */
1402 =for apidoc is_utf8_string_loclen_flags
1404 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1405 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1406 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1407 encoded characters in the C<el> pointer.
1409 See also C<L</is_utf8_string_loc_flags>>.
1414 PERL_STATIC_INLINE bool
1415 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1417 const U8 * first_variant;
1419 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1420 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1421 |UTF8_DISALLOW_PERL_EXTENDED)));
1424 len = strlen((const char *) s);
1428 return is_utf8_string_loclen(s, len, ep, el);
1431 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1432 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1434 return is_strict_utf8_string_loclen(s, len, ep, el);
1437 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1438 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1440 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1443 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1455 const U8* send = s + len;
1456 const U8* x = first_variant;
1457 STRLEN outlen = first_variant - s;
1460 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1461 if (UNLIKELY(! cur_len)) {
1480 =for apidoc utf8_distance
1482 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1485 WARNING: use only if you *know* that the pointers point inside the
1491 PERL_STATIC_INLINE IV
1492 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1494 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1496 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1500 =for apidoc utf8_hop
1502 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1503 forward or backward.
1505 WARNING: do not use the following unless you *know* C<off> is within
1506 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1507 on the first byte of character or just after the last byte of a character.
1512 PERL_STATIC_INLINE U8 *
1513 Perl_utf8_hop(const U8 *s, SSize_t off)
1515 PERL_ARGS_ASSERT_UTF8_HOP;
1517 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1518 * the bitops (especially ~) can create illegal UTF-8.
1519 * In other words: in Perl UTF-8 is not just for Unicode. */
1528 while (UTF8_IS_CONTINUATION(*s))
1532 GCC_DIAG_IGNORE(-Wcast-qual)
1538 =for apidoc utf8_hop_forward
1540 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1543 C<off> must be non-negative.
1545 C<s> must be before or equal to C<end>.
1547 When moving forward it will not move beyond C<end>.
1549 Will not exceed this limit even if the string is not valid "UTF-8".
1554 PERL_STATIC_INLINE U8 *
1555 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1557 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1559 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1560 * the bitops (especially ~) can create illegal UTF-8.
1561 * In other words: in Perl UTF-8 is not just for Unicode. */
1567 STRLEN skip = UTF8SKIP(s);
1568 if ((STRLEN)(end - s) <= skip) {
1569 GCC_DIAG_IGNORE(-Wcast-qual)
1576 GCC_DIAG_IGNORE(-Wcast-qual)
1582 =for apidoc utf8_hop_back
1584 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1587 C<off> must be non-positive.
1589 C<s> must be after or equal to C<start>.
1591 When moving backward it will not move before C<start>.
1593 Will not exceed this limit even if the string is not valid "UTF-8".
1598 PERL_STATIC_INLINE U8 *
1599 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1601 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1603 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1604 * the bitops (especially ~) can create illegal UTF-8.
1605 * In other words: in Perl UTF-8 is not just for Unicode. */
1610 while (off++ && s > start) {
1613 } while (UTF8_IS_CONTINUATION(*s) && s > start);
1616 GCC_DIAG_IGNORE(-Wcast-qual)
1622 =for apidoc utf8_hop_safe
1624 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1625 either forward or backward.
1627 When moving backward it will not move before C<start>.
1629 When moving forward it will not move beyond C<end>.
1631 Will not exceed those limits even if the string is not valid "UTF-8".
1636 PERL_STATIC_INLINE U8 *
1637 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1639 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1641 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1642 * the bitops (especially ~) can create illegal UTF-8.
1643 * In other words: in Perl UTF-8 is not just for Unicode. */
1645 assert(start <= s && s <= end);
1648 return utf8_hop_forward(s, off, end);
1651 return utf8_hop_back(s, off, start);
1657 =for apidoc is_utf8_valid_partial_char
1659 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1660 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1661 points. Otherwise, it returns 1 if there exists at least one non-empty
1662 sequence of bytes that when appended to sequence C<s>, starting at position
1663 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1664 otherwise returns 0.
1666 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1669 This is useful when a fixed-length buffer is being tested for being well-formed
1670 UTF-8, but the final few bytes in it don't comprise a full character; that is,
1671 it is split somewhere in the middle of the final code point's UTF-8
1672 representation. (Presumably when the buffer is refreshed with the next chunk
1673 of data, the new first bytes will complete the partial code point.) This
1674 function is used to verify that the final bytes in the current buffer are in
1675 fact the legal beginning of some code point, so that if they aren't, the
1676 failure can be signalled without having to wait for the next read.
1680 #define is_utf8_valid_partial_char(s, e) \
1681 is_utf8_valid_partial_char_flags(s, e, 0)
1685 =for apidoc is_utf8_valid_partial_char_flags
1687 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1688 or not the input is a valid UTF-8 encoded partial character, but it takes an
1689 extra parameter, C<flags>, which can further restrict which code points are
1692 If C<flags> is 0, this behaves identically to
1693 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1694 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1695 there is any sequence of bytes that can complete the input partial character in
1696 such a way that a non-prohibited character is formed, the function returns
1697 TRUE; otherwise FALSE. Non character code points cannot be determined based on
1698 partial character input. But many of the other possible excluded types can be
1699 determined from just the first one or two bytes.
1704 PERL_STATIC_INLINE bool
1705 Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1707 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
1709 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1710 |UTF8_DISALLOW_PERL_EXTENDED)));
1712 if (s >= e || s + UTF8SKIP(s) <= e) {
1716 return cBOOL(is_utf8_char_helper(s, e, flags));
1721 =for apidoc is_utf8_fixed_width_buf_flags
1723 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1724 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1725 otherwise it returns FALSE.
1727 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1728 without restriction. If the final few bytes of the buffer do not form a
1729 complete code point, this will return TRUE anyway, provided that
1730 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1732 If C<flags> in non-zero, it can be any combination of the
1733 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1736 This function differs from C<L</is_utf8_string_flags>> only in that the latter
1737 returns FALSE if the final few bytes of the string don't form a complete code
1742 #define is_utf8_fixed_width_buf_flags(s, len, flags) \
1743 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1747 =for apidoc is_utf8_fixed_width_buf_loc_flags
1749 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1750 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1751 to the beginning of any partial character at the end of the buffer; if there is
1752 no partial character C<*ep> will contain C<s>+C<len>.
1754 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1759 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1760 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1764 =for apidoc is_utf8_fixed_width_buf_loclen_flags
1766 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1767 complete, valid characters found in the C<el> pointer.
1772 PERL_STATIC_INLINE bool
1773 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1779 const U8 * maybe_partial;
1781 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1784 ep = &maybe_partial;
1787 /* If it's entirely valid, return that; otherwise see if the only error is
1788 * that the final few bytes are for a partial character */
1789 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1790 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1793 PERL_STATIC_INLINE UV
1794 Perl_utf8n_to_uvchr_msgs(const U8 *s,
1801 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1802 * simple cases, and, if necessary calls a helper function to deal with the
1803 * more complex ones. Almost all well-formed non-problematic code points
1804 * are considered simple, so that it's unlikely that the helper function
1805 * will need to be called.
1807 * This is an adaptation of the tables and algorithm given in
1808 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1809 * comprehensive documentation of the original version. A copyright notice
1810 * for the original version is given at the beginning of this file. The
1811 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
1814 const U8 * const s0 = s;
1815 const U8 * send = s0 + curlen;
1816 UV uv = 0; /* The 0 silences some stupid compilers */
1819 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1821 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1822 * non-problematic code point, which can be returned immediately.
1823 * Otherwise we call a helper function to figure out the more complicated
1826 while (s < send && LIKELY(state != 1)) {
1827 UV type = PL_strict_utf8_dfa_tab[*s];
1830 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1831 : UTF8_ACCUMULATE(uv, *s);
1832 state = PL_strict_utf8_dfa_tab[256 + state + type];
1840 *retlen = s - s0 + 1;
1849 return UNI_TO_NATIVE(uv);
1852 /* Here is potentially problematic. Use the full mechanism */
1853 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1856 PERL_STATIC_INLINE UV
1857 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1859 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
1863 if (! ckWARN_d(WARN_UTF8)) {
1865 /* EMPTY is not really allowed, and asserts on debugging builds. But
1866 * on non-debugging we have to deal with it, and this causes it to
1867 * return the REPLACEMENT CHARACTER, as the documentation indicates */
1868 return utf8n_to_uvchr(s, send - s, retlen,
1869 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
1872 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1873 if (retlen && ret == 0 && *s != '\0') {
1874 *retlen = (STRLEN) -1;
1881 /* ------------------------------- perl.h ----------------------------- */
1884 =head1 Miscellaneous Functions
1886 =for apidoc is_safe_syscall
1888 Test that the given C<pv> doesn't contain any internal C<NUL> characters.
1889 If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
1891 Return TRUE if the name is safe.
1893 Used by the C<IS_SAFE_SYSCALL()> macro.
1898 PERL_STATIC_INLINE bool
1899 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1901 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1902 * perl itself uses xce*() functions which accept 8-bit strings.
1905 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1909 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
1910 SETERRNO(ENOENT, LIB_INVARG);
1911 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
1912 "Invalid \\0 character in %s for %s: %s\\0%s",
1913 what, op_name, pv, null_at+1);
1923 Return true if the supplied filename has a newline character
1924 immediately before the first (hopefully only) NUL.
1926 My original look at this incorrectly used the len from SvPV(), but
1927 that's incorrect, since we allow for a NUL in pv[len-1].
1929 So instead, strlen() and work from there.
1931 This allow for the user reading a filename, forgetting to chomp it,
1934 open my $foo, "$file\0";
1940 PERL_STATIC_INLINE bool
1941 S_should_warn_nl(const char *pv)
1945 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1949 return len > 0 && pv[len-1] == '\n';
1954 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1956 PERL_STATIC_INLINE bool
1957 S_lossless_NV_to_IV(const NV nv, IV *ivp)
1959 /* This function determines if the input NV 'nv' may be converted without
1960 * loss of data to an IV. If not, it returns FALSE taking no other action.
1961 * But if it is possible, it does the conversion, returning TRUE, and
1962 * storing the converted result in '*ivp' */
1964 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1966 # if defined(Perl_isnan)
1968 if (UNLIKELY(Perl_isnan(nv))) {
1974 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1978 if ((IV) nv != nv) {
1988 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1990 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
1992 #define MAX_CHARSET_NAME_LENGTH 2
1994 PERL_STATIC_INLINE const char *
1995 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
1997 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
1999 /* Returns a string that corresponds to the name of the regex character set
2000 * given by 'flags', and *lenp is set the length of that string, which
2001 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2004 switch (get_regex_charset(flags)) {
2005 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2006 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2007 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2008 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2009 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2011 return ASCII_MORE_RESTRICT_PAT_MODS;
2013 /* The NOT_REACHED; hides an assert() which has a rather complex
2014 * definition in perl.h. */
2015 NOT_REACHED; /* NOTREACHED */
2016 return "?"; /* Unknown */
2023 Return false if any get magic is on the SV other than taint magic.
2027 PERL_STATIC_INLINE bool
2028 Perl_sv_only_taint_gmagic(SV *sv)
2030 MAGIC *mg = SvMAGIC(sv);
2032 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2035 if (mg->mg_type != PERL_MAGIC_taint
2036 && !(mg->mg_flags & MGf_GSKIP)
2037 && mg->mg_virtual->svt_get) {
2040 mg = mg->mg_moremagic;
2046 /* ------------------ cop.h ------------------------------------------- */
2049 /* Enter a block. Push a new base context and return its address. */
2051 PERL_STATIC_INLINE PERL_CONTEXT *
2052 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2056 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2061 cx->blk_gimme = gimme;
2062 cx->blk_oldsaveix = saveix;
2063 cx->blk_oldsp = (I32)(sp - PL_stack_base);
2064 cx->blk_oldcop = PL_curcop;
2065 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
2066 cx->blk_oldscopesp = PL_scopestack_ix;
2067 cx->blk_oldpm = PL_curpm;
2068 cx->blk_old_tmpsfloor = PL_tmps_floor;
2070 PL_tmps_floor = PL_tmps_ix;
2071 CX_DEBUG(cx, "PUSH");
2076 /* Exit a block (RETURN and LAST). */
2078 PERL_STATIC_INLINE void
2079 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2081 PERL_ARGS_ASSERT_CX_POPBLOCK;
2083 CX_DEBUG(cx, "POP");
2084 /* these 3 are common to cx_popblock and cx_topblock */
2085 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2086 PL_scopestack_ix = cx->blk_oldscopesp;
2087 PL_curpm = cx->blk_oldpm;
2089 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2090 * and leaves a CX entry lying around for repeated use, so
2091 * skip for multicall */ \
2092 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2093 || PL_savestack_ix == cx->blk_oldsaveix);
2094 PL_curcop = cx->blk_oldcop;
2095 PL_tmps_floor = cx->blk_old_tmpsfloor;
2098 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2099 * Whereas cx_popblock() restores the state to the point just before
2100 * cx_pushblock() was called, cx_topblock() restores it to the point just
2101 * *after* cx_pushblock() was called. */
2103 PERL_STATIC_INLINE void
2104 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2106 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2108 CX_DEBUG(cx, "TOP");
2109 /* these 3 are common to cx_popblock and cx_topblock */
2110 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2111 PL_scopestack_ix = cx->blk_oldscopesp;
2112 PL_curpm = cx->blk_oldpm;
2114 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2118 PERL_STATIC_INLINE void
2119 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2121 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2123 PERL_ARGS_ASSERT_CX_PUSHSUB;
2125 PERL_DTRACE_PROBE_ENTRY(cv);
2126 cx->blk_sub.cv = cv;
2127 cx->blk_sub.olddepth = CvDEPTH(cv);
2128 cx->blk_sub.prevcomppad = PL_comppad;
2129 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2130 cx->blk_sub.retop = retop;
2131 SvREFCNT_inc_simple_void_NN(cv);
2132 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2136 /* subsets of cx_popsub() */
2138 PERL_STATIC_INLINE void
2139 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2143 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2144 assert(CxTYPE(cx) == CXt_SUB);
2146 PL_comppad = cx->blk_sub.prevcomppad;
2147 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2148 cv = cx->blk_sub.cv;
2149 CvDEPTH(cv) = cx->blk_sub.olddepth;
2150 cx->blk_sub.cv = NULL;
2155 /* handle the @_ part of leaving a sub */
2157 PERL_STATIC_INLINE void
2158 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2162 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2163 assert(CxTYPE(cx) == CXt_SUB);
2164 assert(AvARRAY(MUTABLE_AV(
2165 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2166 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2168 CX_POP_SAVEARRAY(cx);
2169 av = MUTABLE_AV(PAD_SVl(0));
2170 if (UNLIKELY(AvREAL(av)))
2171 /* abandon @_ if it got reified */
2172 clear_defarray(av, 0);
2179 PERL_STATIC_INLINE void
2180 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2182 PERL_ARGS_ASSERT_CX_POPSUB;
2183 assert(CxTYPE(cx) == CXt_SUB);
2185 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2189 cx_popsub_common(cx);
2193 PERL_STATIC_INLINE void
2194 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2196 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2198 cx->blk_format.cv = cv;
2199 cx->blk_format.retop = retop;
2200 cx->blk_format.gv = gv;
2201 cx->blk_format.dfoutgv = PL_defoutgv;
2202 cx->blk_format.prevcomppad = PL_comppad;
2205 SvREFCNT_inc_simple_void_NN(cv);
2207 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2211 PERL_STATIC_INLINE void
2212 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2217 PERL_ARGS_ASSERT_CX_POPFORMAT;
2218 assert(CxTYPE(cx) == CXt_FORMAT);
2220 dfout = cx->blk_format.dfoutgv;
2222 cx->blk_format.dfoutgv = NULL;
2223 SvREFCNT_dec_NN(dfout);
2225 PL_comppad = cx->blk_format.prevcomppad;
2226 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2227 cv = cx->blk_format.cv;
2228 cx->blk_format.cv = NULL;
2230 SvREFCNT_dec_NN(cv);
2234 PERL_STATIC_INLINE void
2235 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2237 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2239 cx->blk_eval.retop = retop;
2240 cx->blk_eval.old_namesv = namesv;
2241 cx->blk_eval.old_eval_root = PL_eval_root;
2242 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2243 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2244 cx->blk_eval.cur_top_env = PL_top_env;
2246 assert(!(PL_in_eval & ~ 0x3F));
2247 assert(!(PL_op->op_type & ~0x1FF));
2248 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2252 PERL_STATIC_INLINE void
2253 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2257 PERL_ARGS_ASSERT_CX_POPEVAL;
2258 assert(CxTYPE(cx) == CXt_EVAL);
2260 PL_in_eval = CxOLD_IN_EVAL(cx);
2261 assert(!(PL_in_eval & 0xc0));
2262 PL_eval_root = cx->blk_eval.old_eval_root;
2263 sv = cx->blk_eval.cur_text;
2264 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2265 cx->blk_eval.cur_text = NULL;
2266 SvREFCNT_dec_NN(sv);
2269 sv = cx->blk_eval.old_namesv;
2271 cx->blk_eval.old_namesv = NULL;
2272 SvREFCNT_dec_NN(sv);
2277 /* push a plain loop, i.e.
2279 * while (cond) { block }
2280 * for (init;cond;continue) { block }
2281 * This loop can be last/redo'ed etc.
2284 PERL_STATIC_INLINE void
2285 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2287 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2288 cx->blk_loop.my_op = cLOOP;
2292 /* push a true for loop, i.e.
2293 * for var (list) { block }
2296 PERL_STATIC_INLINE void
2297 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2299 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2301 /* this one line is common with cx_pushloop_plain */
2302 cx->blk_loop.my_op = cLOOP;
2304 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2305 cx->blk_loop.itersave = itersave;
2307 cx->blk_loop.oldcomppad = PL_comppad;
2312 /* pop all loop types, including plain */
2314 PERL_STATIC_INLINE void
2315 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2317 PERL_ARGS_ASSERT_CX_POPLOOP;
2319 assert(CxTYPE_is_LOOP(cx));
2320 if ( CxTYPE(cx) == CXt_LOOP_ARY
2321 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2323 /* Free ary or cur. This assumes that state_u.ary.ary
2324 * aligns with state_u.lazysv.cur. See cx_dup() */
2325 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2326 cx->blk_loop.state_u.lazysv.cur = NULL;
2327 SvREFCNT_dec_NN(sv);
2328 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2329 sv = cx->blk_loop.state_u.lazysv.end;
2330 cx->blk_loop.state_u.lazysv.end = NULL;
2331 SvREFCNT_dec_NN(sv);
2334 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2336 SV **svp = (cx)->blk_loop.itervar_u.svp;
2337 if ((cx->cx_type & CXp_FOR_GV))
2338 svp = &GvSV((GV*)svp);
2340 *svp = cx->blk_loop.itersave;
2341 cx->blk_loop.itersave = NULL;
2342 SvREFCNT_dec(cursv);
2347 PERL_STATIC_INLINE void
2348 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2350 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2352 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2356 PERL_STATIC_INLINE void
2357 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2359 PERL_ARGS_ASSERT_CX_POPWHEN;
2360 assert(CxTYPE(cx) == CXt_WHEN);
2362 PERL_UNUSED_ARG(cx);
2363 PERL_UNUSED_CONTEXT;
2364 /* currently NOOP */
2368 PERL_STATIC_INLINE void
2369 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2371 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2373 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2374 cx->blk_givwhen.defsv_save = orig_defsv;
2378 PERL_STATIC_INLINE void
2379 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2383 PERL_ARGS_ASSERT_CX_POPGIVEN;
2384 assert(CxTYPE(cx) == CXt_GIVEN);
2386 sv = GvSV(PL_defgv);
2387 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2388 cx->blk_givwhen.defsv_save = NULL;
2392 /* ------------------ util.h ------------------------------------------- */
2395 =head1 Miscellaneous Functions
2399 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2401 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2402 match themselves and their opposite case counterparts. Non-cased and non-ASCII
2403 range bytes match only themselves.
2408 PERL_STATIC_INLINE I32
2409 Perl_foldEQ(const char *s1, const char *s2, I32 len)
2411 const U8 *a = (const U8 *)s1;
2412 const U8 *b = (const U8 *)s2;
2414 PERL_ARGS_ASSERT_FOLDEQ;
2419 if (*a != *b && *a != PL_fold[*b])
2426 PERL_STATIC_INLINE I32
2427 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2429 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2430 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2431 * does not check for this. Nor does it check that the strings each have
2432 * at least 'len' characters. */
2434 const U8 *a = (const U8 *)s1;
2435 const U8 *b = (const U8 *)s2;
2437 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2442 if (*a != *b && *a != PL_fold_latin1[*b]) {
2451 =for apidoc foldEQ_locale
2453 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2454 same case-insensitively in the current locale; false otherwise.
2459 PERL_STATIC_INLINE I32
2460 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2463 const U8 *a = (const U8 *)s1;
2464 const U8 *b = (const U8 *)s2;
2466 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2471 if (*a != *b && *a != PL_fold_locale[*b])
2478 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2480 PERL_STATIC_INLINE void *
2481 S_my_memrchr(const char * s, const char c, const STRLEN len)
2483 /* memrchr(), since many platforms lack it */
2485 const char * t = s + len - 1;
2487 PERL_ARGS_ASSERT_MY_MEMRCHR;
2502 * ex: set ts=8 sts=4 sw=4 et: