This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
SvTRUE might need to take aTHX
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
c70927a6 42PERL_STATIC_INLINE SSize_t
c9182d9c 43Perl_av_top_index(pTHX_ AV *av)
be3a7a5d
KW
44{
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
47
48 return AvFILL(av);
49}
50
1afe1db1
FC
51/* ------------------------------- cv.h ------------------------------- */
52
ae77754a 53PERL_STATIC_INLINE GV *
c9182d9c 54Perl_CvGV(pTHX_ CV *sv)
ae77754a 55{
74804ad1
KW
56 PERL_ARGS_ASSERT_CVGV;
57
ae77754a
FC
58 return CvNAMED(sv)
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
61}
62
1afe1db1 63PERL_STATIC_INLINE I32 *
74804ad1 64Perl_CvDEPTH(const CV * const sv)
1afe1db1 65{
74804ad1 66 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 68
8de47657 69 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
70}
71
d16269d8
PM
72/*
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
81 */
82
5b67adb8 83#ifdef PERL_CORE
d16269d8
PM
84PERL_STATIC_INLINE char *
85S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
86{
87 SV * tmpsv;
88 char * tmps;
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
90 tmps = SvPVX(tmpsv);
91 while ((*len)--) {
92 if (!isSPACE(*orig))
93 *tmps++ = *orig;
94 orig++;
95 }
96 *tmps = '\0';
97 *len = tmps - SvPVX(tmpsv);
98 return SvPVX(tmpsv);
99}
5b67adb8 100#endif
d16269d8 101
25fdce4a
FC
102/* ------------------------------- mg.h ------------------------------- */
103
104#if defined(PERL_CORE) || defined(PERL_EXT)
105/* assumes get-magic and stringification have already occurred */
106PERL_STATIC_INLINE STRLEN
107S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
108{
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
113 else {
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
118 }
119}
120#endif
121
03414f05
FC
122/* ------------------------------- pad.h ------------------------------ */
123
124#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125PERL_STATIC_INLINE bool
b9d5702c 126S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 127{
b9d5702c
KW
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
129
03414f05
FC
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
135
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
138 if (
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
142 )
143 return TRUE;
144 }
145 else if (
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
147 ?
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
150
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
153 )
154 return TRUE;
155 return FALSE;
156}
157#endif
158
33a4312b
FC
159/* ------------------------------- pp.h ------------------------------- */
160
161PERL_STATIC_INLINE I32
c9182d9c 162Perl_TOPMARK(pTHX)
33a4312b
FC
163{
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 165 "MARK top %p %" IVdf "\n",
33a4312b
FC
166 PL_markstack_ptr,
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
169}
170
171PERL_STATIC_INLINE I32
c9182d9c 172Perl_POPMARK(pTHX)
33a4312b
FC
173{
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 175 "MARK pop %p %" IVdf "\n",
33a4312b
FC
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
180}
181
8d919b0a
FC
182/* ----------------------------- regexp.h ----------------------------- */
183
184PERL_STATIC_INLINE struct regexp *
c9182d9c 185Perl_ReANY(const REGEXP * const re)
8d919b0a 186{
df6b4bd5 187 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
188
189 PERL_ARGS_ASSERT_REANY;
8d919b0a 190 assert(isREGEXP(re));
bdef45de 191
df6b4bd5
DM
192 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
193 : (struct regexp *)p;
8d919b0a
FC
194}
195
27669aa4
FC
196/* ------------------------------- sv.h ------------------------------- */
197
a887b094 198PERL_STATIC_INLINE bool
9d0469db
PLE
199Perl_SvTRUE(pTHX_ SV *sv) {
200 if (!LIKELY(sv))
201 return FALSE;
202 return SvTRUE_NN(sv);
a887b094
PLE
203}
204
27669aa4 205PERL_STATIC_INLINE SV *
c9182d9c 206Perl_SvREFCNT_inc(SV *sv)
27669aa4 207{
2439e033 208 if (LIKELY(sv != NULL))
27669aa4
FC
209 SvREFCNT(sv)++;
210 return sv;
211}
212PERL_STATIC_INLINE SV *
c9182d9c 213Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 214{
3f2f854a
KW
215 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
216
27669aa4
FC
217 SvREFCNT(sv)++;
218 return sv;
219}
220PERL_STATIC_INLINE void
c9182d9c 221Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 222{
2439e033 223 if (LIKELY(sv != NULL))
27669aa4
FC
224 SvREFCNT(sv)++;
225}
75e16a44 226PERL_STATIC_INLINE void
c9182d9c 227Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 228{
2439e033 229 if (LIKELY(sv != NULL)) {
75a9bf96 230 U32 rc = SvREFCNT(sv);
79e2a32a 231 if (LIKELY(rc > 1))
75a9bf96
DM
232 SvREFCNT(sv) = rc - 1;
233 else
234 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
235 }
236}
541377b1
FC
237
238PERL_STATIC_INLINE void
c9182d9c 239Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
240{
241 U32 rc = SvREFCNT(sv);
3f2f854a
KW
242
243 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
244
79e2a32a 245 if (LIKELY(rc > 1))
4a9a56a7
DM
246 SvREFCNT(sv) = rc - 1;
247 else
248 Perl_sv_free2(aTHX_ sv, rc);
249}
250
251PERL_STATIC_INLINE void
1bd041dc 252Perl_SvAMAGIC_on(SV *sv)
541377b1 253{
1bd041dc 254 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 255 assert(SvROK(sv));
1bd041dc 256
541377b1
FC
257 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
258}
259PERL_STATIC_INLINE void
1bd041dc 260Perl_SvAMAGIC_off(SV *sv)
541377b1 261{
1bd041dc
KW
262 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
263
541377b1
FC
264 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
265 HvAMAGIC_off(SvSTASH(SvRV(sv)));
266}
267
268PERL_STATIC_INLINE U32
c9182d9c 269Perl_SvPADSTALE_on(SV *sv)
541377b1 270{
c0683843 271 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
272 return SvFLAGS(sv) |= SVs_PADSTALE;
273}
274PERL_STATIC_INLINE U32
c9182d9c 275Perl_SvPADSTALE_off(SV *sv)
541377b1 276{
c0683843 277 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
278 return SvFLAGS(sv) &= ~SVs_PADSTALE;
279}
25fdce4a 280#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 281PERL_STATIC_INLINE STRLEN
6964422a 282S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 283{
25fdce4a 284 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
285 if (SvGAMAGIC(sv)) {
286 U8 *hopped = utf8_hop((U8 *)pv, pos);
287 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
288 return (STRLEN)(hopped - (U8 *)pv);
289 }
290 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
291}
292#endif
f019c49e 293
d1decf2b
TC
294/* ------------------------------- handy.h ------------------------------- */
295
296/* saves machine code for a common noreturn idiom typically used in Newx*() */
7347ee54 297GCC_DIAG_IGNORE_DECL(-Wunused-function);
d1decf2b 298static void
c9182d9c 299Perl_croak_memory_wrap(void)
d1decf2b
TC
300{
301 Perl_croak_nocontext("%s",PL_memory_wrap);
302}
7347ee54 303GCC_DIAG_RESTORE_DECL;
d1decf2b 304
a8a2ceaa
KW
305/* ------------------------------- utf8.h ------------------------------- */
306
2fe720e2
KW
307/*
308=head1 Unicode Support
309*/
310
55d09dc8 311PERL_STATIC_INLINE void
c9182d9c 312Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
313{
314 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
315 * encoded string at '*dest', updating '*dest' to include it */
316
55d09dc8
KW
317 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
318
6f2d5cbc 319 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 320 *((*dest)++) = byte;
55d09dc8 321 else {
a09ec51a
KW
322 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
323 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
324 }
325}
326
e123187a 327/*
2fe720e2 328=for apidoc valid_utf8_to_uvchr
09232555
KW
329Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
330known that the next character in the input UTF-8 string C<s> is well-formed
331(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
332points, and non-Unicode code points are allowed.
2fe720e2
KW
333
334=cut
335
336 */
337
338PERL_STATIC_INLINE UV
339Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
340{
c41b2540 341 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
342 const U8* send = s + expectlen;
343 UV uv = *s;
344
345 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
346
347 if (retlen) {
348 *retlen = expectlen;
349 }
350
351 /* An invariant is trivially returned */
352 if (expectlen == 1) {
353 return uv;
354 }
355
356 /* Remove the leading bits that indicate the number of bytes, leaving just
357 * the bits that are part of the value */
358 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
359
360 /* Now, loop through the remaining bytes, accumulating each into the
361 * working total as we go. (I khw tried unrolling the loop for up to 4
362 * bytes, but there was no performance improvement) */
363 for (++s; s < send; s++) {
364 uv = UTF8_ACCUMULATE(uv, *s);
365 }
366
367 return UNI_TO_NATIVE(uv);
368
369}
370
1e599354
KW
371/*
372=for apidoc is_utf8_invariant_string
373
82c5d941 374Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 375regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
376EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
377are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
378the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
379characters are invariant, but so also are the C1 controls.
1e599354
KW
380
381If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
382use this option, that C<s> can't have embedded C<NUL> characters and has to
383have a terminating C<NUL> byte).
384
9f2abfde
KW
385See also
386C<L</is_utf8_string>>,
387C<L</is_utf8_string_flags>>,
388C<L</is_utf8_string_loc>>,
389C<L</is_utf8_string_loc_flags>>,
390C<L</is_utf8_string_loclen>>,
391C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
392C<L</is_utf8_fixed_width_buf_flags>>,
393C<L</is_utf8_fixed_width_buf_loc_flags>>,
394C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
395C<L</is_strict_utf8_string>>,
396C<L</is_strict_utf8_string_loc>>,
397C<L</is_strict_utf8_string_loclen>>,
398C<L</is_c9strict_utf8_string>>,
399C<L</is_c9strict_utf8_string_loc>>,
400and
401C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
402
403=cut
0cbf5865
KW
404
405*/
406
407#define is_utf8_invariant_string(s, len) \
408 is_utf8_invariant_string_loc(s, len, NULL)
409
410/*
411=for apidoc is_utf8_invariant_string_loc
412
413Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
414the first UTF-8 variant character in the C<ep> pointer; if all characters are
415UTF-8 invariant, this function does not change the contents of C<*ep>.
416
417=cut
418
1e599354
KW
419*/
420
421PERL_STATIC_INLINE bool
c9182d9c 422Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 423{
e17544a6 424 const U8* send;
1e599354
KW
425 const U8* x = s;
426
0cbf5865
KW
427 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
428
e17544a6
KW
429 if (len == 0) {
430 len = strlen((const char *)s);
431 }
432
433 send = s + len;
434
4ab2fd9b 435/* This looks like 0x010101... */
2c5c8af5 436# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
437
438/* This looks like 0x808080... */
2c5c8af5 439# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 440# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 441# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 442
099e59a4
KW
443/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
444 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
445 * optimized out completely on a 32-bit system, and its mask gets optimized out
446 * on a 64-bit system */
2c5c8af5 447# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
448 | ( PTR2nat(x) >> 1) \
449 | ( ( (PTR2nat(x) \
450 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 451
3f515a2e
KW
452#ifndef EBCDIC
453
099e59a4
KW
454 /* Do the word-at-a-time iff there is at least one usable full word. That
455 * means that after advancing to a word boundary, there still is at least a
456 * full word left. The number of bytes needed to advance is 'wordsize -
457 * offset' unless offset is 0. */
458 if ((STRLEN) (send - x) >= PERL_WORDSIZE
459
460 /* This term is wordsize if subword; 0 if not */
461 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
462
463 /* 'offset' */
464 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
465 {
b40579ff 466
46bb68f6
KW
467 /* Process per-byte until reach word boundary. XXX This loop could be
468 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 469 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
470 if (! UTF8_IS_INVARIANT(*x)) {
471 if (ep) {
472 *ep = x;
473 }
e17544a6 474
46bb68f6
KW
475 return FALSE;
476 }
477 x++;
e17544a6 478 }
e17544a6 479
099e59a4
KW
480 /* Here, we know we have at least one full word to process. Process
481 * per-word as long as we have at least a full word left */
482 do {
4ab2fd9b 483 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 484
46bb68f6
KW
485 /* Found a variant. Just return if caller doesn't want its
486 * exact position */
487 if (! ep) {
488 return FALSE;
489 }
e17544a6 490
2c5c8af5
KW
491# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
492 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 493
73f0a2eb 494 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
495 assert(*ep >= s && *ep < send);
496
497 return FALSE;
498
2c5c8af5 499# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
500 checks. */
501
46bb68f6 502 break;
2c5c8af5 503# endif
46bb68f6 504 }
1d2af574 505
46bb68f6 506 x += PERL_WORDSIZE;
1d2af574 507
099e59a4 508 } while (x + PERL_WORDSIZE <= send);
b40579ff 509 }
e17544a6 510
0b08cab0 511#endif /* End of ! EBCDIC */
e17544a6
KW
512
513 /* Process per-byte */
514 while (x < send) {
515 if (! UTF8_IS_INVARIANT(*x)) {
516 if (ep) {
517 *ep = x;
518 }
0cbf5865 519
e17544a6 520 return FALSE;
0cbf5865 521 }
1e599354 522
e17544a6 523 x++;
1e599354
KW
524 }
525
526 return TRUE;
527}
528
23a7ee81
KW
529#ifndef EBCDIC
530
1d2af574 531PERL_STATIC_INLINE unsigned int
73f0a2eb 532Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
533{
534
535 /* This returns the position in a word (0..7) of the first variant byte in
536 * it. This is a helper function. Note that there are no branches */
537
538 assert(word);
539
540 /* Get just the msb bits of each byte */
541 word &= PERL_VARIANTS_WORD_MASK;
542
7adf2470 543# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
544
545 /* Bytes are stored like
546 * Byte8 ... Byte2 Byte1
547 * 63..56...15...8 7...0
548 *
549 * Isolate the lsb;
550 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
551 *
552 * The word will look this this, with a rightmost set bit in position 's':
553 * ('x's are don't cares)
554 * s
555 * x..x100..0
556 * x..xx10..0 Right shift (rightmost 0 is shifted off)
557 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
558 * the 1 just to their left into a 0; the remainder is
559 * untouched
4fa92663
KW
560 * 0..0011..1 The xor with the original, x..xx10..0, clears that
561 * remainder, sets the bottom to all 1
1d2af574
KW
562 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
563 *
564 * Another method is to do 'word &= -word'; but it generates a compiler
565 * message on some platforms about taking the negative of an unsigned */
566
567 word >>= 1;
568 word = 1 + (word ^ (word - 1));
569
570# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
571
572 /* Bytes are stored like
573 * Byte1 Byte2 ... Byte8
574 * 63..56 55..47 ... 7...0
575 *
576 * Isolate the msb; http://codeforces.com/blog/entry/10330
577 *
578 * Only the most significant set bit matters. Or'ing word with its right
579 * shift of 1 makes that bit and the next one to its right both 1. Then
580 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
581 * msb and all to the right being 1. */
582 word |= word >> 1;
583 word |= word >> 2;
584 word |= word >> 4;
585 word |= word >> 8;
586 word |= word >> 16;
587 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
588
589 /* Then subtracting the right shift by 1 clears all but the left-most of
590 * the 1 bits, which is our desired result */
591 word -= (word >> 1);
592
593# else
594# error Unexpected byte order
595# endif
596
7cf2d6c7
KW
597 /* Here 'word' has a single bit set: the msb of the first byte in which it
598 * is set. Calculate that position in the word. We can use this
1d2af574 599 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
600 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
601 * just get shifted off at compile time) */
602 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
603 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
604 | (39 << 24) | (47 << 16)
605 | (55 << 8) | (63 << 0));
1d2af574
KW
606 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
607
608 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
609 word = ((word + 1) >> 3) - 1;
610
611# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
612
613 /* And invert the result */
614 word = CHARBITS - word - 1;
615
616# endif
617
618 return (unsigned int) word;
619}
620
23a7ee81 621#endif
03c1e4ab
KW
622#if defined(PERL_CORE) || defined(PERL_EXT)
623
624/*
625=for apidoc variant_under_utf8_count
626
627This function looks at the sequence of bytes between C<s> and C<e>, which are
628assumed to be encoded in ASCII/Latin1, and returns how many of them would
629change should the string be translated into UTF-8. Due to the nature of UTF-8,
630each of these would occupy two bytes instead of the single one in the input
631string. Thus, this function returns the precise number of bytes the string
632would expand by when translated to UTF-8.
633
634Unlike most of the other functions that have C<utf8> in their name, the input
635to this function is NOT a UTF-8-encoded string. The function name is slightly
636I<odd> to emphasize this.
637
638This function is internal to Perl because khw thinks that any XS code that
639would want this is probably operating too close to the internals. Presenting a
640valid use case could change that.
641
642See also
643C<L<perlapi/is_utf8_invariant_string>>
644and
645C<L<perlapi/is_utf8_invariant_string_loc>>,
646
647=cut
648
649*/
650
651PERL_STATIC_INLINE Size_t
652S_variant_under_utf8_count(const U8* const s, const U8* const e)
653{
654 const U8* x = s;
655 Size_t count = 0;
656
657 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
658
659# ifndef EBCDIC
660
5d0379de
KW
661 /* Test if the string is long enough to use word-at-a-time. (Logic is the
662 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
663 if ((STRLEN) (e - x) >= PERL_WORDSIZE
664 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
665 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
666 {
667
668 /* Process per-byte until reach word boundary. XXX This loop could be
669 * eliminated if we knew that this platform had fast unaligned reads */
670 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
671 count += ! UTF8_IS_INVARIANT(*x++);
672 }
673
674 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
675 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
676 explanation of how this works */
e5863284
KW
677 PERL_UINTMAX_T increment
678 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
679 * PERL_COUNT_MULTIPLIER)
680 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 681 count += (Size_t) increment;
03c1e4ab
KW
682 x += PERL_WORDSIZE;
683 } while (x + PERL_WORDSIZE <= e);
684 }
685
686# endif
687
688 /* Process per-byte */
689 while (x < e) {
690 if (! UTF8_IS_INVARIANT(*x)) {
691 count++;
692 }
693
694 x++;
695 }
696
697 return count;
698}
699
700#endif
701
aff4cafe
KW
702#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
703# undef PERL_WORDSIZE
704# undef PERL_COUNT_MULTIPLIER
705# undef PERL_WORD_BOUNDARY_MASK
706# undef PERL_VARIANTS_WORD_MASK
707#endif
03c1e4ab 708
7c93d8f0 709/*
5ff889fb
KW
710=for apidoc is_utf8_string
711
82c5d941
KW
712Returns TRUE if the first C<len> bytes of string C<s> form a valid
713Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
714be calculated using C<strlen(s)> (which means if you use this option, that C<s>
715can't have embedded C<NUL> characters and has to have a terminating C<NUL>
716byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
717
2717076a
KW
718This function considers Perl's extended UTF-8 to be valid. That means that
719code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
720considered valid by this function. Use C<L</is_strict_utf8_string>>,
721C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
722code points are considered valid.
5ff889fb 723
9f2abfde
KW
724See also
725C<L</is_utf8_invariant_string>>,
0cbf5865 726C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
727C<L</is_utf8_string_loc>>,
728C<L</is_utf8_string_loclen>>,
8bc127bf
KW
729C<L</is_utf8_fixed_width_buf_flags>>,
730C<L</is_utf8_fixed_width_buf_loc_flags>>,
731C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
732
733=cut
734*/
735
dd237e82 736#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 737
c9cd936b
KW
738#if defined(PERL_CORE) || defined (PERL_EXT)
739
740/*
741=for apidoc is_utf8_non_invariant_string
742
743Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
744C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
745UTF-8; otherwise returns FALSE.
746
747A TRUE return means that at least one code point represented by the sequence
748either is a wide character not representable as a single byte, or the
749representation differs depending on whether the sequence is encoded in UTF-8 or
750not.
751
752See also
753C<L<perlapi/is_utf8_invariant_string>>,
754C<L<perlapi/is_utf8_string>>
755
756=cut
757
758This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
759It generally needn't be if its string is entirely UTF-8 invariant, and it
760shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
761
762It is an internal function because khw thinks that XS code shouldn't be working
763at this low a level. A valid use case could change that.
764
765*/
766
767PERL_STATIC_INLINE bool
86a87e17 768Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
769{
770 const U8 * first_variant;
771
772 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
773
774 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
775 return FALSE;
776 }
777
778 return is_utf8_string(first_variant, len - (first_variant - s));
779}
780
781#endif
782
5ff889fb 783/*
9f2abfde
KW
784=for apidoc is_strict_utf8_string
785
786Returns TRUE if the first C<len> bytes of string C<s> form a valid
787UTF-8-encoded string that is fully interchangeable by any application using
788Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
789calculated using C<strlen(s)> (which means if you use this option, that C<s>
790can't have embedded C<NUL> characters and has to have a terminating C<NUL>
791byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
792
793This function returns FALSE for strings containing any
794code points above the Unicode max of 0x10FFFF, surrogate code points, or
795non-character code points.
796
797See also
798C<L</is_utf8_invariant_string>>,
0cbf5865 799C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
800C<L</is_utf8_string>>,
801C<L</is_utf8_string_flags>>,
802C<L</is_utf8_string_loc>>,
803C<L</is_utf8_string_loc_flags>>,
804C<L</is_utf8_string_loclen>>,
805C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
806C<L</is_utf8_fixed_width_buf_flags>>,
807C<L</is_utf8_fixed_width_buf_loc_flags>>,
808C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
809C<L</is_strict_utf8_string_loc>>,
810C<L</is_strict_utf8_string_loclen>>,
811C<L</is_c9strict_utf8_string>>,
812C<L</is_c9strict_utf8_string_loc>>,
813and
814C<L</is_c9strict_utf8_string_loclen>>.
815
816=cut
817*/
818
dd237e82 819#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
820
821/*
822=for apidoc is_c9strict_utf8_string
823
824Returns TRUE if the first C<len> bytes of string C<s> form a valid
825UTF-8-encoded string that conforms to
826L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
827otherwise it returns FALSE. If C<len> is 0, it will be calculated using
828C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
829C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
830characters being ASCII constitute 'a valid UTF-8 string'.
831
832This function returns FALSE for strings containing any code points above the
833Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
834code points per
835L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
836
837See also
838C<L</is_utf8_invariant_string>>,
0cbf5865 839C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
840C<L</is_utf8_string>>,
841C<L</is_utf8_string_flags>>,
842C<L</is_utf8_string_loc>>,
843C<L</is_utf8_string_loc_flags>>,
844C<L</is_utf8_string_loclen>>,
845C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
846C<L</is_utf8_fixed_width_buf_flags>>,
847C<L</is_utf8_fixed_width_buf_loc_flags>>,
848C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
849C<L</is_strict_utf8_string>>,
850C<L</is_strict_utf8_string_loc>>,
851C<L</is_strict_utf8_string_loclen>>,
852C<L</is_c9strict_utf8_string_loc>>,
853and
854C<L</is_c9strict_utf8_string_loclen>>.
855
856=cut
857*/
858
dd237e82 859#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
860
861/*
862=for apidoc is_utf8_string_flags
863
864Returns TRUE if the first C<len> bytes of string C<s> form a valid
865UTF-8 string, subject to the restrictions imposed by C<flags>;
866returns FALSE otherwise. If C<len> is 0, it will be calculated
867using C<strlen(s)> (which means if you use this option, that C<s> can't have
868embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
869that all characters being ASCII constitute 'a valid UTF-8 string'.
870
871If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
872C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
873as C<L</is_strict_utf8_string>>; and if C<flags> is
874C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
875C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
876combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
877C<L</utf8n_to_uvchr>>, with the same meanings.
878
879See also
880C<L</is_utf8_invariant_string>>,
0cbf5865 881C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
882C<L</is_utf8_string>>,
883C<L</is_utf8_string_loc>>,
884C<L</is_utf8_string_loc_flags>>,
885C<L</is_utf8_string_loclen>>,
886C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
887C<L</is_utf8_fixed_width_buf_flags>>,
888C<L</is_utf8_fixed_width_buf_loc_flags>>,
889C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
890C<L</is_strict_utf8_string>>,
891C<L</is_strict_utf8_string_loc>>,
892C<L</is_strict_utf8_string_loclen>>,
893C<L</is_c9strict_utf8_string>>,
894C<L</is_c9strict_utf8_string_loc>>,
895and
896C<L</is_c9strict_utf8_string_loclen>>.
897
898=cut
899*/
900
901PERL_STATIC_INLINE bool
c9182d9c 902Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 903{
33756530 904 const U8 * first_variant;
9f2abfde
KW
905
906 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
907 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 908 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 909
f60f61fd
KW
910 if (len == 0) {
911 len = strlen((const char *)s);
912 }
913
9f2abfde
KW
914 if (flags == 0) {
915 return is_utf8_string(s, len);
916 }
917
d044b7a7 918 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
919 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
920 {
921 return is_strict_utf8_string(s, len);
922 }
923
d044b7a7 924 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
925 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
926 {
927 return is_c9strict_utf8_string(s, len);
928 }
929
33756530
KW
930 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
931 const U8* const send = s + len;
932 const U8* x = first_variant;
933
a0d7f935
KW
934 while (x < send) {
935 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
936 if (UNLIKELY(! cur_len)) {
937 return FALSE;
938 }
939 x += cur_len;
9f2abfde 940 }
33756530 941 }
9f2abfde
KW
942
943 return TRUE;
944}
945
946/*
5ff889fb
KW
947
948=for apidoc is_utf8_string_loc
949
2717076a 950Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 951case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 952"utf8ness success") in the C<ep> pointer.
5ff889fb 953
2717076a 954See also C<L</is_utf8_string_loclen>>.
5ff889fb 955
3964c812
KW
956=cut
957*/
958
959#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
960
961/*
962
5ff889fb
KW
963=for apidoc is_utf8_string_loclen
964
2717076a 965Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 966case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 967"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 968encoded characters in the C<el> pointer.
5ff889fb 969
2717076a 970See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
971
972=cut
973*/
974
56e4cf64 975PERL_STATIC_INLINE bool
33756530 976Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 977{
33756530 978 const U8 * first_variant;
5ff889fb
KW
979
980 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
981
33756530
KW
982 if (len == 0) {
983 len = strlen((const char *) s);
984 }
985
986 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
987 if (el)
988 *el = len;
989
990 if (ep) {
991 *ep = s + len;
992 }
993
994 return TRUE;
995 }
996
997 {
998 const U8* const send = s + len;
999 const U8* x = first_variant;
1000 STRLEN outlen = first_variant - s;
1001
a0d7f935
KW
1002 while (x < send) {
1003 const STRLEN cur_len = isUTF8_CHAR(x, send);
1004 if (UNLIKELY(! cur_len)) {
1005 break;
1006 }
1007 x += cur_len;
1008 outlen++;
5ff889fb 1009 }
5ff889fb 1010
a0d7f935
KW
1011 if (el)
1012 *el = outlen;
5ff889fb 1013
a0d7f935
KW
1014 if (ep) {
1015 *ep = x;
1016 }
5ff889fb 1017
a0d7f935 1018 return (x == send);
33756530 1019 }
5ff889fb
KW
1020}
1021
1022/*
9f2abfde 1023
44170c9a 1024=for apidoc isUTF8_CHAR
8ed185f9
KW
1025
1026Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1027looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1028that represents some code point; otherwise it evaluates to 0. If non-zero, the
1029value gives how many bytes starting at C<s> comprise the code point's
1030representation. Any bytes remaining before C<e>, but beyond the ones needed to
1031form the first code point in C<s>, are not examined.
1032
13aab5dd 1033The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1034extension to official UTF-8 to represent those higher than the Unicode maximum
1035of 0x10FFFF. That means that this macro is used to efficiently decide if the
1036next few bytes in C<s> is legal UTF-8 for a single character.
1037
1038Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1039defined by Unicode to be fully interchangeable across applications;
1040C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1041#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1042code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1043
1044Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1045C<L</is_utf8_string_loclen>> to check entire strings.
1046
13aab5dd
KW
1047Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1048machines) is a valid UTF-8 character.
8ed185f9
KW
1049
1050=cut
1051
1052This uses an adaptation of the table and algorithm given in
f6521f7c 1053https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1054documentation of the original version. A copyright notice for the original
1055version is given at the beginning of this file. The Perl adapation is
71525f77 1056documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1057
1058*/
1059
1060PERL_STATIC_INLINE Size_t
c9182d9c 1061Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1062{
1063 const U8 * s = s0;
1064 UV state = 0;
1065
1066 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1067
1068 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1069 * code point, which can be returned immediately. Otherwise, it is either
1070 * malformed, or for the start byte FF which the dfa doesn't handle (except
1071 * on 32-bit ASCII platforms where it trivially is an error). Call a
1072 * helper function for the other platforms. */
1073
1074 while (s < e && LIKELY(state != 1)) {
71525f77 1075 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1076 + state
71525f77 1077 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1078 if (state != 0) {
1079 s++;
1080 continue;
1081 }
1082
1083 return s - s0 + 1;
1084 }
1085
1086#if defined(UV_IS_QUAD) || defined(EBCDIC)
1087
1088 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1089 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1090 }
1091
1092#endif
1093
1094 return 0;
1095}
1096
1097/*
1098
67049a5f
KW
1099=for apidoc isSTRICT_UTF8_CHAR
1100
1101Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1102looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1103Unicode code point completely acceptable for open interchange between all
1104applications; otherwise it evaluates to 0. If non-zero, the value gives how
1105many bytes starting at C<s> comprise the code point's representation. Any
1106bytes remaining before C<e>, but beyond the ones needed to form the first code
1107point in C<s>, are not examined.
1108
1109The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1110be a surrogate nor a non-character code point. Thus this excludes any code
1111point from Perl's extended UTF-8.
1112
1113This is used to efficiently decide if the next few bytes in C<s> is
1114legal Unicode-acceptable UTF-8 for a single character.
1115
1116Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1117#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1118code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1119and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1120
1121Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1122C<L</is_strict_utf8_string_loclen>> to check entire strings.
1123
1124=cut
1125
1126This uses an adaptation of the tables and algorithm given in
f6521f7c 1127https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1128documentation of the original version. A copyright notice for the original
1129version is given at the beginning of this file. The Perl adapation is
1130documented at the definition of strict_extended_utf8_dfa_tab[].
1131
1132*/
1133
1134PERL_STATIC_INLINE Size_t
c9182d9c 1135Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1136{
1137 const U8 * s = s0;
1138 UV state = 0;
1139
1140 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1141
1142 while (s < e && LIKELY(state != 1)) {
71525f77 1143 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1144
1145 if (state != 0) {
1146 s++;
1147 continue;
1148 }
1149
1150 return s - s0 + 1;
1151 }
1152
1153#ifndef EBCDIC
1154
1155 /* The dfa above drops out for certain Hanguls; handle them specially */
1156 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1157 return 3;
1158 }
1159
1160#endif
1161
1162 return 0;
1163}
1164
1165/*
1166
44170c9a 1167=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1168
1169Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1170looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1171Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1172the value gives how many bytes starting at C<s> comprise the code point's
1173representation. Any bytes remaining before C<e>, but beyond the ones needed to
1174form the first code point in C<s>, are not examined.
1175
1176The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1177differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1178code points. This corresponds to
1179L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1180which said that non-character code points are merely discouraged rather than
1181completely forbidden in open interchange. See
1182L<perlunicode/Noncharacter code points>.
1183
1184Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1185C<L</isUTF8_CHAR_flags>> for a more customized definition.
1186
1187Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1188C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1189
1190=cut
1191
1192This uses an adaptation of the tables and algorithm given in
f6521f7c 1193https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1194documentation of the original version. A copyright notice for the original
1195version is given at the beginning of this file. The Perl adapation is
71525f77 1196documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1197
1198*/
1199
1200PERL_STATIC_INLINE Size_t
c9182d9c 1201Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1202{
1203 const U8 * s = s0;
1204 UV state = 0;
1205
1206 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1207
1208 while (s < e && LIKELY(state != 1)) {
71525f77 1209 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1210
1211 if (state != 0) {
1212 s++;
1213 continue;
1214 }
1215
1216 return s - s0 + 1;
1217 }
1218
1219 return 0;
1220}
1221
1222/*
1223
9f2abfde
KW
1224=for apidoc is_strict_utf8_string_loc
1225
1226Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1227case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1228"utf8ness success") in the C<ep> pointer.
1229
1230See also C<L</is_strict_utf8_string_loclen>>.
1231
1232=cut
1233*/
1234
1235#define is_strict_utf8_string_loc(s, len, ep) \
1236 is_strict_utf8_string_loclen(s, len, ep, 0)
1237
1238/*
1239
1240=for apidoc is_strict_utf8_string_loclen
1241
1242Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1243case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1244"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1245encoded characters in the C<el> pointer.
1246
1247See also C<L</is_strict_utf8_string_loc>>.
1248
1249=cut
1250*/
1251
1252PERL_STATIC_INLINE bool
c9182d9c 1253Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1254{
33756530 1255 const U8 * first_variant;
9f2abfde
KW
1256
1257 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1258
33756530
KW
1259 if (len == 0) {
1260 len = strlen((const char *) s);
1261 }
1262
1263 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1264 if (el)
1265 *el = len;
1266
1267 if (ep) {
1268 *ep = s + len;
1269 }
1270
1271 return TRUE;
1272 }
1273
1274 {
1275 const U8* const send = s + len;
1276 const U8* x = first_variant;
1277 STRLEN outlen = first_variant - s;
1278
a0d7f935
KW
1279 while (x < send) {
1280 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1281 if (UNLIKELY(! cur_len)) {
1282 break;
1283 }
1284 x += cur_len;
1285 outlen++;
9f2abfde 1286 }
9f2abfde 1287
a0d7f935
KW
1288 if (el)
1289 *el = outlen;
9f2abfde 1290
a0d7f935
KW
1291 if (ep) {
1292 *ep = x;
1293 }
9f2abfde 1294
a0d7f935 1295 return (x == send);
33756530 1296 }
9f2abfde
KW
1297}
1298
1299/*
1300
1301=for apidoc is_c9strict_utf8_string_loc
1302
1303Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1304the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1305"utf8ness success") in the C<ep> pointer.
1306
1307See also C<L</is_c9strict_utf8_string_loclen>>.
1308
1309=cut
1310*/
1311
1312#define is_c9strict_utf8_string_loc(s, len, ep) \
1313 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1314
1315/*
1316
1317=for apidoc is_c9strict_utf8_string_loclen
1318
1319Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1320the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1321"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1322characters in the C<el> pointer.
1323
1324See also C<L</is_c9strict_utf8_string_loc>>.
1325
1326=cut
1327*/
1328
1329PERL_STATIC_INLINE bool
c9182d9c 1330Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1331{
33756530 1332 const U8 * first_variant;
9f2abfde
KW
1333
1334 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1335
33756530
KW
1336 if (len == 0) {
1337 len = strlen((const char *) s);
1338 }
1339
1340 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1341 if (el)
1342 *el = len;
1343
1344 if (ep) {
1345 *ep = s + len;
1346 }
1347
1348 return TRUE;
1349 }
1350
1351 {
1352 const U8* const send = s + len;
1353 const U8* x = first_variant;
1354 STRLEN outlen = first_variant - s;
1355
a0d7f935
KW
1356 while (x < send) {
1357 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1358 if (UNLIKELY(! cur_len)) {
1359 break;
1360 }
1361 x += cur_len;
1362 outlen++;
9f2abfde 1363 }
9f2abfde 1364
a0d7f935
KW
1365 if (el)
1366 *el = outlen;
9f2abfde 1367
a0d7f935
KW
1368 if (ep) {
1369 *ep = x;
1370 }
9f2abfde 1371
a0d7f935 1372 return (x == send);
33756530 1373 }
9f2abfde
KW
1374}
1375
1376/*
1377
1378=for apidoc is_utf8_string_loc_flags
1379
1380Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1381case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1382"utf8ness success") in the C<ep> pointer.
1383
1384See also C<L</is_utf8_string_loclen_flags>>.
1385
1386=cut
1387*/
1388
1389#define is_utf8_string_loc_flags(s, len, ep, flags) \
1390 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1391
1392
1393/* The above 3 actual functions could have been moved into the more general one
1394 * just below, and made #defines that call it with the right 'flags'. They are
1395 * currently kept separate to increase their chances of getting inlined */
1396
1397/*
1398
1399=for apidoc is_utf8_string_loclen_flags
1400
1401Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1402case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1403"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1404encoded characters in the C<el> pointer.
1405
1406See also C<L</is_utf8_string_loc_flags>>.
1407
1408=cut
1409*/
1410
1411PERL_STATIC_INLINE bool
c9182d9c 1412Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1413{
33756530 1414 const U8 * first_variant;
9f2abfde
KW
1415
1416 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1417 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1418 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1419
f60f61fd 1420 if (len == 0) {
a0d7f935 1421 len = strlen((const char *) s);
f60f61fd
KW
1422 }
1423
9f2abfde
KW
1424 if (flags == 0) {
1425 return is_utf8_string_loclen(s, len, ep, el);
1426 }
1427
d044b7a7 1428 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1429 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1430 {
1431 return is_strict_utf8_string_loclen(s, len, ep, el);
1432 }
1433
d044b7a7 1434 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1435 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1436 {
1437 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1438 }
1439
33756530
KW
1440 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1441 if (el)
1442 *el = len;
1443
1444 if (ep) {
1445 *ep = s + len;
1446 }
1447
1448 return TRUE;
1449 }
1450
1451 {
1452 const U8* send = s + len;
1453 const U8* x = first_variant;
1454 STRLEN outlen = first_variant - s;
1455
a0d7f935
KW
1456 while (x < send) {
1457 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1458 if (UNLIKELY(! cur_len)) {
1459 break;
1460 }
1461 x += cur_len;
1462 outlen++;
9f2abfde 1463 }
9f2abfde 1464
a0d7f935
KW
1465 if (el)
1466 *el = outlen;
9f2abfde 1467
a0d7f935
KW
1468 if (ep) {
1469 *ep = x;
1470 }
9f2abfde 1471
a0d7f935 1472 return (x == send);
33756530 1473 }
9f2abfde
KW
1474}
1475
1476/*
7c93d8f0
KW
1477=for apidoc utf8_distance
1478
1479Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1480and C<b>.
1481
1482WARNING: use only if you *know* that the pointers point inside the
1483same UTF-8 buffer.
1484
1485=cut
1486*/
1487
1488PERL_STATIC_INLINE IV
1489Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1490{
1491 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1492
1493 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1494}
1495
1496/*
1497=for apidoc utf8_hop
1498
1499Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1500forward or backward.
1501
1502WARNING: do not use the following unless you *know* C<off> is within
1503the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1504on the first byte of character or just after the last byte of a character.
1505
1506=cut
1507*/
1508
1509PERL_STATIC_INLINE U8 *
1510Perl_utf8_hop(const U8 *s, SSize_t off)
1511{
1512 PERL_ARGS_ASSERT_UTF8_HOP;
1513
1514 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1515 * the bitops (especially ~) can create illegal UTF-8.
1516 * In other words: in Perl UTF-8 is not just for Unicode. */
1517
1518 if (off >= 0) {
1519 while (off--)
1520 s += UTF8SKIP(s);
1521 }
1522 else {
1523 while (off++) {
1524 s--;
1525 while (UTF8_IS_CONTINUATION(*s))
1526 s--;
1527 }
1528 }
e099ea69 1529 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1530 return (U8 *)s;
e099ea69 1531 GCC_DIAG_RESTORE
7c93d8f0
KW
1532}
1533
4dab108f 1534/*
65df57a8
TC
1535=for apidoc utf8_hop_forward
1536
1537Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1538forward.
1539
1540C<off> must be non-negative.
1541
1542C<s> must be before or equal to C<end>.
1543
1544When moving forward it will not move beyond C<end>.
1545
1546Will not exceed this limit even if the string is not valid "UTF-8".
1547
1548=cut
1549*/
1550
1551PERL_STATIC_INLINE U8 *
1552Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1553{
1554 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1555
1556 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1557 * the bitops (especially ~) can create illegal UTF-8.
1558 * In other words: in Perl UTF-8 is not just for Unicode. */
1559
1560 assert(s <= end);
1561 assert(off >= 0);
1562
1563 while (off--) {
1564 STRLEN skip = UTF8SKIP(s);
de979548 1565 if ((STRLEN)(end - s) <= skip) {
e099ea69 1566 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1567 return (U8 *)end;
e099ea69 1568 GCC_DIAG_RESTORE
de979548 1569 }
65df57a8
TC
1570 s += skip;
1571 }
1572
e099ea69 1573 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1574 return (U8 *)s;
e099ea69 1575 GCC_DIAG_RESTORE
65df57a8
TC
1576}
1577
1578/*
1579=for apidoc utf8_hop_back
1580
1581Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1582backward.
1583
1584C<off> must be non-positive.
1585
1586C<s> must be after or equal to C<start>.
1587
1588When moving backward it will not move before C<start>.
1589
1590Will not exceed this limit even if the string is not valid "UTF-8".
1591
1592=cut
1593*/
1594
1595PERL_STATIC_INLINE U8 *
1596Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1597{
1598 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1599
1600 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1601 * the bitops (especially ~) can create illegal UTF-8.
1602 * In other words: in Perl UTF-8 is not just for Unicode. */
1603
1604 assert(start <= s);
1605 assert(off <= 0);
1606
1607 while (off++ && s > start) {
e7185695 1608 do {
65df57a8 1609 s--;
e7185695 1610 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 1611 }
f6521f7c 1612
e099ea69 1613 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1614 return (U8 *)s;
e099ea69 1615 GCC_DIAG_RESTORE
65df57a8
TC
1616}
1617
1618/*
1619=for apidoc utf8_hop_safe
1620
1621Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1622either forward or backward.
1623
1624When moving backward it will not move before C<start>.
1625
1626When moving forward it will not move beyond C<end>.
1627
1628Will not exceed those limits even if the string is not valid "UTF-8".
1629
1630=cut
1631*/
1632
1633PERL_STATIC_INLINE U8 *
1634Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1635{
1636 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1637
1638 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1639 * the bitops (especially ~) can create illegal UTF-8.
1640 * In other words: in Perl UTF-8 is not just for Unicode. */
1641
1642 assert(start <= s && s <= end);
1643
1644 if (off >= 0) {
1645 return utf8_hop_forward(s, off, end);
1646 }
1647 else {
1648 return utf8_hop_back(s, off, start);
1649 }
1650}
1651
1652/*
4dab108f
KW
1653
1654=for apidoc is_utf8_valid_partial_char
1655
6cbb9248
KW
1656Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1657S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1658points. Otherwise, it returns 1 if there exists at least one non-empty
1659sequence of bytes that when appended to sequence C<s>, starting at position
1660C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1661otherwise returns 0.
1662
1663In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1664point.
1665
1666This is useful when a fixed-length buffer is being tested for being well-formed
1667UTF-8, but the final few bytes in it don't comprise a full character; that is,
1668it is split somewhere in the middle of the final code point's UTF-8
1669representation. (Presumably when the buffer is refreshed with the next chunk
1670of data, the new first bytes will complete the partial code point.) This
1671function is used to verify that the final bytes in the current buffer are in
1672fact the legal beginning of some code point, so that if they aren't, the
1673failure can be signalled without having to wait for the next read.
4dab108f
KW
1674
1675=cut
1676*/
2717076a
KW
1677#define is_utf8_valid_partial_char(s, e) \
1678 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1679
1680/*
1681
1682=for apidoc is_utf8_valid_partial_char_flags
1683
1684Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1685or not the input is a valid UTF-8 encoded partial character, but it takes an
1686extra parameter, C<flags>, which can further restrict which code points are
1687considered valid.
1688
1689If C<flags> is 0, this behaves identically to
1690C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1691of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1692there is any sequence of bytes that can complete the input partial character in
1693such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1694TRUE; otherwise FALSE. Non character code points cannot be determined based on
1695partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1696determined from just the first one or two bytes.
1697
1698=cut
1699 */
1700
56e4cf64 1701PERL_STATIC_INLINE bool
c9182d9c 1702Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1703{
f1c999a7 1704 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1705
f1c999a7 1706 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1707 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1708
8875bd48 1709 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1710 return FALSE;
1711 }
1712
1376b35c 1713 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1714}
1715
8bc127bf
KW
1716/*
1717
1718=for apidoc is_utf8_fixed_width_buf_flags
1719
1720Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1721is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1722otherwise it returns FALSE.
1723
1724If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1725without restriction. If the final few bytes of the buffer do not form a
1726complete code point, this will return TRUE anyway, provided that
1727C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1728
1729If C<flags> in non-zero, it can be any combination of the
1730C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1731same meanings.
1732
1733This function differs from C<L</is_utf8_string_flags>> only in that the latter
1734returns FALSE if the final few bytes of the string don't form a complete code
1735point.
1736
1737=cut
1738 */
1739#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1740 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1741
1742/*
1743
1744=for apidoc is_utf8_fixed_width_buf_loc_flags
1745
1746Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1747failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1748to the beginning of any partial character at the end of the buffer; if there is
1749no partial character C<*ep> will contain C<s>+C<len>.
1750
1751See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1752
1753=cut
1754*/
1755
1756#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1757 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1758
1759/*
1760
1761=for apidoc is_utf8_fixed_width_buf_loclen_flags
1762
1763Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1764complete, valid characters found in the C<el> pointer.
1765
1766=cut
1767*/
1768
1769PERL_STATIC_INLINE bool
c9182d9c 1770Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1771 STRLEN len,
8bc127bf
KW
1772 const U8 **ep,
1773 STRLEN *el,
1774 const U32 flags)
1775{
1776 const U8 * maybe_partial;
1777
1778 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1779
1780 if (! ep) {
1781 ep = &maybe_partial;
1782 }
1783
1784 /* If it's entirely valid, return that; otherwise see if the only error is
1785 * that the final few bytes are for a partial character */
1786 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1787 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1788}
1789
e6a4ffc3 1790PERL_STATIC_INLINE UV
c9182d9c 1791Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1792 STRLEN curlen,
1793 STRLEN *retlen,
1794 const U32 flags,
1795 U32 * errors,
1796 AV ** msgs)
1797{
1798 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1799 * simple cases, and, if necessary calls a helper function to deal with the
1800 * more complex ones. Almost all well-formed non-problematic code points
1801 * are considered simple, so that it's unlikely that the helper function
1802 * will need to be called.
1803 *
1804 * This is an adaptation of the tables and algorithm given in
f6521f7c 1805 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
1806 * comprehensive documentation of the original version. A copyright notice
1807 * for the original version is given at the beginning of this file. The
71525f77 1808 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1809 */
1810
1811 const U8 * const s0 = s;
1812 const U8 * send = s0 + curlen;
f73cfe7d 1813 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1814 UV state = 0;
1815
1816 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1817
1818 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1819 * non-problematic code point, which can be returned immediately.
1820 * Otherwise we call a helper function to figure out the more complicated
1821 * cases. */
1822
1823 while (s < send && LIKELY(state != 1)) {
71525f77 1824 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1825
1826 uv = (state == 0)
1827 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1828 : UTF8_ACCUMULATE(uv, *s);
71525f77 1829 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1830
1831 if (state != 0) {
1832 s++;
1833 continue;
1834 }
1835
1836 if (retlen) {
1837 *retlen = s - s0 + 1;
1838 }
1839 if (errors) {
1840 *errors = 0;
1841 }
1842 if (msgs) {
1843 *msgs = NULL;
1844 }
1845
cfebc7aa 1846 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1847 }
1848
1849 /* Here is potentially problematic. Use the full mechanism */
1850 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1851}
1852
82651abe 1853PERL_STATIC_INLINE UV
9a9a6c98 1854Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1855{
9a9a6c98 1856 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1857
1858 assert(s < send);
1859
1860 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1861
1862 /* EMPTY is not really allowed, and asserts on debugging builds. But
1863 * on non-debugging we have to deal with it, and this causes it to
1864 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1865 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1866 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1867 }
1868 else {
1869 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1870 if (retlen && ret == 0 && *s != '\0') {
1871 *retlen = (STRLEN) -1;
1872 }
1873
1874 return ret;
1875 }
1876}
1877
c8028aa6
TC
1878/* ------------------------------- perl.h ----------------------------- */
1879
1880/*
dcccc8ff
KW
1881=head1 Miscellaneous Functions
1882
44170c9a 1883=for apidoc is_safe_syscall
c8028aa6 1884
1a0efc9a
KW
1885Test that the given C<pv> (with length C<len>) doesn't contain any internal
1886C<NUL> characters.
1887If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1888category, and return FALSE.
c8028aa6
TC
1889
1890Return TRUE if the name is safe.
1891
1a0efc9a
KW
1892C<what> and C<op_name> are used in any warning.
1893
796b6530 1894Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1895
1896=cut
1897*/
1898
1899PERL_STATIC_INLINE bool
ffd62fc2
KW
1900Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1901{
c8028aa6
TC
1902 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1903 * perl itself uses xce*() functions which accept 8-bit strings.
1904 */
1905
1906 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1907
6c4650b3 1908 if (len > 1) {
c8028aa6 1909 char *null_at;
41188aa0 1910 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1911 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1912 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1913 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1914 what, op_name, pv, null_at+1);
c8028aa6
TC
1915 return FALSE;
1916 }
1917 }
1918
1919 return TRUE;
1920}
1921
1922/*
7cb3f959
TC
1923
1924Return true if the supplied filename has a newline character
fa6c7d00 1925immediately before the first (hopefully only) NUL.
7cb3f959
TC
1926
1927My original look at this incorrectly used the len from SvPV(), but
1928that's incorrect, since we allow for a NUL in pv[len-1].
1929
1930So instead, strlen() and work from there.
1931
1932This allow for the user reading a filename, forgetting to chomp it,
1933then calling:
1934
1935 open my $foo, "$file\0";
1936
1937*/
1938
1939#ifdef PERL_CORE
1940
1941PERL_STATIC_INLINE bool
ffd62fc2
KW
1942S_should_warn_nl(const char *pv)
1943{
7cb3f959
TC
1944 STRLEN len;
1945
1946 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1947
1948 len = strlen(pv);
1949
1950 return len > 0 && pv[len-1] == '\n';
1951}
1952
1953#endif
1954
3a019afd
KW
1955#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1956
1957PERL_STATIC_INLINE bool
1958S_lossless_NV_to_IV(const NV nv, IV *ivp)
1959{
1960 /* This function determines if the input NV 'nv' may be converted without
1961 * loss of data to an IV. If not, it returns FALSE taking no other action.
1962 * But if it is possible, it does the conversion, returning TRUE, and
1963 * storing the converted result in '*ivp' */
1964
1965 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1966
1967# if defined(Perl_isnan)
1968
1969 if (UNLIKELY(Perl_isnan(nv))) {
1970 return FALSE;
1971 }
1972
1973# endif
1974
1975 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1976 return FALSE;
1977 }
1978
1979 if ((IV) nv != nv) {
1980 return FALSE;
1981 }
1982
1983 *ivp = (IV) nv;
1984 return TRUE;
1985}
1986
1987#endif
1988
14fe5f8a
KW
1989/* ------------------ regcomp.c, toke.c ------------ */
1990
1991#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
1992
1993/*
1994 - regcurly - a little FSA that accepts {\d+,?\d*}
1995 Pulled from reg.c.
1996 */
6bbb95d4 1997PERL_STATIC_INLINE bool
14fe5f8a
KW
1998S_regcurly(const char *s)
1999{
2000 PERL_ARGS_ASSERT_REGCURLY;
2001
2002 if (*s++ != '{')
2003 return FALSE;
2004 if (!isDIGIT(*s))
2005 return FALSE;
2006 while (isDIGIT(*s))
2007 s++;
2008 if (*s == ',') {
2009 s++;
2010 while (isDIGIT(*s))
2011 s++;
2012 }
2013
2014 return *s == '}';
2015}
2016
2017#endif
2018
81d52ecd
JH
2019/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2020
94b0cb42
KW
2021#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2022
81d52ecd
JH
2023#define MAX_CHARSET_NAME_LENGTH 2
2024
2025PERL_STATIC_INLINE const char *
94b0cb42 2026S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2027{
94b0cb42
KW
2028 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2029
81d52ecd
JH
2030 /* Returns a string that corresponds to the name of the regex character set
2031 * given by 'flags', and *lenp is set the length of that string, which
2032 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2033
2034 *lenp = 1;
2035 switch (get_regex_charset(flags)) {
2036 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2037 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2038 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2039 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2040 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2041 *lenp = 2;
2042 return ASCII_MORE_RESTRICT_PAT_MODS;
2043 }
2044 /* The NOT_REACHED; hides an assert() which has a rather complex
2045 * definition in perl.h. */
2046 NOT_REACHED; /* NOTREACHED */
2047 return "?"; /* Unknown */
2048}
2049
94b0cb42
KW
2050#endif
2051
7cb3f959 2052/*
ed382232
TC
2053
2054Return false if any get magic is on the SV other than taint magic.
2055
2056*/
2057
2058PERL_STATIC_INLINE bool
ffd62fc2
KW
2059Perl_sv_only_taint_gmagic(SV *sv)
2060{
ed382232
TC
2061 MAGIC *mg = SvMAGIC(sv);
2062
2063 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2064
2065 while (mg) {
2066 if (mg->mg_type != PERL_MAGIC_taint
2067 && !(mg->mg_flags & MGf_GSKIP)
2068 && mg->mg_virtual->svt_get) {
2069 return FALSE;
2070 }
2071 mg = mg->mg_moremagic;
2072 }
2073
2074 return TRUE;
2075}
2076
ed8ff0f3
DM
2077/* ------------------ cop.h ------------------------------------------- */
2078
5b6f7443
DM
2079/* implement GIMME_V() macro */
2080
2081PERL_STATIC_INLINE U8
2082Perl_gimme_V(pTHX)
2083{
2084 I32 cxix;
2085 U8 gimme = (PL_op->op_flags & OPf_WANT);
2086
2087 if (gimme)
2088 return gimme;
2089 cxix = PL_curstackinfo->si_cxsubix;
2090 if (cxix < 0)
2091 return G_VOID;
2092 assert(cxstack[cxix].blk_gimme & G_WANT);
2093 return (cxstack[cxix].blk_gimme & G_WANT);
2094}
2095
ed8ff0f3
DM
2096
2097/* Enter a block. Push a new base context and return its address. */
2098
2099PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2100Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2101{
2102 PERL_CONTEXT * cx;
2103
2104 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2105
2106 CXINC;
2107 cx = CX_CUR();
2108 cx->cx_type = type;
2109 cx->blk_gimme = gimme;
2110 cx->blk_oldsaveix = saveix;
4caf7d8c 2111 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2112 cx->blk_oldcop = PL_curcop;
4caf7d8c 2113 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2114 cx->blk_oldscopesp = PL_scopestack_ix;
2115 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2116 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2117
2118 PL_tmps_floor = PL_tmps_ix;
2119 CX_DEBUG(cx, "PUSH");
2120 return cx;
2121}
2122
2123
2124/* Exit a block (RETURN and LAST). */
2125
2126PERL_STATIC_INLINE void
c9182d9c 2127Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2128{
2129 PERL_ARGS_ASSERT_CX_POPBLOCK;
2130
2131 CX_DEBUG(cx, "POP");
2132 /* these 3 are common to cx_popblock and cx_topblock */
2133 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2134 PL_scopestack_ix = cx->blk_oldscopesp;
2135 PL_curpm = cx->blk_oldpm;
2136
2137 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2138 * and leaves a CX entry lying around for repeated use, so
2139 * skip for multicall */ \
2140 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2141 || PL_savestack_ix == cx->blk_oldsaveix);
2142 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2143 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2144}
2145
2146/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2147 * Whereas cx_popblock() restores the state to the point just before
2148 * cx_pushblock() was called, cx_topblock() restores it to the point just
2149 * *after* cx_pushblock() was called. */
2150
2151PERL_STATIC_INLINE void
c9182d9c 2152Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2153{
2154 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2155
2156 CX_DEBUG(cx, "TOP");
2157 /* these 3 are common to cx_popblock and cx_topblock */
2158 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2159 PL_scopestack_ix = cx->blk_oldscopesp;
2160 PL_curpm = cx->blk_oldpm;
2161
2162 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2163}
2164
2165
a73d8813 2166PERL_STATIC_INLINE void
c9182d9c 2167Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2168{
2169 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2170
2171 PERL_ARGS_ASSERT_CX_PUSHSUB;
2172
3f6bd23a 2173 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2174 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2175 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2176 cx->blk_sub.cv = cv;
2177 cx->blk_sub.olddepth = CvDEPTH(cv);
2178 cx->blk_sub.prevcomppad = PL_comppad;
2179 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2180 cx->blk_sub.retop = retop;
2181 SvREFCNT_inc_simple_void_NN(cv);
2182 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2183}
2184
2185
2186/* subsets of cx_popsub() */
2187
2188PERL_STATIC_INLINE void
c9182d9c 2189Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2190{
2191 CV *cv;
2192
2193 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2194 assert(CxTYPE(cx) == CXt_SUB);
2195
2196 PL_comppad = cx->blk_sub.prevcomppad;
2197 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2198 cv = cx->blk_sub.cv;
2199 CvDEPTH(cv) = cx->blk_sub.olddepth;
2200 cx->blk_sub.cv = NULL;
2201 SvREFCNT_dec(cv);
5b6f7443 2202 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2203}
2204
2205
2206/* handle the @_ part of leaving a sub */
2207
2208PERL_STATIC_INLINE void
c9182d9c 2209Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2210{
2211 AV *av;
2212
2213 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2214 assert(CxTYPE(cx) == CXt_SUB);
2215 assert(AvARRAY(MUTABLE_AV(
2216 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2217 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2218
2219 CX_POP_SAVEARRAY(cx);
2220 av = MUTABLE_AV(PAD_SVl(0));
2221 if (UNLIKELY(AvREAL(av)))
2222 /* abandon @_ if it got reified */
2223 clear_defarray(av, 0);
2224 else {
2225 CLEAR_ARGARRAY(av);
2226 }
2227}
2228
2229
2230PERL_STATIC_INLINE void
c9182d9c 2231Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2232{
2233 PERL_ARGS_ASSERT_CX_POPSUB;
2234 assert(CxTYPE(cx) == CXt_SUB);
2235
3f6bd23a 2236 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2237
2238 if (CxHASARGS(cx))
2239 cx_popsub_args(cx);
2240 cx_popsub_common(cx);
2241}
2242
2243
6a7d52cc 2244PERL_STATIC_INLINE void
c9182d9c 2245Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2246{
2247 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2248
5b6f7443
DM
2249 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2250 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2251 cx->blk_format.cv = cv;
2252 cx->blk_format.retop = retop;
2253 cx->blk_format.gv = gv;
2254 cx->blk_format.dfoutgv = PL_defoutgv;
2255 cx->blk_format.prevcomppad = PL_comppad;
2256 cx->blk_u16 = 0;
2257
2258 SvREFCNT_inc_simple_void_NN(cv);
2259 CvDEPTH(cv)++;
2260 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2261}
2262
2263
2264PERL_STATIC_INLINE void
c9182d9c 2265Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2266{
2267 CV *cv;
2268 GV *dfout;
2269
2270 PERL_ARGS_ASSERT_CX_POPFORMAT;
2271 assert(CxTYPE(cx) == CXt_FORMAT);
2272
2273 dfout = cx->blk_format.dfoutgv;
2274 setdefout(dfout);
2275 cx->blk_format.dfoutgv = NULL;
2276 SvREFCNT_dec_NN(dfout);
2277
2278 PL_comppad = cx->blk_format.prevcomppad;
2279 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2280 cv = cx->blk_format.cv;
2281 cx->blk_format.cv = NULL;
2282 --CvDEPTH(cv);
2283 SvREFCNT_dec_NN(cv);
5b6f7443 2284 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2285}
2286
2287
13febba5 2288PERL_STATIC_INLINE void
c9182d9c 2289Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2290{
2291 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2292
5b6f7443
DM
2293 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2294 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
13febba5
DM
2295 cx->blk_eval.retop = retop;
2296 cx->blk_eval.old_namesv = namesv;
2297 cx->blk_eval.old_eval_root = PL_eval_root;
2298 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2299 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2300 cx->blk_eval.cur_top_env = PL_top_env;
2301
4c57ced5 2302 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2303 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2304 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2305}
2306
2307
2308PERL_STATIC_INLINE void
c9182d9c 2309Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2310{
2311 SV *sv;
2312
2313 PERL_ARGS_ASSERT_CX_POPEVAL;
2314 assert(CxTYPE(cx) == CXt_EVAL);
2315
2316 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2317 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2318 PL_eval_root = cx->blk_eval.old_eval_root;
2319 sv = cx->blk_eval.cur_text;
4c57ced5 2320 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2321 cx->blk_eval.cur_text = NULL;
2322 SvREFCNT_dec_NN(sv);
2323 }
2324
2325 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2326 if (sv) {
2327 cx->blk_eval.old_namesv = NULL;
2328 SvREFCNT_dec_NN(sv);
2329 }
5b6f7443 2330 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2331}
6a7d52cc 2332
a73d8813 2333
d1b6bf72
DM
2334/* push a plain loop, i.e.
2335 * { block }
2336 * while (cond) { block }
2337 * for (init;cond;continue) { block }
2338 * This loop can be last/redo'ed etc.
2339 */
2340
2341PERL_STATIC_INLINE void
c9182d9c 2342Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2343{
2344 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2345 cx->blk_loop.my_op = cLOOP;
2346}
2347
2348
2349/* push a true for loop, i.e.
2350 * for var (list) { block }
2351 */
2352
2353PERL_STATIC_INLINE void
c9182d9c 2354Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2355{
2356 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2357
2358 /* this one line is common with cx_pushloop_plain */
2359 cx->blk_loop.my_op = cLOOP;
2360
2361 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2362 cx->blk_loop.itersave = itersave;
2363#ifdef USE_ITHREADS
2364 cx->blk_loop.oldcomppad = PL_comppad;
2365#endif
2366}
2367
2368
2369/* pop all loop types, including plain */
2370
2371PERL_STATIC_INLINE void
c9182d9c 2372Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2373{
2374 PERL_ARGS_ASSERT_CX_POPLOOP;
2375
2376 assert(CxTYPE_is_LOOP(cx));
2377 if ( CxTYPE(cx) == CXt_LOOP_ARY
2378 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2379 {
2380 /* Free ary or cur. This assumes that state_u.ary.ary
2381 * aligns with state_u.lazysv.cur. See cx_dup() */
2382 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2383 cx->blk_loop.state_u.lazysv.cur = NULL;
2384 SvREFCNT_dec_NN(sv);
2385 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2386 sv = cx->blk_loop.state_u.lazysv.end;
2387 cx->blk_loop.state_u.lazysv.end = NULL;
2388 SvREFCNT_dec_NN(sv);
2389 }
2390 }
2391 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2392 SV *cursv;
2393 SV **svp = (cx)->blk_loop.itervar_u.svp;
2394 if ((cx->cx_type & CXp_FOR_GV))
2395 svp = &GvSV((GV*)svp);
2396 cursv = *svp;
2397 *svp = cx->blk_loop.itersave;
2398 cx->blk_loop.itersave = NULL;
2399 SvREFCNT_dec(cursv);
2400 }
2401}
2402
2a7b7c61
DM
2403
2404PERL_STATIC_INLINE void
c9182d9c 2405Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2406{
7896dde7 2407 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2408
7896dde7 2409 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2410}
2411
2412
2413PERL_STATIC_INLINE void
c9182d9c 2414Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2415{
7896dde7
Z
2416 PERL_ARGS_ASSERT_CX_POPWHEN;
2417 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2418
2419 PERL_UNUSED_ARG(cx);
59a14f30 2420 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2421 /* currently NOOP */
2422}
2423
2424
7896dde7 2425PERL_STATIC_INLINE void
c9182d9c 2426Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2427{
2428 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2429
2430 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2431 cx->blk_givwhen.defsv_save = orig_defsv;
2432}
2433
2434
2435PERL_STATIC_INLINE void
c9182d9c 2436Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2437{
2438 SV *sv;
2439
2440 PERL_ARGS_ASSERT_CX_POPGIVEN;
2441 assert(CxTYPE(cx) == CXt_GIVEN);
2442
2443 sv = GvSV(PL_defgv);
2444 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2445 cx->blk_givwhen.defsv_save = NULL;
2446 SvREFCNT_dec(sv);
2447}
2448
ec2c235b
KW
2449/* ------------------ util.h ------------------------------------------- */
2450
2451/*
2452=head1 Miscellaneous Functions
2453
2454=for apidoc foldEQ
2455
2456Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2457same
2458case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2459match themselves and their opposite case counterparts. Non-cased and non-ASCII
2460range bytes match only themselves.
2461
2462=cut
2463*/
2464
2465PERL_STATIC_INLINE I32
2466Perl_foldEQ(const char *s1, const char *s2, I32 len)
2467{
2468 const U8 *a = (const U8 *)s1;
2469 const U8 *b = (const U8 *)s2;
2470
2471 PERL_ARGS_ASSERT_FOLDEQ;
2472
2473 assert(len >= 0);
2474
2475 while (len--) {
2476 if (*a != *b && *a != PL_fold[*b])
2477 return 0;
2478 a++,b++;
2479 }
2480 return 1;
2481}
2482
0f9cb40c 2483PERL_STATIC_INLINE I32
ec2c235b
KW
2484Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2485{
79a1fabd
KW
2486 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2487 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2488 * does not check for this. Nor does it check that the strings each have
2489 * at least 'len' characters. */
ec2c235b
KW
2490
2491 const U8 *a = (const U8 *)s1;
2492 const U8 *b = (const U8 *)s2;
2493
2494 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2495
2496 assert(len >= 0);
2497
2498 while (len--) {
2499 if (*a != *b && *a != PL_fold_latin1[*b]) {
2500 return 0;
2501 }
2502 a++, b++;
2503 }
2504 return 1;
2505}
2506
2507/*
2508=for apidoc foldEQ_locale
2509
2510Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2511same case-insensitively in the current locale; false otherwise.
2512
2513=cut
2514*/
2515
0f9cb40c 2516PERL_STATIC_INLINE I32
ec2c235b
KW
2517Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2518{
2519 dVAR;
2520 const U8 *a = (const U8 *)s1;
2521 const U8 *b = (const U8 *)s2;
2522
2523 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2524
2525 assert(len >= 0);
2526
2527 while (len--) {
2528 if (*a != *b && *a != PL_fold_locale[*b])
2529 return 0;
2530 a++,b++;
2531 }
2532 return 1;
2533}
2534
1ab100a8
KW
2535/*
2536=for apidoc my_strnlen
2537
2538The C library C<strnlen> if available, or a Perl implementation of it.
2539
2540C<my_strnlen()> computes the length of the string, up to C<maxlen>
2541characters. It will will never attempt to address more than C<maxlen>
2542characters, making it suitable for use with strings that are not
2543guaranteed to be NUL-terminated.
2544
2545=cut
2546
2547Description stolen from http://man.openbsd.org/strnlen.3,
2548implementation stolen from PostgreSQL.
2549*/
2550#ifndef HAS_STRNLEN
2551
2552PERL_STATIC_INLINE Size_t
2553Perl_my_strnlen(const char *str, Size_t maxlen)
2554{
2555 const char *end = (char *) memchr(str, '\0', maxlen);
2556
2557 PERL_ARGS_ASSERT_MY_STRNLEN;
2558
2559 if (end == NULL) return maxlen;
2560 return end - str;
2561}
2562
2563#endif
2564
6dba01e2
KW
2565#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2566
2567PERL_STATIC_INLINE void *
2568S_my_memrchr(const char * s, const char c, const STRLEN len)
2569{
2570 /* memrchr(), since many platforms lack it */
2571
2572 const char * t = s + len - 1;
2573
2574 PERL_ARGS_ASSERT_MY_MEMRCHR;
2575
2576 while (t >= s) {
2577 if (*t == c) {
2578 return (void *) t;
2579 }
2580 t--;
2581 }
2582
2583 return NULL;
2584}
2585
2586#endif
2587
ed382232 2588/*
c8028aa6
TC
2589 * ex: set ts=8 sts=4 sw=4 et:
2590 */