This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Initial experiment at moving SvTRUE into a static inline macro
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
c70927a6 42PERL_STATIC_INLINE SSize_t
c9182d9c 43Perl_av_top_index(pTHX_ AV *av)
be3a7a5d
KW
44{
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
47
48 return AvFILL(av);
49}
50
1afe1db1
FC
51/* ------------------------------- cv.h ------------------------------- */
52
ae77754a 53PERL_STATIC_INLINE GV *
c9182d9c 54Perl_CvGV(pTHX_ CV *sv)
ae77754a 55{
74804ad1
KW
56 PERL_ARGS_ASSERT_CVGV;
57
ae77754a
FC
58 return CvNAMED(sv)
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
61}
62
1afe1db1 63PERL_STATIC_INLINE I32 *
74804ad1 64Perl_CvDEPTH(const CV * const sv)
1afe1db1 65{
74804ad1 66 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 68
8de47657 69 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
70}
71
d16269d8
PM
72/*
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
81 */
82
5b67adb8 83#ifdef PERL_CORE
d16269d8
PM
84PERL_STATIC_INLINE char *
85S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
86{
87 SV * tmpsv;
88 char * tmps;
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
90 tmps = SvPVX(tmpsv);
91 while ((*len)--) {
92 if (!isSPACE(*orig))
93 *tmps++ = *orig;
94 orig++;
95 }
96 *tmps = '\0';
97 *len = tmps - SvPVX(tmpsv);
98 return SvPVX(tmpsv);
99}
5b67adb8 100#endif
d16269d8 101
25fdce4a
FC
102/* ------------------------------- mg.h ------------------------------- */
103
104#if defined(PERL_CORE) || defined(PERL_EXT)
105/* assumes get-magic and stringification have already occurred */
106PERL_STATIC_INLINE STRLEN
107S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
108{
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
113 else {
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
118 }
119}
120#endif
121
03414f05
FC
122/* ------------------------------- pad.h ------------------------------ */
123
124#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125PERL_STATIC_INLINE bool
b9d5702c 126S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 127{
b9d5702c
KW
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
129
03414f05
FC
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
135
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
138 if (
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
142 )
143 return TRUE;
144 }
145 else if (
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
147 ?
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
150
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
153 )
154 return TRUE;
155 return FALSE;
156}
157#endif
158
33a4312b
FC
159/* ------------------------------- pp.h ------------------------------- */
160
161PERL_STATIC_INLINE I32
c9182d9c 162Perl_TOPMARK(pTHX)
33a4312b
FC
163{
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 165 "MARK top %p %" IVdf "\n",
33a4312b
FC
166 PL_markstack_ptr,
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
169}
170
171PERL_STATIC_INLINE I32
c9182d9c 172Perl_POPMARK(pTHX)
33a4312b
FC
173{
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 175 "MARK pop %p %" IVdf "\n",
33a4312b
FC
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
180}
181
8d919b0a
FC
182/* ----------------------------- regexp.h ----------------------------- */
183
184PERL_STATIC_INLINE struct regexp *
c9182d9c 185Perl_ReANY(const REGEXP * const re)
8d919b0a 186{
df6b4bd5 187 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
188
189 PERL_ARGS_ASSERT_REANY;
8d919b0a 190 assert(isREGEXP(re));
bdef45de 191
df6b4bd5
DM
192 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
193 : (struct regexp *)p;
8d919b0a
FC
194}
195
27669aa4
FC
196/* ------------------------------- sv.h ------------------------------- */
197
a887b094
PLE
198PERL_STATIC_INLINE bool
199Perl_SvTRUE(SV *sv) {
200 return LIKELY(sv) && SvTRUE_NN(sv);
201}
202
27669aa4 203PERL_STATIC_INLINE SV *
c9182d9c 204Perl_SvREFCNT_inc(SV *sv)
27669aa4 205{
2439e033 206 if (LIKELY(sv != NULL))
27669aa4
FC
207 SvREFCNT(sv)++;
208 return sv;
209}
210PERL_STATIC_INLINE SV *
c9182d9c 211Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 212{
3f2f854a
KW
213 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
214
27669aa4
FC
215 SvREFCNT(sv)++;
216 return sv;
217}
218PERL_STATIC_INLINE void
c9182d9c 219Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 220{
2439e033 221 if (LIKELY(sv != NULL))
27669aa4
FC
222 SvREFCNT(sv)++;
223}
75e16a44 224PERL_STATIC_INLINE void
c9182d9c 225Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 226{
2439e033 227 if (LIKELY(sv != NULL)) {
75a9bf96 228 U32 rc = SvREFCNT(sv);
79e2a32a 229 if (LIKELY(rc > 1))
75a9bf96
DM
230 SvREFCNT(sv) = rc - 1;
231 else
232 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
233 }
234}
541377b1
FC
235
236PERL_STATIC_INLINE void
c9182d9c 237Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
238{
239 U32 rc = SvREFCNT(sv);
3f2f854a
KW
240
241 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
242
79e2a32a 243 if (LIKELY(rc > 1))
4a9a56a7
DM
244 SvREFCNT(sv) = rc - 1;
245 else
246 Perl_sv_free2(aTHX_ sv, rc);
247}
248
249PERL_STATIC_INLINE void
1bd041dc 250Perl_SvAMAGIC_on(SV *sv)
541377b1 251{
1bd041dc 252 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 253 assert(SvROK(sv));
1bd041dc 254
541377b1
FC
255 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
256}
257PERL_STATIC_INLINE void
1bd041dc 258Perl_SvAMAGIC_off(SV *sv)
541377b1 259{
1bd041dc
KW
260 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
261
541377b1
FC
262 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
263 HvAMAGIC_off(SvSTASH(SvRV(sv)));
264}
265
266PERL_STATIC_INLINE U32
c9182d9c 267Perl_SvPADSTALE_on(SV *sv)
541377b1 268{
c0683843 269 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
270 return SvFLAGS(sv) |= SVs_PADSTALE;
271}
272PERL_STATIC_INLINE U32
c9182d9c 273Perl_SvPADSTALE_off(SV *sv)
541377b1 274{
c0683843 275 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
276 return SvFLAGS(sv) &= ~SVs_PADSTALE;
277}
25fdce4a 278#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 279PERL_STATIC_INLINE STRLEN
6964422a 280S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 281{
25fdce4a 282 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
283 if (SvGAMAGIC(sv)) {
284 U8 *hopped = utf8_hop((U8 *)pv, pos);
285 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
286 return (STRLEN)(hopped - (U8 *)pv);
287 }
288 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
289}
290#endif
f019c49e 291
d1decf2b
TC
292/* ------------------------------- handy.h ------------------------------- */
293
294/* saves machine code for a common noreturn idiom typically used in Newx*() */
7347ee54 295GCC_DIAG_IGNORE_DECL(-Wunused-function);
d1decf2b 296static void
c9182d9c 297Perl_croak_memory_wrap(void)
d1decf2b
TC
298{
299 Perl_croak_nocontext("%s",PL_memory_wrap);
300}
7347ee54 301GCC_DIAG_RESTORE_DECL;
d1decf2b 302
a8a2ceaa
KW
303/* ------------------------------- utf8.h ------------------------------- */
304
2fe720e2
KW
305/*
306=head1 Unicode Support
307*/
308
55d09dc8 309PERL_STATIC_INLINE void
c9182d9c 310Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
311{
312 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
313 * encoded string at '*dest', updating '*dest' to include it */
314
55d09dc8
KW
315 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
316
6f2d5cbc 317 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 318 *((*dest)++) = byte;
55d09dc8 319 else {
a09ec51a
KW
320 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
321 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
322 }
323}
324
e123187a 325/*
2fe720e2 326=for apidoc valid_utf8_to_uvchr
09232555
KW
327Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
328known that the next character in the input UTF-8 string C<s> is well-formed
329(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
330points, and non-Unicode code points are allowed.
2fe720e2
KW
331
332=cut
333
334 */
335
336PERL_STATIC_INLINE UV
337Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
338{
c41b2540 339 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
340 const U8* send = s + expectlen;
341 UV uv = *s;
342
343 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
344
345 if (retlen) {
346 *retlen = expectlen;
347 }
348
349 /* An invariant is trivially returned */
350 if (expectlen == 1) {
351 return uv;
352 }
353
354 /* Remove the leading bits that indicate the number of bytes, leaving just
355 * the bits that are part of the value */
356 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
357
358 /* Now, loop through the remaining bytes, accumulating each into the
359 * working total as we go. (I khw tried unrolling the loop for up to 4
360 * bytes, but there was no performance improvement) */
361 for (++s; s < send; s++) {
362 uv = UTF8_ACCUMULATE(uv, *s);
363 }
364
365 return UNI_TO_NATIVE(uv);
366
367}
368
1e599354
KW
369/*
370=for apidoc is_utf8_invariant_string
371
82c5d941 372Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 373regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
374EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
375are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
376the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
377characters are invariant, but so also are the C1 controls.
1e599354
KW
378
379If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
380use this option, that C<s> can't have embedded C<NUL> characters and has to
381have a terminating C<NUL> byte).
382
9f2abfde
KW
383See also
384C<L</is_utf8_string>>,
385C<L</is_utf8_string_flags>>,
386C<L</is_utf8_string_loc>>,
387C<L</is_utf8_string_loc_flags>>,
388C<L</is_utf8_string_loclen>>,
389C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
390C<L</is_utf8_fixed_width_buf_flags>>,
391C<L</is_utf8_fixed_width_buf_loc_flags>>,
392C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
393C<L</is_strict_utf8_string>>,
394C<L</is_strict_utf8_string_loc>>,
395C<L</is_strict_utf8_string_loclen>>,
396C<L</is_c9strict_utf8_string>>,
397C<L</is_c9strict_utf8_string_loc>>,
398and
399C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
400
401=cut
0cbf5865
KW
402
403*/
404
405#define is_utf8_invariant_string(s, len) \
406 is_utf8_invariant_string_loc(s, len, NULL)
407
408/*
409=for apidoc is_utf8_invariant_string_loc
410
411Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
412the first UTF-8 variant character in the C<ep> pointer; if all characters are
413UTF-8 invariant, this function does not change the contents of C<*ep>.
414
415=cut
416
1e599354
KW
417*/
418
419PERL_STATIC_INLINE bool
c9182d9c 420Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 421{
e17544a6 422 const U8* send;
1e599354
KW
423 const U8* x = s;
424
0cbf5865
KW
425 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
426
e17544a6
KW
427 if (len == 0) {
428 len = strlen((const char *)s);
429 }
430
431 send = s + len;
432
4ab2fd9b 433/* This looks like 0x010101... */
2c5c8af5 434# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
435
436/* This looks like 0x808080... */
2c5c8af5 437# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 438# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 439# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 440
099e59a4
KW
441/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
442 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
443 * optimized out completely on a 32-bit system, and its mask gets optimized out
444 * on a 64-bit system */
2c5c8af5 445# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
446 | ( PTR2nat(x) >> 1) \
447 | ( ( (PTR2nat(x) \
448 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 449
3f515a2e
KW
450#ifndef EBCDIC
451
099e59a4
KW
452 /* Do the word-at-a-time iff there is at least one usable full word. That
453 * means that after advancing to a word boundary, there still is at least a
454 * full word left. The number of bytes needed to advance is 'wordsize -
455 * offset' unless offset is 0. */
456 if ((STRLEN) (send - x) >= PERL_WORDSIZE
457
458 /* This term is wordsize if subword; 0 if not */
459 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
460
461 /* 'offset' */
462 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
463 {
b40579ff 464
46bb68f6
KW
465 /* Process per-byte until reach word boundary. XXX This loop could be
466 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 467 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
468 if (! UTF8_IS_INVARIANT(*x)) {
469 if (ep) {
470 *ep = x;
471 }
e17544a6 472
46bb68f6
KW
473 return FALSE;
474 }
475 x++;
e17544a6 476 }
e17544a6 477
099e59a4
KW
478 /* Here, we know we have at least one full word to process. Process
479 * per-word as long as we have at least a full word left */
480 do {
4ab2fd9b 481 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 482
46bb68f6
KW
483 /* Found a variant. Just return if caller doesn't want its
484 * exact position */
485 if (! ep) {
486 return FALSE;
487 }
e17544a6 488
2c5c8af5
KW
489# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
490 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 491
73f0a2eb 492 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
493 assert(*ep >= s && *ep < send);
494
495 return FALSE;
496
2c5c8af5 497# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
498 checks. */
499
46bb68f6 500 break;
2c5c8af5 501# endif
46bb68f6 502 }
1d2af574 503
46bb68f6 504 x += PERL_WORDSIZE;
1d2af574 505
099e59a4 506 } while (x + PERL_WORDSIZE <= send);
b40579ff 507 }
e17544a6 508
0b08cab0 509#endif /* End of ! EBCDIC */
e17544a6
KW
510
511 /* Process per-byte */
512 while (x < send) {
513 if (! UTF8_IS_INVARIANT(*x)) {
514 if (ep) {
515 *ep = x;
516 }
0cbf5865 517
e17544a6 518 return FALSE;
0cbf5865 519 }
1e599354 520
e17544a6 521 x++;
1e599354
KW
522 }
523
524 return TRUE;
525}
526
23a7ee81
KW
527#ifndef EBCDIC
528
1d2af574 529PERL_STATIC_INLINE unsigned int
73f0a2eb 530Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
531{
532
533 /* This returns the position in a word (0..7) of the first variant byte in
534 * it. This is a helper function. Note that there are no branches */
535
536 assert(word);
537
538 /* Get just the msb bits of each byte */
539 word &= PERL_VARIANTS_WORD_MASK;
540
7adf2470 541# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
542
543 /* Bytes are stored like
544 * Byte8 ... Byte2 Byte1
545 * 63..56...15...8 7...0
546 *
547 * Isolate the lsb;
548 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
549 *
550 * The word will look this this, with a rightmost set bit in position 's':
551 * ('x's are don't cares)
552 * s
553 * x..x100..0
554 * x..xx10..0 Right shift (rightmost 0 is shifted off)
555 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
556 * the 1 just to their left into a 0; the remainder is
557 * untouched
4fa92663
KW
558 * 0..0011..1 The xor with the original, x..xx10..0, clears that
559 * remainder, sets the bottom to all 1
1d2af574
KW
560 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
561 *
562 * Another method is to do 'word &= -word'; but it generates a compiler
563 * message on some platforms about taking the negative of an unsigned */
564
565 word >>= 1;
566 word = 1 + (word ^ (word - 1));
567
568# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
569
570 /* Bytes are stored like
571 * Byte1 Byte2 ... Byte8
572 * 63..56 55..47 ... 7...0
573 *
574 * Isolate the msb; http://codeforces.com/blog/entry/10330
575 *
576 * Only the most significant set bit matters. Or'ing word with its right
577 * shift of 1 makes that bit and the next one to its right both 1. Then
578 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
579 * msb and all to the right being 1. */
580 word |= word >> 1;
581 word |= word >> 2;
582 word |= word >> 4;
583 word |= word >> 8;
584 word |= word >> 16;
585 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
586
587 /* Then subtracting the right shift by 1 clears all but the left-most of
588 * the 1 bits, which is our desired result */
589 word -= (word >> 1);
590
591# else
592# error Unexpected byte order
593# endif
594
7cf2d6c7
KW
595 /* Here 'word' has a single bit set: the msb of the first byte in which it
596 * is set. Calculate that position in the word. We can use this
1d2af574 597 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
598 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
599 * just get shifted off at compile time) */
600 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
601 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
602 | (39 << 24) | (47 << 16)
603 | (55 << 8) | (63 << 0));
1d2af574
KW
604 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
605
606 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
607 word = ((word + 1) >> 3) - 1;
608
609# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
610
611 /* And invert the result */
612 word = CHARBITS - word - 1;
613
614# endif
615
616 return (unsigned int) word;
617}
618
23a7ee81 619#endif
03c1e4ab
KW
620#if defined(PERL_CORE) || defined(PERL_EXT)
621
622/*
623=for apidoc variant_under_utf8_count
624
625This function looks at the sequence of bytes between C<s> and C<e>, which are
626assumed to be encoded in ASCII/Latin1, and returns how many of them would
627change should the string be translated into UTF-8. Due to the nature of UTF-8,
628each of these would occupy two bytes instead of the single one in the input
629string. Thus, this function returns the precise number of bytes the string
630would expand by when translated to UTF-8.
631
632Unlike most of the other functions that have C<utf8> in their name, the input
633to this function is NOT a UTF-8-encoded string. The function name is slightly
634I<odd> to emphasize this.
635
636This function is internal to Perl because khw thinks that any XS code that
637would want this is probably operating too close to the internals. Presenting a
638valid use case could change that.
639
640See also
641C<L<perlapi/is_utf8_invariant_string>>
642and
643C<L<perlapi/is_utf8_invariant_string_loc>>,
644
645=cut
646
647*/
648
649PERL_STATIC_INLINE Size_t
650S_variant_under_utf8_count(const U8* const s, const U8* const e)
651{
652 const U8* x = s;
653 Size_t count = 0;
654
655 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
656
657# ifndef EBCDIC
658
5d0379de
KW
659 /* Test if the string is long enough to use word-at-a-time. (Logic is the
660 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
661 if ((STRLEN) (e - x) >= PERL_WORDSIZE
662 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
663 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
664 {
665
666 /* Process per-byte until reach word boundary. XXX This loop could be
667 * eliminated if we knew that this platform had fast unaligned reads */
668 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
669 count += ! UTF8_IS_INVARIANT(*x++);
670 }
671
672 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
673 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
674 explanation of how this works */
e5863284
KW
675 PERL_UINTMAX_T increment
676 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
677 * PERL_COUNT_MULTIPLIER)
678 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 679 count += (Size_t) increment;
03c1e4ab
KW
680 x += PERL_WORDSIZE;
681 } while (x + PERL_WORDSIZE <= e);
682 }
683
684# endif
685
686 /* Process per-byte */
687 while (x < e) {
688 if (! UTF8_IS_INVARIANT(*x)) {
689 count++;
690 }
691
692 x++;
693 }
694
695 return count;
696}
697
698#endif
699
aff4cafe
KW
700#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
701# undef PERL_WORDSIZE
702# undef PERL_COUNT_MULTIPLIER
703# undef PERL_WORD_BOUNDARY_MASK
704# undef PERL_VARIANTS_WORD_MASK
705#endif
03c1e4ab 706
7c93d8f0 707/*
5ff889fb
KW
708=for apidoc is_utf8_string
709
82c5d941
KW
710Returns TRUE if the first C<len> bytes of string C<s> form a valid
711Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
712be calculated using C<strlen(s)> (which means if you use this option, that C<s>
713can't have embedded C<NUL> characters and has to have a terminating C<NUL>
714byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
715
2717076a
KW
716This function considers Perl's extended UTF-8 to be valid. That means that
717code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
718considered valid by this function. Use C<L</is_strict_utf8_string>>,
719C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
720code points are considered valid.
5ff889fb 721
9f2abfde
KW
722See also
723C<L</is_utf8_invariant_string>>,
0cbf5865 724C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
725C<L</is_utf8_string_loc>>,
726C<L</is_utf8_string_loclen>>,
8bc127bf
KW
727C<L</is_utf8_fixed_width_buf_flags>>,
728C<L</is_utf8_fixed_width_buf_loc_flags>>,
729C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
730
731=cut
732*/
733
dd237e82 734#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 735
c9cd936b
KW
736#if defined(PERL_CORE) || defined (PERL_EXT)
737
738/*
739=for apidoc is_utf8_non_invariant_string
740
741Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
742C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
743UTF-8; otherwise returns FALSE.
744
745A TRUE return means that at least one code point represented by the sequence
746either is a wide character not representable as a single byte, or the
747representation differs depending on whether the sequence is encoded in UTF-8 or
748not.
749
750See also
751C<L<perlapi/is_utf8_invariant_string>>,
752C<L<perlapi/is_utf8_string>>
753
754=cut
755
756This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
757It generally needn't be if its string is entirely UTF-8 invariant, and it
758shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
759
760It is an internal function because khw thinks that XS code shouldn't be working
761at this low a level. A valid use case could change that.
762
763*/
764
765PERL_STATIC_INLINE bool
86a87e17 766Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
767{
768 const U8 * first_variant;
769
770 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
771
772 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
773 return FALSE;
774 }
775
776 return is_utf8_string(first_variant, len - (first_variant - s));
777}
778
779#endif
780
5ff889fb 781/*
9f2abfde
KW
782=for apidoc is_strict_utf8_string
783
784Returns TRUE if the first C<len> bytes of string C<s> form a valid
785UTF-8-encoded string that is fully interchangeable by any application using
786Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
787calculated using C<strlen(s)> (which means if you use this option, that C<s>
788can't have embedded C<NUL> characters and has to have a terminating C<NUL>
789byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
790
791This function returns FALSE for strings containing any
792code points above the Unicode max of 0x10FFFF, surrogate code points, or
793non-character code points.
794
795See also
796C<L</is_utf8_invariant_string>>,
0cbf5865 797C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
798C<L</is_utf8_string>>,
799C<L</is_utf8_string_flags>>,
800C<L</is_utf8_string_loc>>,
801C<L</is_utf8_string_loc_flags>>,
802C<L</is_utf8_string_loclen>>,
803C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
804C<L</is_utf8_fixed_width_buf_flags>>,
805C<L</is_utf8_fixed_width_buf_loc_flags>>,
806C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
807C<L</is_strict_utf8_string_loc>>,
808C<L</is_strict_utf8_string_loclen>>,
809C<L</is_c9strict_utf8_string>>,
810C<L</is_c9strict_utf8_string_loc>>,
811and
812C<L</is_c9strict_utf8_string_loclen>>.
813
814=cut
815*/
816
dd237e82 817#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
818
819/*
820=for apidoc is_c9strict_utf8_string
821
822Returns TRUE if the first C<len> bytes of string C<s> form a valid
823UTF-8-encoded string that conforms to
824L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
825otherwise it returns FALSE. If C<len> is 0, it will be calculated using
826C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
827C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
828characters being ASCII constitute 'a valid UTF-8 string'.
829
830This function returns FALSE for strings containing any code points above the
831Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
832code points per
833L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
834
835See also
836C<L</is_utf8_invariant_string>>,
0cbf5865 837C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
838C<L</is_utf8_string>>,
839C<L</is_utf8_string_flags>>,
840C<L</is_utf8_string_loc>>,
841C<L</is_utf8_string_loc_flags>>,
842C<L</is_utf8_string_loclen>>,
843C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
844C<L</is_utf8_fixed_width_buf_flags>>,
845C<L</is_utf8_fixed_width_buf_loc_flags>>,
846C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
847C<L</is_strict_utf8_string>>,
848C<L</is_strict_utf8_string_loc>>,
849C<L</is_strict_utf8_string_loclen>>,
850C<L</is_c9strict_utf8_string_loc>>,
851and
852C<L</is_c9strict_utf8_string_loclen>>.
853
854=cut
855*/
856
dd237e82 857#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
858
859/*
860=for apidoc is_utf8_string_flags
861
862Returns TRUE if the first C<len> bytes of string C<s> form a valid
863UTF-8 string, subject to the restrictions imposed by C<flags>;
864returns FALSE otherwise. If C<len> is 0, it will be calculated
865using C<strlen(s)> (which means if you use this option, that C<s> can't have
866embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
867that all characters being ASCII constitute 'a valid UTF-8 string'.
868
869If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
870C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
871as C<L</is_strict_utf8_string>>; and if C<flags> is
872C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
873C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
874combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
875C<L</utf8n_to_uvchr>>, with the same meanings.
876
877See also
878C<L</is_utf8_invariant_string>>,
0cbf5865 879C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
880C<L</is_utf8_string>>,
881C<L</is_utf8_string_loc>>,
882C<L</is_utf8_string_loc_flags>>,
883C<L</is_utf8_string_loclen>>,
884C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
885C<L</is_utf8_fixed_width_buf_flags>>,
886C<L</is_utf8_fixed_width_buf_loc_flags>>,
887C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
888C<L</is_strict_utf8_string>>,
889C<L</is_strict_utf8_string_loc>>,
890C<L</is_strict_utf8_string_loclen>>,
891C<L</is_c9strict_utf8_string>>,
892C<L</is_c9strict_utf8_string_loc>>,
893and
894C<L</is_c9strict_utf8_string_loclen>>.
895
896=cut
897*/
898
899PERL_STATIC_INLINE bool
c9182d9c 900Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 901{
33756530 902 const U8 * first_variant;
9f2abfde
KW
903
904 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
905 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 906 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 907
f60f61fd
KW
908 if (len == 0) {
909 len = strlen((const char *)s);
910 }
911
9f2abfde
KW
912 if (flags == 0) {
913 return is_utf8_string(s, len);
914 }
915
d044b7a7 916 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
917 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
918 {
919 return is_strict_utf8_string(s, len);
920 }
921
d044b7a7 922 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
923 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
924 {
925 return is_c9strict_utf8_string(s, len);
926 }
927
33756530
KW
928 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
929 const U8* const send = s + len;
930 const U8* x = first_variant;
931
a0d7f935
KW
932 while (x < send) {
933 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
934 if (UNLIKELY(! cur_len)) {
935 return FALSE;
936 }
937 x += cur_len;
9f2abfde 938 }
33756530 939 }
9f2abfde
KW
940
941 return TRUE;
942}
943
944/*
5ff889fb
KW
945
946=for apidoc is_utf8_string_loc
947
2717076a 948Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 949case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 950"utf8ness success") in the C<ep> pointer.
5ff889fb 951
2717076a 952See also C<L</is_utf8_string_loclen>>.
5ff889fb 953
3964c812
KW
954=cut
955*/
956
957#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
958
959/*
960
5ff889fb
KW
961=for apidoc is_utf8_string_loclen
962
2717076a 963Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 964case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 965"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 966encoded characters in the C<el> pointer.
5ff889fb 967
2717076a 968See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
969
970=cut
971*/
972
56e4cf64 973PERL_STATIC_INLINE bool
33756530 974Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 975{
33756530 976 const U8 * first_variant;
5ff889fb
KW
977
978 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
979
33756530
KW
980 if (len == 0) {
981 len = strlen((const char *) s);
982 }
983
984 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
985 if (el)
986 *el = len;
987
988 if (ep) {
989 *ep = s + len;
990 }
991
992 return TRUE;
993 }
994
995 {
996 const U8* const send = s + len;
997 const U8* x = first_variant;
998 STRLEN outlen = first_variant - s;
999
a0d7f935
KW
1000 while (x < send) {
1001 const STRLEN cur_len = isUTF8_CHAR(x, send);
1002 if (UNLIKELY(! cur_len)) {
1003 break;
1004 }
1005 x += cur_len;
1006 outlen++;
5ff889fb 1007 }
5ff889fb 1008
a0d7f935
KW
1009 if (el)
1010 *el = outlen;
5ff889fb 1011
a0d7f935
KW
1012 if (ep) {
1013 *ep = x;
1014 }
5ff889fb 1015
a0d7f935 1016 return (x == send);
33756530 1017 }
5ff889fb
KW
1018}
1019
1020/*
9f2abfde 1021
44170c9a 1022=for apidoc isUTF8_CHAR
8ed185f9
KW
1023
1024Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1025looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1026that represents some code point; otherwise it evaluates to 0. If non-zero, the
1027value gives how many bytes starting at C<s> comprise the code point's
1028representation. Any bytes remaining before C<e>, but beyond the ones needed to
1029form the first code point in C<s>, are not examined.
1030
13aab5dd 1031The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1032extension to official UTF-8 to represent those higher than the Unicode maximum
1033of 0x10FFFF. That means that this macro is used to efficiently decide if the
1034next few bytes in C<s> is legal UTF-8 for a single character.
1035
1036Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1037defined by Unicode to be fully interchangeable across applications;
1038C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1039#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1040code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1041
1042Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1043C<L</is_utf8_string_loclen>> to check entire strings.
1044
13aab5dd
KW
1045Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1046machines) is a valid UTF-8 character.
8ed185f9
KW
1047
1048=cut
1049
1050This uses an adaptation of the table and algorithm given in
f6521f7c 1051https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1052documentation of the original version. A copyright notice for the original
1053version is given at the beginning of this file. The Perl adapation is
71525f77 1054documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1055
1056*/
1057
1058PERL_STATIC_INLINE Size_t
c9182d9c 1059Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1060{
1061 const U8 * s = s0;
1062 UV state = 0;
1063
1064 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1065
1066 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1067 * code point, which can be returned immediately. Otherwise, it is either
1068 * malformed, or for the start byte FF which the dfa doesn't handle (except
1069 * on 32-bit ASCII platforms where it trivially is an error). Call a
1070 * helper function for the other platforms. */
1071
1072 while (s < e && LIKELY(state != 1)) {
71525f77 1073 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1074 + state
71525f77 1075 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1076 if (state != 0) {
1077 s++;
1078 continue;
1079 }
1080
1081 return s - s0 + 1;
1082 }
1083
1084#if defined(UV_IS_QUAD) || defined(EBCDIC)
1085
1086 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1087 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1088 }
1089
1090#endif
1091
1092 return 0;
1093}
1094
1095/*
1096
67049a5f
KW
1097=for apidoc isSTRICT_UTF8_CHAR
1098
1099Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1100looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1101Unicode code point completely acceptable for open interchange between all
1102applications; otherwise it evaluates to 0. If non-zero, the value gives how
1103many bytes starting at C<s> comprise the code point's representation. Any
1104bytes remaining before C<e>, but beyond the ones needed to form the first code
1105point in C<s>, are not examined.
1106
1107The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1108be a surrogate nor a non-character code point. Thus this excludes any code
1109point from Perl's extended UTF-8.
1110
1111This is used to efficiently decide if the next few bytes in C<s> is
1112legal Unicode-acceptable UTF-8 for a single character.
1113
1114Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1115#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1116code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1117and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1118
1119Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1120C<L</is_strict_utf8_string_loclen>> to check entire strings.
1121
1122=cut
1123
1124This uses an adaptation of the tables and algorithm given in
f6521f7c 1125https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1126documentation of the original version. A copyright notice for the original
1127version is given at the beginning of this file. The Perl adapation is
1128documented at the definition of strict_extended_utf8_dfa_tab[].
1129
1130*/
1131
1132PERL_STATIC_INLINE Size_t
c9182d9c 1133Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1134{
1135 const U8 * s = s0;
1136 UV state = 0;
1137
1138 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1139
1140 while (s < e && LIKELY(state != 1)) {
71525f77 1141 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1142
1143 if (state != 0) {
1144 s++;
1145 continue;
1146 }
1147
1148 return s - s0 + 1;
1149 }
1150
1151#ifndef EBCDIC
1152
1153 /* The dfa above drops out for certain Hanguls; handle them specially */
1154 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1155 return 3;
1156 }
1157
1158#endif
1159
1160 return 0;
1161}
1162
1163/*
1164
44170c9a 1165=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1166
1167Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1168looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1169Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1170the value gives how many bytes starting at C<s> comprise the code point's
1171representation. Any bytes remaining before C<e>, but beyond the ones needed to
1172form the first code point in C<s>, are not examined.
1173
1174The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1175differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1176code points. This corresponds to
1177L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1178which said that non-character code points are merely discouraged rather than
1179completely forbidden in open interchange. See
1180L<perlunicode/Noncharacter code points>.
1181
1182Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1183C<L</isUTF8_CHAR_flags>> for a more customized definition.
1184
1185Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1186C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1187
1188=cut
1189
1190This uses an adaptation of the tables and algorithm given in
f6521f7c 1191https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1192documentation of the original version. A copyright notice for the original
1193version is given at the beginning of this file. The Perl adapation is
71525f77 1194documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1195
1196*/
1197
1198PERL_STATIC_INLINE Size_t
c9182d9c 1199Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1200{
1201 const U8 * s = s0;
1202 UV state = 0;
1203
1204 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1205
1206 while (s < e && LIKELY(state != 1)) {
71525f77 1207 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1208
1209 if (state != 0) {
1210 s++;
1211 continue;
1212 }
1213
1214 return s - s0 + 1;
1215 }
1216
1217 return 0;
1218}
1219
1220/*
1221
9f2abfde
KW
1222=for apidoc is_strict_utf8_string_loc
1223
1224Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1225case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1226"utf8ness success") in the C<ep> pointer.
1227
1228See also C<L</is_strict_utf8_string_loclen>>.
1229
1230=cut
1231*/
1232
1233#define is_strict_utf8_string_loc(s, len, ep) \
1234 is_strict_utf8_string_loclen(s, len, ep, 0)
1235
1236/*
1237
1238=for apidoc is_strict_utf8_string_loclen
1239
1240Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1241case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1242"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1243encoded characters in the C<el> pointer.
1244
1245See also C<L</is_strict_utf8_string_loc>>.
1246
1247=cut
1248*/
1249
1250PERL_STATIC_INLINE bool
c9182d9c 1251Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1252{
33756530 1253 const U8 * first_variant;
9f2abfde
KW
1254
1255 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1256
33756530
KW
1257 if (len == 0) {
1258 len = strlen((const char *) s);
1259 }
1260
1261 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1262 if (el)
1263 *el = len;
1264
1265 if (ep) {
1266 *ep = s + len;
1267 }
1268
1269 return TRUE;
1270 }
1271
1272 {
1273 const U8* const send = s + len;
1274 const U8* x = first_variant;
1275 STRLEN outlen = first_variant - s;
1276
a0d7f935
KW
1277 while (x < send) {
1278 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1279 if (UNLIKELY(! cur_len)) {
1280 break;
1281 }
1282 x += cur_len;
1283 outlen++;
9f2abfde 1284 }
9f2abfde 1285
a0d7f935
KW
1286 if (el)
1287 *el = outlen;
9f2abfde 1288
a0d7f935
KW
1289 if (ep) {
1290 *ep = x;
1291 }
9f2abfde 1292
a0d7f935 1293 return (x == send);
33756530 1294 }
9f2abfde
KW
1295}
1296
1297/*
1298
1299=for apidoc is_c9strict_utf8_string_loc
1300
1301Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1302the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1303"utf8ness success") in the C<ep> pointer.
1304
1305See also C<L</is_c9strict_utf8_string_loclen>>.
1306
1307=cut
1308*/
1309
1310#define is_c9strict_utf8_string_loc(s, len, ep) \
1311 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1312
1313/*
1314
1315=for apidoc is_c9strict_utf8_string_loclen
1316
1317Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1318the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1319"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1320characters in the C<el> pointer.
1321
1322See also C<L</is_c9strict_utf8_string_loc>>.
1323
1324=cut
1325*/
1326
1327PERL_STATIC_INLINE bool
c9182d9c 1328Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1329{
33756530 1330 const U8 * first_variant;
9f2abfde
KW
1331
1332 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1333
33756530
KW
1334 if (len == 0) {
1335 len = strlen((const char *) s);
1336 }
1337
1338 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1339 if (el)
1340 *el = len;
1341
1342 if (ep) {
1343 *ep = s + len;
1344 }
1345
1346 return TRUE;
1347 }
1348
1349 {
1350 const U8* const send = s + len;
1351 const U8* x = first_variant;
1352 STRLEN outlen = first_variant - s;
1353
a0d7f935
KW
1354 while (x < send) {
1355 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1356 if (UNLIKELY(! cur_len)) {
1357 break;
1358 }
1359 x += cur_len;
1360 outlen++;
9f2abfde 1361 }
9f2abfde 1362
a0d7f935
KW
1363 if (el)
1364 *el = outlen;
9f2abfde 1365
a0d7f935
KW
1366 if (ep) {
1367 *ep = x;
1368 }
9f2abfde 1369
a0d7f935 1370 return (x == send);
33756530 1371 }
9f2abfde
KW
1372}
1373
1374/*
1375
1376=for apidoc is_utf8_string_loc_flags
1377
1378Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1379case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1380"utf8ness success") in the C<ep> pointer.
1381
1382See also C<L</is_utf8_string_loclen_flags>>.
1383
1384=cut
1385*/
1386
1387#define is_utf8_string_loc_flags(s, len, ep, flags) \
1388 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1389
1390
1391/* The above 3 actual functions could have been moved into the more general one
1392 * just below, and made #defines that call it with the right 'flags'. They are
1393 * currently kept separate to increase their chances of getting inlined */
1394
1395/*
1396
1397=for apidoc is_utf8_string_loclen_flags
1398
1399Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1400case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1401"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1402encoded characters in the C<el> pointer.
1403
1404See also C<L</is_utf8_string_loc_flags>>.
1405
1406=cut
1407*/
1408
1409PERL_STATIC_INLINE bool
c9182d9c 1410Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1411{
33756530 1412 const U8 * first_variant;
9f2abfde
KW
1413
1414 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1415 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1416 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1417
f60f61fd 1418 if (len == 0) {
a0d7f935 1419 len = strlen((const char *) s);
f60f61fd
KW
1420 }
1421
9f2abfde
KW
1422 if (flags == 0) {
1423 return is_utf8_string_loclen(s, len, ep, el);
1424 }
1425
d044b7a7 1426 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1427 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1428 {
1429 return is_strict_utf8_string_loclen(s, len, ep, el);
1430 }
1431
d044b7a7 1432 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1433 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1434 {
1435 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1436 }
1437
33756530
KW
1438 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1439 if (el)
1440 *el = len;
1441
1442 if (ep) {
1443 *ep = s + len;
1444 }
1445
1446 return TRUE;
1447 }
1448
1449 {
1450 const U8* send = s + len;
1451 const U8* x = first_variant;
1452 STRLEN outlen = first_variant - s;
1453
a0d7f935
KW
1454 while (x < send) {
1455 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1456 if (UNLIKELY(! cur_len)) {
1457 break;
1458 }
1459 x += cur_len;
1460 outlen++;
9f2abfde 1461 }
9f2abfde 1462
a0d7f935
KW
1463 if (el)
1464 *el = outlen;
9f2abfde 1465
a0d7f935
KW
1466 if (ep) {
1467 *ep = x;
1468 }
9f2abfde 1469
a0d7f935 1470 return (x == send);
33756530 1471 }
9f2abfde
KW
1472}
1473
1474/*
7c93d8f0
KW
1475=for apidoc utf8_distance
1476
1477Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1478and C<b>.
1479
1480WARNING: use only if you *know* that the pointers point inside the
1481same UTF-8 buffer.
1482
1483=cut
1484*/
1485
1486PERL_STATIC_INLINE IV
1487Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1488{
1489 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1490
1491 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1492}
1493
1494/*
1495=for apidoc utf8_hop
1496
1497Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1498forward or backward.
1499
1500WARNING: do not use the following unless you *know* C<off> is within
1501the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1502on the first byte of character or just after the last byte of a character.
1503
1504=cut
1505*/
1506
1507PERL_STATIC_INLINE U8 *
1508Perl_utf8_hop(const U8 *s, SSize_t off)
1509{
1510 PERL_ARGS_ASSERT_UTF8_HOP;
1511
1512 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1513 * the bitops (especially ~) can create illegal UTF-8.
1514 * In other words: in Perl UTF-8 is not just for Unicode. */
1515
1516 if (off >= 0) {
1517 while (off--)
1518 s += UTF8SKIP(s);
1519 }
1520 else {
1521 while (off++) {
1522 s--;
1523 while (UTF8_IS_CONTINUATION(*s))
1524 s--;
1525 }
1526 }
e099ea69 1527 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1528 return (U8 *)s;
e099ea69 1529 GCC_DIAG_RESTORE
7c93d8f0
KW
1530}
1531
4dab108f 1532/*
65df57a8
TC
1533=for apidoc utf8_hop_forward
1534
1535Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1536forward.
1537
1538C<off> must be non-negative.
1539
1540C<s> must be before or equal to C<end>.
1541
1542When moving forward it will not move beyond C<end>.
1543
1544Will not exceed this limit even if the string is not valid "UTF-8".
1545
1546=cut
1547*/
1548
1549PERL_STATIC_INLINE U8 *
1550Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1551{
1552 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1553
1554 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1555 * the bitops (especially ~) can create illegal UTF-8.
1556 * In other words: in Perl UTF-8 is not just for Unicode. */
1557
1558 assert(s <= end);
1559 assert(off >= 0);
1560
1561 while (off--) {
1562 STRLEN skip = UTF8SKIP(s);
de979548 1563 if ((STRLEN)(end - s) <= skip) {
e099ea69 1564 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1565 return (U8 *)end;
e099ea69 1566 GCC_DIAG_RESTORE
de979548 1567 }
65df57a8
TC
1568 s += skip;
1569 }
1570
e099ea69 1571 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1572 return (U8 *)s;
e099ea69 1573 GCC_DIAG_RESTORE
65df57a8
TC
1574}
1575
1576/*
1577=for apidoc utf8_hop_back
1578
1579Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1580backward.
1581
1582C<off> must be non-positive.
1583
1584C<s> must be after or equal to C<start>.
1585
1586When moving backward it will not move before C<start>.
1587
1588Will not exceed this limit even if the string is not valid "UTF-8".
1589
1590=cut
1591*/
1592
1593PERL_STATIC_INLINE U8 *
1594Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1595{
1596 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1597
1598 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1599 * the bitops (especially ~) can create illegal UTF-8.
1600 * In other words: in Perl UTF-8 is not just for Unicode. */
1601
1602 assert(start <= s);
1603 assert(off <= 0);
1604
1605 while (off++ && s > start) {
e7185695 1606 do {
65df57a8 1607 s--;
e7185695 1608 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 1609 }
f6521f7c 1610
e099ea69 1611 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1612 return (U8 *)s;
e099ea69 1613 GCC_DIAG_RESTORE
65df57a8
TC
1614}
1615
1616/*
1617=for apidoc utf8_hop_safe
1618
1619Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1620either forward or backward.
1621
1622When moving backward it will not move before C<start>.
1623
1624When moving forward it will not move beyond C<end>.
1625
1626Will not exceed those limits even if the string is not valid "UTF-8".
1627
1628=cut
1629*/
1630
1631PERL_STATIC_INLINE U8 *
1632Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1633{
1634 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1635
1636 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1637 * the bitops (especially ~) can create illegal UTF-8.
1638 * In other words: in Perl UTF-8 is not just for Unicode. */
1639
1640 assert(start <= s && s <= end);
1641
1642 if (off >= 0) {
1643 return utf8_hop_forward(s, off, end);
1644 }
1645 else {
1646 return utf8_hop_back(s, off, start);
1647 }
1648}
1649
1650/*
4dab108f
KW
1651
1652=for apidoc is_utf8_valid_partial_char
1653
6cbb9248
KW
1654Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1655S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1656points. Otherwise, it returns 1 if there exists at least one non-empty
1657sequence of bytes that when appended to sequence C<s>, starting at position
1658C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1659otherwise returns 0.
1660
1661In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1662point.
1663
1664This is useful when a fixed-length buffer is being tested for being well-formed
1665UTF-8, but the final few bytes in it don't comprise a full character; that is,
1666it is split somewhere in the middle of the final code point's UTF-8
1667representation. (Presumably when the buffer is refreshed with the next chunk
1668of data, the new first bytes will complete the partial code point.) This
1669function is used to verify that the final bytes in the current buffer are in
1670fact the legal beginning of some code point, so that if they aren't, the
1671failure can be signalled without having to wait for the next read.
4dab108f
KW
1672
1673=cut
1674*/
2717076a
KW
1675#define is_utf8_valid_partial_char(s, e) \
1676 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1677
1678/*
1679
1680=for apidoc is_utf8_valid_partial_char_flags
1681
1682Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1683or not the input is a valid UTF-8 encoded partial character, but it takes an
1684extra parameter, C<flags>, which can further restrict which code points are
1685considered valid.
1686
1687If C<flags> is 0, this behaves identically to
1688C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1689of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1690there is any sequence of bytes that can complete the input partial character in
1691such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1692TRUE; otherwise FALSE. Non character code points cannot be determined based on
1693partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1694determined from just the first one or two bytes.
1695
1696=cut
1697 */
1698
56e4cf64 1699PERL_STATIC_INLINE bool
c9182d9c 1700Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1701{
f1c999a7 1702 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1703
f1c999a7 1704 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1705 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1706
8875bd48 1707 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1708 return FALSE;
1709 }
1710
1376b35c 1711 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1712}
1713
8bc127bf
KW
1714/*
1715
1716=for apidoc is_utf8_fixed_width_buf_flags
1717
1718Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1719is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1720otherwise it returns FALSE.
1721
1722If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1723without restriction. If the final few bytes of the buffer do not form a
1724complete code point, this will return TRUE anyway, provided that
1725C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1726
1727If C<flags> in non-zero, it can be any combination of the
1728C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1729same meanings.
1730
1731This function differs from C<L</is_utf8_string_flags>> only in that the latter
1732returns FALSE if the final few bytes of the string don't form a complete code
1733point.
1734
1735=cut
1736 */
1737#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1738 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1739
1740/*
1741
1742=for apidoc is_utf8_fixed_width_buf_loc_flags
1743
1744Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1745failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1746to the beginning of any partial character at the end of the buffer; if there is
1747no partial character C<*ep> will contain C<s>+C<len>.
1748
1749See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1750
1751=cut
1752*/
1753
1754#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1755 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1756
1757/*
1758
1759=for apidoc is_utf8_fixed_width_buf_loclen_flags
1760
1761Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1762complete, valid characters found in the C<el> pointer.
1763
1764=cut
1765*/
1766
1767PERL_STATIC_INLINE bool
c9182d9c 1768Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1769 STRLEN len,
8bc127bf
KW
1770 const U8 **ep,
1771 STRLEN *el,
1772 const U32 flags)
1773{
1774 const U8 * maybe_partial;
1775
1776 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1777
1778 if (! ep) {
1779 ep = &maybe_partial;
1780 }
1781
1782 /* If it's entirely valid, return that; otherwise see if the only error is
1783 * that the final few bytes are for a partial character */
1784 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1785 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1786}
1787
e6a4ffc3 1788PERL_STATIC_INLINE UV
c9182d9c 1789Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1790 STRLEN curlen,
1791 STRLEN *retlen,
1792 const U32 flags,
1793 U32 * errors,
1794 AV ** msgs)
1795{
1796 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1797 * simple cases, and, if necessary calls a helper function to deal with the
1798 * more complex ones. Almost all well-formed non-problematic code points
1799 * are considered simple, so that it's unlikely that the helper function
1800 * will need to be called.
1801 *
1802 * This is an adaptation of the tables and algorithm given in
f6521f7c 1803 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
1804 * comprehensive documentation of the original version. A copyright notice
1805 * for the original version is given at the beginning of this file. The
71525f77 1806 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1807 */
1808
1809 const U8 * const s0 = s;
1810 const U8 * send = s0 + curlen;
f73cfe7d 1811 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1812 UV state = 0;
1813
1814 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1815
1816 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1817 * non-problematic code point, which can be returned immediately.
1818 * Otherwise we call a helper function to figure out the more complicated
1819 * cases. */
1820
1821 while (s < send && LIKELY(state != 1)) {
71525f77 1822 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1823
1824 uv = (state == 0)
1825 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1826 : UTF8_ACCUMULATE(uv, *s);
71525f77 1827 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1828
1829 if (state != 0) {
1830 s++;
1831 continue;
1832 }
1833
1834 if (retlen) {
1835 *retlen = s - s0 + 1;
1836 }
1837 if (errors) {
1838 *errors = 0;
1839 }
1840 if (msgs) {
1841 *msgs = NULL;
1842 }
1843
cfebc7aa 1844 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1845 }
1846
1847 /* Here is potentially problematic. Use the full mechanism */
1848 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1849}
1850
82651abe 1851PERL_STATIC_INLINE UV
9a9a6c98 1852Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1853{
9a9a6c98 1854 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1855
1856 assert(s < send);
1857
1858 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1859
1860 /* EMPTY is not really allowed, and asserts on debugging builds. But
1861 * on non-debugging we have to deal with it, and this causes it to
1862 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1863 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1864 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1865 }
1866 else {
1867 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1868 if (retlen && ret == 0 && *s != '\0') {
1869 *retlen = (STRLEN) -1;
1870 }
1871
1872 return ret;
1873 }
1874}
1875
c8028aa6
TC
1876/* ------------------------------- perl.h ----------------------------- */
1877
1878/*
dcccc8ff
KW
1879=head1 Miscellaneous Functions
1880
44170c9a 1881=for apidoc is_safe_syscall
c8028aa6 1882
1a0efc9a
KW
1883Test that the given C<pv> (with length C<len>) doesn't contain any internal
1884C<NUL> characters.
1885If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1886category, and return FALSE.
c8028aa6
TC
1887
1888Return TRUE if the name is safe.
1889
1a0efc9a
KW
1890C<what> and C<op_name> are used in any warning.
1891
796b6530 1892Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1893
1894=cut
1895*/
1896
1897PERL_STATIC_INLINE bool
ffd62fc2
KW
1898Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1899{
c8028aa6
TC
1900 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1901 * perl itself uses xce*() functions which accept 8-bit strings.
1902 */
1903
1904 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1905
6c4650b3 1906 if (len > 1) {
c8028aa6 1907 char *null_at;
41188aa0 1908 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1909 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1910 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1911 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1912 what, op_name, pv, null_at+1);
c8028aa6
TC
1913 return FALSE;
1914 }
1915 }
1916
1917 return TRUE;
1918}
1919
1920/*
7cb3f959
TC
1921
1922Return true if the supplied filename has a newline character
fa6c7d00 1923immediately before the first (hopefully only) NUL.
7cb3f959
TC
1924
1925My original look at this incorrectly used the len from SvPV(), but
1926that's incorrect, since we allow for a NUL in pv[len-1].
1927
1928So instead, strlen() and work from there.
1929
1930This allow for the user reading a filename, forgetting to chomp it,
1931then calling:
1932
1933 open my $foo, "$file\0";
1934
1935*/
1936
1937#ifdef PERL_CORE
1938
1939PERL_STATIC_INLINE bool
ffd62fc2
KW
1940S_should_warn_nl(const char *pv)
1941{
7cb3f959
TC
1942 STRLEN len;
1943
1944 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1945
1946 len = strlen(pv);
1947
1948 return len > 0 && pv[len-1] == '\n';
1949}
1950
1951#endif
1952
3a019afd
KW
1953#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1954
1955PERL_STATIC_INLINE bool
1956S_lossless_NV_to_IV(const NV nv, IV *ivp)
1957{
1958 /* This function determines if the input NV 'nv' may be converted without
1959 * loss of data to an IV. If not, it returns FALSE taking no other action.
1960 * But if it is possible, it does the conversion, returning TRUE, and
1961 * storing the converted result in '*ivp' */
1962
1963 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1964
1965# if defined(Perl_isnan)
1966
1967 if (UNLIKELY(Perl_isnan(nv))) {
1968 return FALSE;
1969 }
1970
1971# endif
1972
1973 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1974 return FALSE;
1975 }
1976
1977 if ((IV) nv != nv) {
1978 return FALSE;
1979 }
1980
1981 *ivp = (IV) nv;
1982 return TRUE;
1983}
1984
1985#endif
1986
14fe5f8a
KW
1987/* ------------------ regcomp.c, toke.c ------------ */
1988
1989#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
1990
1991/*
1992 - regcurly - a little FSA that accepts {\d+,?\d*}
1993 Pulled from reg.c.
1994 */
6bbb95d4 1995PERL_STATIC_INLINE bool
14fe5f8a
KW
1996S_regcurly(const char *s)
1997{
1998 PERL_ARGS_ASSERT_REGCURLY;
1999
2000 if (*s++ != '{')
2001 return FALSE;
2002 if (!isDIGIT(*s))
2003 return FALSE;
2004 while (isDIGIT(*s))
2005 s++;
2006 if (*s == ',') {
2007 s++;
2008 while (isDIGIT(*s))
2009 s++;
2010 }
2011
2012 return *s == '}';
2013}
2014
2015#endif
2016
81d52ecd
JH
2017/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2018
94b0cb42
KW
2019#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2020
81d52ecd
JH
2021#define MAX_CHARSET_NAME_LENGTH 2
2022
2023PERL_STATIC_INLINE const char *
94b0cb42 2024S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2025{
94b0cb42
KW
2026 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2027
81d52ecd
JH
2028 /* Returns a string that corresponds to the name of the regex character set
2029 * given by 'flags', and *lenp is set the length of that string, which
2030 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2031
2032 *lenp = 1;
2033 switch (get_regex_charset(flags)) {
2034 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2035 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2036 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2037 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2038 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2039 *lenp = 2;
2040 return ASCII_MORE_RESTRICT_PAT_MODS;
2041 }
2042 /* The NOT_REACHED; hides an assert() which has a rather complex
2043 * definition in perl.h. */
2044 NOT_REACHED; /* NOTREACHED */
2045 return "?"; /* Unknown */
2046}
2047
94b0cb42
KW
2048#endif
2049
7cb3f959 2050/*
ed382232
TC
2051
2052Return false if any get magic is on the SV other than taint magic.
2053
2054*/
2055
2056PERL_STATIC_INLINE bool
ffd62fc2
KW
2057Perl_sv_only_taint_gmagic(SV *sv)
2058{
ed382232
TC
2059 MAGIC *mg = SvMAGIC(sv);
2060
2061 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2062
2063 while (mg) {
2064 if (mg->mg_type != PERL_MAGIC_taint
2065 && !(mg->mg_flags & MGf_GSKIP)
2066 && mg->mg_virtual->svt_get) {
2067 return FALSE;
2068 }
2069 mg = mg->mg_moremagic;
2070 }
2071
2072 return TRUE;
2073}
2074
ed8ff0f3
DM
2075/* ------------------ cop.h ------------------------------------------- */
2076
5b6f7443
DM
2077/* implement GIMME_V() macro */
2078
2079PERL_STATIC_INLINE U8
2080Perl_gimme_V(pTHX)
2081{
2082 I32 cxix;
2083 U8 gimme = (PL_op->op_flags & OPf_WANT);
2084
2085 if (gimme)
2086 return gimme;
2087 cxix = PL_curstackinfo->si_cxsubix;
2088 if (cxix < 0)
2089 return G_VOID;
2090 assert(cxstack[cxix].blk_gimme & G_WANT);
2091 return (cxstack[cxix].blk_gimme & G_WANT);
2092}
2093
ed8ff0f3
DM
2094
2095/* Enter a block. Push a new base context and return its address. */
2096
2097PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2098Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2099{
2100 PERL_CONTEXT * cx;
2101
2102 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2103
2104 CXINC;
2105 cx = CX_CUR();
2106 cx->cx_type = type;
2107 cx->blk_gimme = gimme;
2108 cx->blk_oldsaveix = saveix;
4caf7d8c 2109 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2110 cx->blk_oldcop = PL_curcop;
4caf7d8c 2111 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2112 cx->blk_oldscopesp = PL_scopestack_ix;
2113 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2114 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2115
2116 PL_tmps_floor = PL_tmps_ix;
2117 CX_DEBUG(cx, "PUSH");
2118 return cx;
2119}
2120
2121
2122/* Exit a block (RETURN and LAST). */
2123
2124PERL_STATIC_INLINE void
c9182d9c 2125Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2126{
2127 PERL_ARGS_ASSERT_CX_POPBLOCK;
2128
2129 CX_DEBUG(cx, "POP");
2130 /* these 3 are common to cx_popblock and cx_topblock */
2131 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2132 PL_scopestack_ix = cx->blk_oldscopesp;
2133 PL_curpm = cx->blk_oldpm;
2134
2135 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2136 * and leaves a CX entry lying around for repeated use, so
2137 * skip for multicall */ \
2138 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2139 || PL_savestack_ix == cx->blk_oldsaveix);
2140 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2141 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2142}
2143
2144/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2145 * Whereas cx_popblock() restores the state to the point just before
2146 * cx_pushblock() was called, cx_topblock() restores it to the point just
2147 * *after* cx_pushblock() was called. */
2148
2149PERL_STATIC_INLINE void
c9182d9c 2150Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2151{
2152 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2153
2154 CX_DEBUG(cx, "TOP");
2155 /* these 3 are common to cx_popblock and cx_topblock */
2156 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2157 PL_scopestack_ix = cx->blk_oldscopesp;
2158 PL_curpm = cx->blk_oldpm;
2159
2160 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2161}
2162
2163
a73d8813 2164PERL_STATIC_INLINE void
c9182d9c 2165Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2166{
2167 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2168
2169 PERL_ARGS_ASSERT_CX_PUSHSUB;
2170
3f6bd23a 2171 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2172 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2173 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2174 cx->blk_sub.cv = cv;
2175 cx->blk_sub.olddepth = CvDEPTH(cv);
2176 cx->blk_sub.prevcomppad = PL_comppad;
2177 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2178 cx->blk_sub.retop = retop;
2179 SvREFCNT_inc_simple_void_NN(cv);
2180 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2181}
2182
2183
2184/* subsets of cx_popsub() */
2185
2186PERL_STATIC_INLINE void
c9182d9c 2187Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2188{
2189 CV *cv;
2190
2191 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2192 assert(CxTYPE(cx) == CXt_SUB);
2193
2194 PL_comppad = cx->blk_sub.prevcomppad;
2195 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2196 cv = cx->blk_sub.cv;
2197 CvDEPTH(cv) = cx->blk_sub.olddepth;
2198 cx->blk_sub.cv = NULL;
2199 SvREFCNT_dec(cv);
5b6f7443 2200 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2201}
2202
2203
2204/* handle the @_ part of leaving a sub */
2205
2206PERL_STATIC_INLINE void
c9182d9c 2207Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2208{
2209 AV *av;
2210
2211 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2212 assert(CxTYPE(cx) == CXt_SUB);
2213 assert(AvARRAY(MUTABLE_AV(
2214 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2215 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2216
2217 CX_POP_SAVEARRAY(cx);
2218 av = MUTABLE_AV(PAD_SVl(0));
2219 if (UNLIKELY(AvREAL(av)))
2220 /* abandon @_ if it got reified */
2221 clear_defarray(av, 0);
2222 else {
2223 CLEAR_ARGARRAY(av);
2224 }
2225}
2226
2227
2228PERL_STATIC_INLINE void
c9182d9c 2229Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2230{
2231 PERL_ARGS_ASSERT_CX_POPSUB;
2232 assert(CxTYPE(cx) == CXt_SUB);
2233
3f6bd23a 2234 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2235
2236 if (CxHASARGS(cx))
2237 cx_popsub_args(cx);
2238 cx_popsub_common(cx);
2239}
2240
2241
6a7d52cc 2242PERL_STATIC_INLINE void
c9182d9c 2243Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2244{
2245 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2246
5b6f7443
DM
2247 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2248 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2249 cx->blk_format.cv = cv;
2250 cx->blk_format.retop = retop;
2251 cx->blk_format.gv = gv;
2252 cx->blk_format.dfoutgv = PL_defoutgv;
2253 cx->blk_format.prevcomppad = PL_comppad;
2254 cx->blk_u16 = 0;
2255
2256 SvREFCNT_inc_simple_void_NN(cv);
2257 CvDEPTH(cv)++;
2258 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2259}
2260
2261
2262PERL_STATIC_INLINE void
c9182d9c 2263Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2264{
2265 CV *cv;
2266 GV *dfout;
2267
2268 PERL_ARGS_ASSERT_CX_POPFORMAT;
2269 assert(CxTYPE(cx) == CXt_FORMAT);
2270
2271 dfout = cx->blk_format.dfoutgv;
2272 setdefout(dfout);
2273 cx->blk_format.dfoutgv = NULL;
2274 SvREFCNT_dec_NN(dfout);
2275
2276 PL_comppad = cx->blk_format.prevcomppad;
2277 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2278 cv = cx->blk_format.cv;
2279 cx->blk_format.cv = NULL;
2280 --CvDEPTH(cv);
2281 SvREFCNT_dec_NN(cv);
5b6f7443 2282 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2283}
2284
2285
13febba5 2286PERL_STATIC_INLINE void
c9182d9c 2287Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2288{
2289 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2290
5b6f7443
DM
2291 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2292 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
13febba5
DM
2293 cx->blk_eval.retop = retop;
2294 cx->blk_eval.old_namesv = namesv;
2295 cx->blk_eval.old_eval_root = PL_eval_root;
2296 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2297 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2298 cx->blk_eval.cur_top_env = PL_top_env;
2299
4c57ced5 2300 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2301 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2302 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2303}
2304
2305
2306PERL_STATIC_INLINE void
c9182d9c 2307Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2308{
2309 SV *sv;
2310
2311 PERL_ARGS_ASSERT_CX_POPEVAL;
2312 assert(CxTYPE(cx) == CXt_EVAL);
2313
2314 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2315 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2316 PL_eval_root = cx->blk_eval.old_eval_root;
2317 sv = cx->blk_eval.cur_text;
4c57ced5 2318 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2319 cx->blk_eval.cur_text = NULL;
2320 SvREFCNT_dec_NN(sv);
2321 }
2322
2323 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2324 if (sv) {
2325 cx->blk_eval.old_namesv = NULL;
2326 SvREFCNT_dec_NN(sv);
2327 }
5b6f7443 2328 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2329}
6a7d52cc 2330
a73d8813 2331
d1b6bf72
DM
2332/* push a plain loop, i.e.
2333 * { block }
2334 * while (cond) { block }
2335 * for (init;cond;continue) { block }
2336 * This loop can be last/redo'ed etc.
2337 */
2338
2339PERL_STATIC_INLINE void
c9182d9c 2340Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2341{
2342 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2343 cx->blk_loop.my_op = cLOOP;
2344}
2345
2346
2347/* push a true for loop, i.e.
2348 * for var (list) { block }
2349 */
2350
2351PERL_STATIC_INLINE void
c9182d9c 2352Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2353{
2354 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2355
2356 /* this one line is common with cx_pushloop_plain */
2357 cx->blk_loop.my_op = cLOOP;
2358
2359 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2360 cx->blk_loop.itersave = itersave;
2361#ifdef USE_ITHREADS
2362 cx->blk_loop.oldcomppad = PL_comppad;
2363#endif
2364}
2365
2366
2367/* pop all loop types, including plain */
2368
2369PERL_STATIC_INLINE void
c9182d9c 2370Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2371{
2372 PERL_ARGS_ASSERT_CX_POPLOOP;
2373
2374 assert(CxTYPE_is_LOOP(cx));
2375 if ( CxTYPE(cx) == CXt_LOOP_ARY
2376 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2377 {
2378 /* Free ary or cur. This assumes that state_u.ary.ary
2379 * aligns with state_u.lazysv.cur. See cx_dup() */
2380 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2381 cx->blk_loop.state_u.lazysv.cur = NULL;
2382 SvREFCNT_dec_NN(sv);
2383 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2384 sv = cx->blk_loop.state_u.lazysv.end;
2385 cx->blk_loop.state_u.lazysv.end = NULL;
2386 SvREFCNT_dec_NN(sv);
2387 }
2388 }
2389 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2390 SV *cursv;
2391 SV **svp = (cx)->blk_loop.itervar_u.svp;
2392 if ((cx->cx_type & CXp_FOR_GV))
2393 svp = &GvSV((GV*)svp);
2394 cursv = *svp;
2395 *svp = cx->blk_loop.itersave;
2396 cx->blk_loop.itersave = NULL;
2397 SvREFCNT_dec(cursv);
2398 }
2399}
2400
2a7b7c61
DM
2401
2402PERL_STATIC_INLINE void
c9182d9c 2403Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2404{
7896dde7 2405 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2406
7896dde7 2407 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2408}
2409
2410
2411PERL_STATIC_INLINE void
c9182d9c 2412Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2413{
7896dde7
Z
2414 PERL_ARGS_ASSERT_CX_POPWHEN;
2415 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2416
2417 PERL_UNUSED_ARG(cx);
59a14f30 2418 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2419 /* currently NOOP */
2420}
2421
2422
7896dde7 2423PERL_STATIC_INLINE void
c9182d9c 2424Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2425{
2426 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2427
2428 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2429 cx->blk_givwhen.defsv_save = orig_defsv;
2430}
2431
2432
2433PERL_STATIC_INLINE void
c9182d9c 2434Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2435{
2436 SV *sv;
2437
2438 PERL_ARGS_ASSERT_CX_POPGIVEN;
2439 assert(CxTYPE(cx) == CXt_GIVEN);
2440
2441 sv = GvSV(PL_defgv);
2442 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2443 cx->blk_givwhen.defsv_save = NULL;
2444 SvREFCNT_dec(sv);
2445}
2446
ec2c235b
KW
2447/* ------------------ util.h ------------------------------------------- */
2448
2449/*
2450=head1 Miscellaneous Functions
2451
2452=for apidoc foldEQ
2453
2454Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2455same
2456case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2457match themselves and their opposite case counterparts. Non-cased and non-ASCII
2458range bytes match only themselves.
2459
2460=cut
2461*/
2462
2463PERL_STATIC_INLINE I32
2464Perl_foldEQ(const char *s1, const char *s2, I32 len)
2465{
2466 const U8 *a = (const U8 *)s1;
2467 const U8 *b = (const U8 *)s2;
2468
2469 PERL_ARGS_ASSERT_FOLDEQ;
2470
2471 assert(len >= 0);
2472
2473 while (len--) {
2474 if (*a != *b && *a != PL_fold[*b])
2475 return 0;
2476 a++,b++;
2477 }
2478 return 1;
2479}
2480
0f9cb40c 2481PERL_STATIC_INLINE I32
ec2c235b
KW
2482Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2483{
79a1fabd
KW
2484 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2485 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2486 * does not check for this. Nor does it check that the strings each have
2487 * at least 'len' characters. */
ec2c235b
KW
2488
2489 const U8 *a = (const U8 *)s1;
2490 const U8 *b = (const U8 *)s2;
2491
2492 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2493
2494 assert(len >= 0);
2495
2496 while (len--) {
2497 if (*a != *b && *a != PL_fold_latin1[*b]) {
2498 return 0;
2499 }
2500 a++, b++;
2501 }
2502 return 1;
2503}
2504
2505/*
2506=for apidoc foldEQ_locale
2507
2508Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2509same case-insensitively in the current locale; false otherwise.
2510
2511=cut
2512*/
2513
0f9cb40c 2514PERL_STATIC_INLINE I32
ec2c235b
KW
2515Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2516{
2517 dVAR;
2518 const U8 *a = (const U8 *)s1;
2519 const U8 *b = (const U8 *)s2;
2520
2521 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2522
2523 assert(len >= 0);
2524
2525 while (len--) {
2526 if (*a != *b && *a != PL_fold_locale[*b])
2527 return 0;
2528 a++,b++;
2529 }
2530 return 1;
2531}
2532
1ab100a8
KW
2533/*
2534=for apidoc my_strnlen
2535
2536The C library C<strnlen> if available, or a Perl implementation of it.
2537
2538C<my_strnlen()> computes the length of the string, up to C<maxlen>
2539characters. It will will never attempt to address more than C<maxlen>
2540characters, making it suitable for use with strings that are not
2541guaranteed to be NUL-terminated.
2542
2543=cut
2544
2545Description stolen from http://man.openbsd.org/strnlen.3,
2546implementation stolen from PostgreSQL.
2547*/
2548#ifndef HAS_STRNLEN
2549
2550PERL_STATIC_INLINE Size_t
2551Perl_my_strnlen(const char *str, Size_t maxlen)
2552{
2553 const char *end = (char *) memchr(str, '\0', maxlen);
2554
2555 PERL_ARGS_ASSERT_MY_STRNLEN;
2556
2557 if (end == NULL) return maxlen;
2558 return end - str;
2559}
2560
2561#endif
2562
6dba01e2
KW
2563#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2564
2565PERL_STATIC_INLINE void *
2566S_my_memrchr(const char * s, const char c, const STRLEN len)
2567{
2568 /* memrchr(), since many platforms lack it */
2569
2570 const char * t = s + len - 1;
2571
2572 PERL_ARGS_ASSERT_MY_MEMRCHR;
2573
2574 while (t >= s) {
2575 if (*t == c) {
2576 return (void *) t;
2577 }
2578 t--;
2579 }
2580
2581 return NULL;
2582}
2583
2584#endif
2585
ed382232 2586/*
c8028aa6
TC
2587 * ex: set ts=8 sts=4 sw=4 et:
2588 */