This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: regpiece() 0 times anything is 0
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
87306e06 42/*
51b56f5c 43=for apidoc_section AV Handling
87306e06
KW
44=for apidoc av_count
45Returns the number of elements in the array C<av>. This is the true length of
46the array, including any undefined elements. It is always the same as
47S<C<av_top_index(av) + 1>>.
48
49=cut
50*/
51PERL_STATIC_INLINE Size_t
52Perl_av_count(pTHX_ AV *av)
be3a7a5d 53{
87306e06 54 PERL_ARGS_ASSERT_AV_COUNT;
be3a7a5d
KW
55 assert(SvTYPE(av) == SVt_PVAV);
56
87306e06 57 return AvFILL(av) + 1;
be3a7a5d
KW
58}
59
1afe1db1
FC
60/* ------------------------------- cv.h ------------------------------- */
61
fa3e44c0
KW
62/*
63=for apidoc_section CV Handling
64=for apidoc CvGV
65Returns the GV associated with the CV C<sv>, reifying it if necessary.
66
67=cut
68*/
ae77754a 69PERL_STATIC_INLINE GV *
c9182d9c 70Perl_CvGV(pTHX_ CV *sv)
ae77754a 71{
74804ad1
KW
72 PERL_ARGS_ASSERT_CVGV;
73
ae77754a
FC
74 return CvNAMED(sv)
75 ? Perl_cvgv_from_hek(aTHX_ sv)
76 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
77}
78
1afe1db1 79PERL_STATIC_INLINE I32 *
74804ad1 80Perl_CvDEPTH(const CV * const sv)
1afe1db1 81{
74804ad1 82 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 83 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 84
8de47657 85 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
86}
87
d16269d8
PM
88/*
89 CvPROTO returns the prototype as stored, which is not necessarily what
90 the interpreter should be using. Specifically, the interpreter assumes
91 that spaces have been stripped, which has been the case if the prototype
92 was added by toke.c, but is generally not the case if it was added elsewhere.
93 Since we can't enforce the spacelessness at assignment time, this routine
94 provides a temporary copy at parse time with spaces removed.
95 I<orig> is the start of the original buffer, I<len> is the length of the
96 prototype and will be updated when this returns.
97 */
98
5b67adb8 99#ifdef PERL_CORE
d16269d8
PM
100PERL_STATIC_INLINE char *
101S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
102{
103 SV * tmpsv;
104 char * tmps;
105 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
106 tmps = SvPVX(tmpsv);
107 while ((*len)--) {
108 if (!isSPACE(*orig))
109 *tmps++ = *orig;
110 orig++;
111 }
112 *tmps = '\0';
113 *len = tmps - SvPVX(tmpsv);
114 return SvPVX(tmpsv);
115}
5b67adb8 116#endif
d16269d8 117
25fdce4a
FC
118/* ------------------------------- mg.h ------------------------------- */
119
120#if defined(PERL_CORE) || defined(PERL_EXT)
121/* assumes get-magic and stringification have already occurred */
122PERL_STATIC_INLINE STRLEN
123S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
124{
125 assert(mg->mg_type == PERL_MAGIC_regex_global);
126 assert(mg->mg_len != -1);
127 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
128 return (STRLEN)mg->mg_len;
129 else {
130 const STRLEN pos = (STRLEN)mg->mg_len;
131 /* Without this check, we may read past the end of the buffer: */
132 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
133 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
134 }
135}
136#endif
137
03414f05
FC
138/* ------------------------------- pad.h ------------------------------ */
139
140#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
141PERL_STATIC_INLINE bool
b9d5702c 142S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 143{
b9d5702c
KW
144 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
145
03414f05
FC
146 /* is seq within the range _LOW to _HIGH ?
147 * This is complicated by the fact that PL_cop_seqmax
148 * may have wrapped around at some point */
149 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
150 return FALSE; /* not yet introduced */
151
152 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
153 /* in compiling scope */
154 if (
155 (seq > COP_SEQ_RANGE_LOW(pn))
156 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
157 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
158 )
159 return TRUE;
160 }
161 else if (
162 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
163 ?
164 ( seq > COP_SEQ_RANGE_LOW(pn)
165 || seq <= COP_SEQ_RANGE_HIGH(pn))
166
167 : ( seq > COP_SEQ_RANGE_LOW(pn)
168 && seq <= COP_SEQ_RANGE_HIGH(pn))
169 )
170 return TRUE;
171 return FALSE;
172}
173#endif
174
33a4312b
FC
175/* ------------------------------- pp.h ------------------------------- */
176
177PERL_STATIC_INLINE I32
c9182d9c 178Perl_TOPMARK(pTHX)
33a4312b
FC
179{
180 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 181 "MARK top %p %" IVdf "\n",
33a4312b
FC
182 PL_markstack_ptr,
183 (IV)*PL_markstack_ptr)));
184 return *PL_markstack_ptr;
185}
186
187PERL_STATIC_INLINE I32
c9182d9c 188Perl_POPMARK(pTHX)
33a4312b
FC
189{
190 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 191 "MARK pop %p %" IVdf "\n",
33a4312b
FC
192 (PL_markstack_ptr-1),
193 (IV)*(PL_markstack_ptr-1))));
194 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
195 return *PL_markstack_ptr--;
196}
197
8d919b0a
FC
198/* ----------------------------- regexp.h ----------------------------- */
199
200PERL_STATIC_INLINE struct regexp *
c9182d9c 201Perl_ReANY(const REGEXP * const re)
8d919b0a 202{
df6b4bd5 203 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
204
205 PERL_ARGS_ASSERT_REANY;
8d919b0a 206 assert(isREGEXP(re));
bdef45de 207
df6b4bd5
DM
208 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
209 : (struct regexp *)p;
8d919b0a
FC
210}
211
27669aa4
FC
212/* ------------------------------- sv.h ------------------------------- */
213
a887b094 214PERL_STATIC_INLINE bool
9d0469db 215Perl_SvTRUE(pTHX_ SV *sv) {
6aa30f5e 216 if (UNLIKELY(sv == NULL))
9d0469db 217 return FALSE;
7b9b9e32
PE
218 SvGETMAGIC(sv);
219 return SvTRUE_nomg_NN(sv);
a887b094
PE
220}
221
27669aa4 222PERL_STATIC_INLINE SV *
c9182d9c 223Perl_SvREFCNT_inc(SV *sv)
27669aa4 224{
2439e033 225 if (LIKELY(sv != NULL))
27669aa4
FC
226 SvREFCNT(sv)++;
227 return sv;
228}
229PERL_STATIC_INLINE SV *
c9182d9c 230Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 231{
3f2f854a
KW
232 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
233
27669aa4
FC
234 SvREFCNT(sv)++;
235 return sv;
236}
237PERL_STATIC_INLINE void
c9182d9c 238Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 239{
2439e033 240 if (LIKELY(sv != NULL))
27669aa4
FC
241 SvREFCNT(sv)++;
242}
75e16a44 243PERL_STATIC_INLINE void
c9182d9c 244Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 245{
2439e033 246 if (LIKELY(sv != NULL)) {
75a9bf96 247 U32 rc = SvREFCNT(sv);
79e2a32a 248 if (LIKELY(rc > 1))
75a9bf96
DM
249 SvREFCNT(sv) = rc - 1;
250 else
251 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
252 }
253}
541377b1
FC
254
255PERL_STATIC_INLINE void
c9182d9c 256Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
257{
258 U32 rc = SvREFCNT(sv);
3f2f854a
KW
259
260 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
261
79e2a32a 262 if (LIKELY(rc > 1))
4a9a56a7
DM
263 SvREFCNT(sv) = rc - 1;
264 else
265 Perl_sv_free2(aTHX_ sv, rc);
266}
267
268PERL_STATIC_INLINE void
1bd041dc 269Perl_SvAMAGIC_on(SV *sv)
541377b1 270{
1bd041dc 271 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 272 assert(SvROK(sv));
1bd041dc 273
541377b1
FC
274 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
275}
276PERL_STATIC_INLINE void
1bd041dc 277Perl_SvAMAGIC_off(SV *sv)
541377b1 278{
1bd041dc
KW
279 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
280
541377b1
FC
281 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
282 HvAMAGIC_off(SvSTASH(SvRV(sv)));
283}
284
285PERL_STATIC_INLINE U32
c9182d9c 286Perl_SvPADSTALE_on(SV *sv)
541377b1 287{
c0683843 288 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
289 return SvFLAGS(sv) |= SVs_PADSTALE;
290}
291PERL_STATIC_INLINE U32
c9182d9c 292Perl_SvPADSTALE_off(SV *sv)
541377b1 293{
c0683843 294 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
295 return SvFLAGS(sv) &= ~SVs_PADSTALE;
296}
25fdce4a 297#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 298PERL_STATIC_INLINE STRLEN
6964422a 299S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 300{
25fdce4a 301 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
302 if (SvGAMAGIC(sv)) {
303 U8 *hopped = utf8_hop((U8 *)pv, pos);
304 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
305 return (STRLEN)(hopped - (U8 *)pv);
306 }
307 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
308}
309#endif
f019c49e 310
a8a2ceaa
KW
311/* ------------------------------- utf8.h ------------------------------- */
312
2fe720e2 313/*
70b05c7c 314=for apidoc_section Unicode Support
2fe720e2
KW
315*/
316
55d09dc8 317PERL_STATIC_INLINE void
c9182d9c 318Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
319{
320 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
321 * encoded string at '*dest', updating '*dest' to include it */
322
55d09dc8
KW
323 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
324
6f2d5cbc 325 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 326 *((*dest)++) = byte;
55d09dc8 327 else {
a09ec51a
KW
328 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
329 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
330 }
331}
332
e123187a 333/*
2fe720e2 334=for apidoc valid_utf8_to_uvchr
09232555
KW
335Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
336known that the next character in the input UTF-8 string C<s> is well-formed
337(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
338points, and non-Unicode code points are allowed.
2fe720e2
KW
339
340=cut
341
342 */
343
344PERL_STATIC_INLINE UV
345Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
346{
c41b2540 347 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
348 const U8* send = s + expectlen;
349 UV uv = *s;
350
351 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
352
353 if (retlen) {
354 *retlen = expectlen;
355 }
356
357 /* An invariant is trivially returned */
358 if (expectlen == 1) {
359 return uv;
360 }
361
362 /* Remove the leading bits that indicate the number of bytes, leaving just
363 * the bits that are part of the value */
364 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
365
366 /* Now, loop through the remaining bytes, accumulating each into the
367 * working total as we go. (I khw tried unrolling the loop for up to 4
368 * bytes, but there was no performance improvement) */
369 for (++s; s < send; s++) {
370 uv = UTF8_ACCUMULATE(uv, *s);
371 }
372
373 return UNI_TO_NATIVE(uv);
374
375}
376
1e599354
KW
377/*
378=for apidoc is_utf8_invariant_string
379
82c5d941 380Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 381regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
382EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
383are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
384the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
385characters are invariant, but so also are the C1 controls.
1e599354
KW
386
387If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
388use this option, that C<s> can't have embedded C<NUL> characters and has to
389have a terminating C<NUL> byte).
390
9f2abfde
KW
391See also
392C<L</is_utf8_string>>,
393C<L</is_utf8_string_flags>>,
394C<L</is_utf8_string_loc>>,
395C<L</is_utf8_string_loc_flags>>,
396C<L</is_utf8_string_loclen>>,
397C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
398C<L</is_utf8_fixed_width_buf_flags>>,
399C<L</is_utf8_fixed_width_buf_loc_flags>>,
400C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
401C<L</is_strict_utf8_string>>,
402C<L</is_strict_utf8_string_loc>>,
403C<L</is_strict_utf8_string_loclen>>,
404C<L</is_c9strict_utf8_string>>,
405C<L</is_c9strict_utf8_string_loc>>,
406and
407C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
408
409=cut
0cbf5865
KW
410
411*/
412
413#define is_utf8_invariant_string(s, len) \
414 is_utf8_invariant_string_loc(s, len, NULL)
415
416/*
417=for apidoc is_utf8_invariant_string_loc
418
419Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
420the first UTF-8 variant character in the C<ep> pointer; if all characters are
421UTF-8 invariant, this function does not change the contents of C<*ep>.
422
423=cut
424
1e599354
KW
425*/
426
427PERL_STATIC_INLINE bool
c9182d9c 428Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 429{
e17544a6 430 const U8* send;
1e599354
KW
431 const U8* x = s;
432
0cbf5865
KW
433 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
434
e17544a6
KW
435 if (len == 0) {
436 len = strlen((const char *)s);
437 }
438
439 send = s + len;
440
4ab2fd9b 441/* This looks like 0x010101... */
2c5c8af5 442# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
443
444/* This looks like 0x808080... */
2c5c8af5 445# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 446# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 447# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 448
099e59a4
KW
449/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
450 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
451 * optimized out completely on a 32-bit system, and its mask gets optimized out
452 * on a 64-bit system */
2c5c8af5 453# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
454 | ( PTR2nat(x) >> 1) \
455 | ( ( (PTR2nat(x) \
456 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 457
3f515a2e
KW
458#ifndef EBCDIC
459
099e59a4
KW
460 /* Do the word-at-a-time iff there is at least one usable full word. That
461 * means that after advancing to a word boundary, there still is at least a
462 * full word left. The number of bytes needed to advance is 'wordsize -
463 * offset' unless offset is 0. */
464 if ((STRLEN) (send - x) >= PERL_WORDSIZE
465
466 /* This term is wordsize if subword; 0 if not */
467 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
468
469 /* 'offset' */
470 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
471 {
b40579ff 472
46bb68f6
KW
473 /* Process per-byte until reach word boundary. XXX This loop could be
474 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 475 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
476 if (! UTF8_IS_INVARIANT(*x)) {
477 if (ep) {
478 *ep = x;
479 }
e17544a6 480
46bb68f6
KW
481 return FALSE;
482 }
483 x++;
e17544a6 484 }
e17544a6 485
099e59a4
KW
486 /* Here, we know we have at least one full word to process. Process
487 * per-word as long as we have at least a full word left */
488 do {
4ab2fd9b 489 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 490
46bb68f6
KW
491 /* Found a variant. Just return if caller doesn't want its
492 * exact position */
493 if (! ep) {
494 return FALSE;
495 }
e17544a6 496
2c5c8af5
KW
497# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
498 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 499
73f0a2eb 500 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
501 assert(*ep >= s && *ep < send);
502
503 return FALSE;
504
2c5c8af5 505# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
506 checks. */
507
46bb68f6 508 break;
2c5c8af5 509# endif
46bb68f6 510 }
1d2af574 511
46bb68f6 512 x += PERL_WORDSIZE;
1d2af574 513
099e59a4 514 } while (x + PERL_WORDSIZE <= send);
b40579ff 515 }
e17544a6 516
0b08cab0 517#endif /* End of ! EBCDIC */
e17544a6
KW
518
519 /* Process per-byte */
520 while (x < send) {
521 if (! UTF8_IS_INVARIANT(*x)) {
522 if (ep) {
523 *ep = x;
524 }
0cbf5865 525
e17544a6 526 return FALSE;
0cbf5865 527 }
1e599354 528
e17544a6 529 x++;
1e599354
KW
530 }
531
532 return TRUE;
533}
534
23a7ee81
KW
535#ifndef EBCDIC
536
1d2af574 537PERL_STATIC_INLINE unsigned int
73f0a2eb 538Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
539{
540
541 /* This returns the position in a word (0..7) of the first variant byte in
542 * it. This is a helper function. Note that there are no branches */
543
544 assert(word);
545
546 /* Get just the msb bits of each byte */
547 word &= PERL_VARIANTS_WORD_MASK;
548
7adf2470 549# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
550
551 /* Bytes are stored like
552 * Byte8 ... Byte2 Byte1
553 * 63..56...15...8 7...0
554 *
555 * Isolate the lsb;
556 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
557 *
a3815e44 558 * The word will look like this, with a rightmost set bit in position 's':
1d2af574
KW
559 * ('x's are don't cares)
560 * s
561 * x..x100..0
562 * x..xx10..0 Right shift (rightmost 0 is shifted off)
563 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
564 * the 1 just to their left into a 0; the remainder is
565 * untouched
4fa92663
KW
566 * 0..0011..1 The xor with the original, x..xx10..0, clears that
567 * remainder, sets the bottom to all 1
1d2af574
KW
568 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
569 *
570 * Another method is to do 'word &= -word'; but it generates a compiler
571 * message on some platforms about taking the negative of an unsigned */
572
573 word >>= 1;
574 word = 1 + (word ^ (word - 1));
575
576# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
577
578 /* Bytes are stored like
579 * Byte1 Byte2 ... Byte8
580 * 63..56 55..47 ... 7...0
581 *
582 * Isolate the msb; http://codeforces.com/blog/entry/10330
583 *
584 * Only the most significant set bit matters. Or'ing word with its right
585 * shift of 1 makes that bit and the next one to its right both 1. Then
586 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
587 * msb and all to the right being 1. */
588 word |= word >> 1;
589 word |= word >> 2;
590 word |= word >> 4;
591 word |= word >> 8;
592 word |= word >> 16;
593 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
594
595 /* Then subtracting the right shift by 1 clears all but the left-most of
596 * the 1 bits, which is our desired result */
597 word -= (word >> 1);
598
599# else
600# error Unexpected byte order
601# endif
602
7cf2d6c7
KW
603 /* Here 'word' has a single bit set: the msb of the first byte in which it
604 * is set. Calculate that position in the word. We can use this
1d2af574 605 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
606 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
607 * just get shifted off at compile time) */
608 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
609 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
610 | (39 << 24) | (47 << 16)
611 | (55 << 8) | (63 << 0));
1d2af574
KW
612 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
613
614 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
615 word = ((word + 1) >> 3) - 1;
616
617# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
618
619 /* And invert the result */
620 word = CHARBITS - word - 1;
621
622# endif
623
624 return (unsigned int) word;
625}
626
23a7ee81 627#endif
03c1e4ab
KW
628#if defined(PERL_CORE) || defined(PERL_EXT)
629
630/*
631=for apidoc variant_under_utf8_count
632
633This function looks at the sequence of bytes between C<s> and C<e>, which are
634assumed to be encoded in ASCII/Latin1, and returns how many of them would
635change should the string be translated into UTF-8. Due to the nature of UTF-8,
636each of these would occupy two bytes instead of the single one in the input
637string. Thus, this function returns the precise number of bytes the string
638would expand by when translated to UTF-8.
639
640Unlike most of the other functions that have C<utf8> in their name, the input
641to this function is NOT a UTF-8-encoded string. The function name is slightly
642I<odd> to emphasize this.
643
644This function is internal to Perl because khw thinks that any XS code that
645would want this is probably operating too close to the internals. Presenting a
646valid use case could change that.
647
648See also
649C<L<perlapi/is_utf8_invariant_string>>
650and
651C<L<perlapi/is_utf8_invariant_string_loc>>,
652
653=cut
654
655*/
656
657PERL_STATIC_INLINE Size_t
658S_variant_under_utf8_count(const U8* const s, const U8* const e)
659{
660 const U8* x = s;
661 Size_t count = 0;
662
663 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
664
665# ifndef EBCDIC
666
5d0379de
KW
667 /* Test if the string is long enough to use word-at-a-time. (Logic is the
668 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
669 if ((STRLEN) (e - x) >= PERL_WORDSIZE
670 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
671 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
672 {
673
674 /* Process per-byte until reach word boundary. XXX This loop could be
675 * eliminated if we knew that this platform had fast unaligned reads */
676 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
677 count += ! UTF8_IS_INVARIANT(*x++);
678 }
679
680 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
681 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
682 explanation of how this works */
e5863284
KW
683 PERL_UINTMAX_T increment
684 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
685 * PERL_COUNT_MULTIPLIER)
686 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 687 count += (Size_t) increment;
03c1e4ab
KW
688 x += PERL_WORDSIZE;
689 } while (x + PERL_WORDSIZE <= e);
690 }
691
692# endif
693
694 /* Process per-byte */
695 while (x < e) {
696 if (! UTF8_IS_INVARIANT(*x)) {
697 count++;
698 }
699
700 x++;
701 }
702
703 return count;
704}
705
706#endif
707
aff4cafe
KW
708#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
709# undef PERL_WORDSIZE
710# undef PERL_COUNT_MULTIPLIER
711# undef PERL_WORD_BOUNDARY_MASK
712# undef PERL_VARIANTS_WORD_MASK
713#endif
03c1e4ab 714
7c93d8f0 715/*
5ff889fb
KW
716=for apidoc is_utf8_string
717
82c5d941
KW
718Returns TRUE if the first C<len> bytes of string C<s> form a valid
719Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
720be calculated using C<strlen(s)> (which means if you use this option, that C<s>
721can't have embedded C<NUL> characters and has to have a terminating C<NUL>
722byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
723
2717076a
KW
724This function considers Perl's extended UTF-8 to be valid. That means that
725code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
726considered valid by this function. Use C<L</is_strict_utf8_string>>,
727C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
728code points are considered valid.
5ff889fb 729
9f2abfde
KW
730See also
731C<L</is_utf8_invariant_string>>,
0cbf5865 732C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
733C<L</is_utf8_string_loc>>,
734C<L</is_utf8_string_loclen>>,
8bc127bf
KW
735C<L</is_utf8_fixed_width_buf_flags>>,
736C<L</is_utf8_fixed_width_buf_loc_flags>>,
737C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
738
739=cut
740*/
741
dd237e82 742#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 743
c9cd936b
KW
744#if defined(PERL_CORE) || defined (PERL_EXT)
745
746/*
747=for apidoc is_utf8_non_invariant_string
748
749Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
750C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
751UTF-8; otherwise returns FALSE.
752
753A TRUE return means that at least one code point represented by the sequence
754either is a wide character not representable as a single byte, or the
755representation differs depending on whether the sequence is encoded in UTF-8 or
756not.
757
758See also
759C<L<perlapi/is_utf8_invariant_string>>,
760C<L<perlapi/is_utf8_string>>
761
762=cut
763
764This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
765It generally needn't be if its string is entirely UTF-8 invariant, and it
766shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
767
768It is an internal function because khw thinks that XS code shouldn't be working
769at this low a level. A valid use case could change that.
770
771*/
772
773PERL_STATIC_INLINE bool
86a87e17 774Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
775{
776 const U8 * first_variant;
777
778 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
779
780 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
781 return FALSE;
782 }
783
784 return is_utf8_string(first_variant, len - (first_variant - s));
785}
786
787#endif
788
5ff889fb 789/*
9f2abfde
KW
790=for apidoc is_strict_utf8_string
791
792Returns TRUE if the first C<len> bytes of string C<s> form a valid
793UTF-8-encoded string that is fully interchangeable by any application using
794Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
795calculated using C<strlen(s)> (which means if you use this option, that C<s>
796can't have embedded C<NUL> characters and has to have a terminating C<NUL>
797byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
798
799This function returns FALSE for strings containing any
800code points above the Unicode max of 0x10FFFF, surrogate code points, or
801non-character code points.
802
803See also
804C<L</is_utf8_invariant_string>>,
0cbf5865 805C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
806C<L</is_utf8_string>>,
807C<L</is_utf8_string_flags>>,
808C<L</is_utf8_string_loc>>,
809C<L</is_utf8_string_loc_flags>>,
810C<L</is_utf8_string_loclen>>,
811C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
812C<L</is_utf8_fixed_width_buf_flags>>,
813C<L</is_utf8_fixed_width_buf_loc_flags>>,
814C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
815C<L</is_strict_utf8_string_loc>>,
816C<L</is_strict_utf8_string_loclen>>,
817C<L</is_c9strict_utf8_string>>,
818C<L</is_c9strict_utf8_string_loc>>,
819and
820C<L</is_c9strict_utf8_string_loclen>>.
821
822=cut
823*/
824
dd237e82 825#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
826
827/*
828=for apidoc is_c9strict_utf8_string
829
830Returns TRUE if the first C<len> bytes of string C<s> form a valid
831UTF-8-encoded string that conforms to
832L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
833otherwise it returns FALSE. If C<len> is 0, it will be calculated using
834C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
835C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
836characters being ASCII constitute 'a valid UTF-8 string'.
837
838This function returns FALSE for strings containing any code points above the
839Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
840code points per
841L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
842
843See also
844C<L</is_utf8_invariant_string>>,
0cbf5865 845C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
846C<L</is_utf8_string>>,
847C<L</is_utf8_string_flags>>,
848C<L</is_utf8_string_loc>>,
849C<L</is_utf8_string_loc_flags>>,
850C<L</is_utf8_string_loclen>>,
851C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
852C<L</is_utf8_fixed_width_buf_flags>>,
853C<L</is_utf8_fixed_width_buf_loc_flags>>,
854C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
855C<L</is_strict_utf8_string>>,
856C<L</is_strict_utf8_string_loc>>,
857C<L</is_strict_utf8_string_loclen>>,
858C<L</is_c9strict_utf8_string_loc>>,
859and
860C<L</is_c9strict_utf8_string_loclen>>.
861
862=cut
863*/
864
dd237e82 865#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
866
867/*
868=for apidoc is_utf8_string_flags
869
870Returns TRUE if the first C<len> bytes of string C<s> form a valid
871UTF-8 string, subject to the restrictions imposed by C<flags>;
872returns FALSE otherwise. If C<len> is 0, it will be calculated
873using C<strlen(s)> (which means if you use this option, that C<s> can't have
874embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
875that all characters being ASCII constitute 'a valid UTF-8 string'.
876
877If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
878C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
879as C<L</is_strict_utf8_string>>; and if C<flags> is
880C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
881C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
882combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
883C<L</utf8n_to_uvchr>>, with the same meanings.
884
885See also
886C<L</is_utf8_invariant_string>>,
0cbf5865 887C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
888C<L</is_utf8_string>>,
889C<L</is_utf8_string_loc>>,
890C<L</is_utf8_string_loc_flags>>,
891C<L</is_utf8_string_loclen>>,
892C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
893C<L</is_utf8_fixed_width_buf_flags>>,
894C<L</is_utf8_fixed_width_buf_loc_flags>>,
895C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
896C<L</is_strict_utf8_string>>,
897C<L</is_strict_utf8_string_loc>>,
898C<L</is_strict_utf8_string_loclen>>,
899C<L</is_c9strict_utf8_string>>,
900C<L</is_c9strict_utf8_string_loc>>,
901and
902C<L</is_c9strict_utf8_string_loclen>>.
903
904=cut
905*/
906
907PERL_STATIC_INLINE bool
c9182d9c 908Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 909{
33756530 910 const U8 * first_variant;
9f2abfde
KW
911
912 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
913 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 914 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 915
f60f61fd
KW
916 if (len == 0) {
917 len = strlen((const char *)s);
918 }
919
9f2abfde
KW
920 if (flags == 0) {
921 return is_utf8_string(s, len);
922 }
923
d044b7a7 924 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
925 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
926 {
927 return is_strict_utf8_string(s, len);
928 }
929
d044b7a7 930 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
931 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
932 {
933 return is_c9strict_utf8_string(s, len);
934 }
935
33756530
KW
936 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
937 const U8* const send = s + len;
938 const U8* x = first_variant;
939
a0d7f935
KW
940 while (x < send) {
941 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
942 if (UNLIKELY(! cur_len)) {
943 return FALSE;
944 }
945 x += cur_len;
9f2abfde 946 }
33756530 947 }
9f2abfde
KW
948
949 return TRUE;
950}
951
952/*
5ff889fb
KW
953
954=for apidoc is_utf8_string_loc
955
2717076a 956Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 957case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 958"utf8ness success") in the C<ep> pointer.
5ff889fb 959
2717076a 960See also C<L</is_utf8_string_loclen>>.
5ff889fb 961
3964c812
KW
962=cut
963*/
964
965#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
966
967/*
968
5ff889fb
KW
969=for apidoc is_utf8_string_loclen
970
2717076a 971Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 972case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 973"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 974encoded characters in the C<el> pointer.
5ff889fb 975
2717076a 976See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
977
978=cut
979*/
980
56e4cf64 981PERL_STATIC_INLINE bool
33756530 982Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 983{
33756530 984 const U8 * first_variant;
5ff889fb
KW
985
986 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
987
33756530
KW
988 if (len == 0) {
989 len = strlen((const char *) s);
990 }
991
992 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
993 if (el)
994 *el = len;
995
996 if (ep) {
997 *ep = s + len;
998 }
999
1000 return TRUE;
1001 }
1002
1003 {
1004 const U8* const send = s + len;
1005 const U8* x = first_variant;
1006 STRLEN outlen = first_variant - s;
1007
a0d7f935
KW
1008 while (x < send) {
1009 const STRLEN cur_len = isUTF8_CHAR(x, send);
1010 if (UNLIKELY(! cur_len)) {
1011 break;
1012 }
1013 x += cur_len;
1014 outlen++;
5ff889fb 1015 }
5ff889fb 1016
a0d7f935
KW
1017 if (el)
1018 *el = outlen;
5ff889fb 1019
a0d7f935
KW
1020 if (ep) {
1021 *ep = x;
1022 }
5ff889fb 1023
a0d7f935 1024 return (x == send);
33756530 1025 }
5ff889fb
KW
1026}
1027
1028/*
9f2abfde 1029
44170c9a 1030=for apidoc isUTF8_CHAR
8ed185f9
KW
1031
1032Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1033looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1034that represents some code point; otherwise it evaluates to 0. If non-zero, the
1035value gives how many bytes starting at C<s> comprise the code point's
1036representation. Any bytes remaining before C<e>, but beyond the ones needed to
1037form the first code point in C<s>, are not examined.
1038
13aab5dd 1039The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1040extension to official UTF-8 to represent those higher than the Unicode maximum
1041of 0x10FFFF. That means that this macro is used to efficiently decide if the
1042next few bytes in C<s> is legal UTF-8 for a single character.
1043
1044Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1045defined by Unicode to be fully interchangeable across applications;
1046C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1047#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1048code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1049
1050Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1051C<L</is_utf8_string_loclen>> to check entire strings.
1052
13aab5dd
KW
1053Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1054machines) is a valid UTF-8 character.
8ed185f9
KW
1055
1056=cut
1057
1058This uses an adaptation of the table and algorithm given in
f6521f7c 1059https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1060documentation of the original version. A copyright notice for the original
1061version is given at the beginning of this file. The Perl adapation is
71525f77 1062documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1063
1064*/
1065
1066PERL_STATIC_INLINE Size_t
c9182d9c 1067Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1068{
1069 const U8 * s = s0;
1070 UV state = 0;
1071
1072 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1073
1074 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1075 * code point, which can be returned immediately. Otherwise, it is either
1076 * malformed, or for the start byte FF which the dfa doesn't handle (except
1077 * on 32-bit ASCII platforms where it trivially is an error). Call a
1078 * helper function for the other platforms. */
1079
1080 while (s < e && LIKELY(state != 1)) {
71525f77 1081 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1082 + state
71525f77 1083 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1084 if (state != 0) {
1085 s++;
1086 continue;
1087 }
1088
1089 return s - s0 + 1;
1090 }
1091
1092#if defined(UV_IS_QUAD) || defined(EBCDIC)
1093
1094 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1095 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1096 }
1097
1098#endif
1099
1100 return 0;
1101}
1102
1103/*
1104
67049a5f
KW
1105=for apidoc isSTRICT_UTF8_CHAR
1106
1107Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1108looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1109Unicode code point completely acceptable for open interchange between all
1110applications; otherwise it evaluates to 0. If non-zero, the value gives how
1111many bytes starting at C<s> comprise the code point's representation. Any
1112bytes remaining before C<e>, but beyond the ones needed to form the first code
1113point in C<s>, are not examined.
1114
1115The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1116be a surrogate nor a non-character code point. Thus this excludes any code
1117point from Perl's extended UTF-8.
1118
1119This is used to efficiently decide if the next few bytes in C<s> is
1120legal Unicode-acceptable UTF-8 for a single character.
1121
1122Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1123#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1124code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1125and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1126
1127Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1128C<L</is_strict_utf8_string_loclen>> to check entire strings.
1129
1130=cut
1131
1132This uses an adaptation of the tables and algorithm given in
f6521f7c 1133https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1134documentation of the original version. A copyright notice for the original
1135version is given at the beginning of this file. The Perl adapation is
1136documented at the definition of strict_extended_utf8_dfa_tab[].
1137
1138*/
1139
1140PERL_STATIC_INLINE Size_t
c9182d9c 1141Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1142{
1143 const U8 * s = s0;
1144 UV state = 0;
1145
1146 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1147
1148 while (s < e && LIKELY(state != 1)) {
71525f77 1149 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1150
1151 if (state != 0) {
1152 s++;
1153 continue;
1154 }
1155
1156 return s - s0 + 1;
1157 }
1158
1159#ifndef EBCDIC
1160
1161 /* The dfa above drops out for certain Hanguls; handle them specially */
1162 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1163 return 3;
1164 }
1165
1166#endif
1167
1168 return 0;
1169}
1170
1171/*
1172
44170c9a 1173=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1174
1175Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1176looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1177Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1178the value gives how many bytes starting at C<s> comprise the code point's
1179representation. Any bytes remaining before C<e>, but beyond the ones needed to
1180form the first code point in C<s>, are not examined.
1181
1182The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1183differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1184code points. This corresponds to
1185L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1186which said that non-character code points are merely discouraged rather than
1187completely forbidden in open interchange. See
1188L<perlunicode/Noncharacter code points>.
1189
1190Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1191C<L</isUTF8_CHAR_flags>> for a more customized definition.
1192
1193Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1194C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1195
1196=cut
1197
1198This uses an adaptation of the tables and algorithm given in
f6521f7c 1199https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1200documentation of the original version. A copyright notice for the original
1201version is given at the beginning of this file. The Perl adapation is
71525f77 1202documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1203
1204*/
1205
1206PERL_STATIC_INLINE Size_t
c9182d9c 1207Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1208{
1209 const U8 * s = s0;
1210 UV state = 0;
1211
1212 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1213
1214 while (s < e && LIKELY(state != 1)) {
71525f77 1215 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1216
1217 if (state != 0) {
1218 s++;
1219 continue;
1220 }
1221
1222 return s - s0 + 1;
1223 }
1224
1225 return 0;
1226}
1227
1228/*
1229
9f2abfde
KW
1230=for apidoc is_strict_utf8_string_loc
1231
1232Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1233case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1234"utf8ness success") in the C<ep> pointer.
1235
1236See also C<L</is_strict_utf8_string_loclen>>.
1237
1238=cut
1239*/
1240
1241#define is_strict_utf8_string_loc(s, len, ep) \
1242 is_strict_utf8_string_loclen(s, len, ep, 0)
1243
1244/*
1245
1246=for apidoc is_strict_utf8_string_loclen
1247
1248Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1249case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1250"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1251encoded characters in the C<el> pointer.
1252
1253See also C<L</is_strict_utf8_string_loc>>.
1254
1255=cut
1256*/
1257
1258PERL_STATIC_INLINE bool
c9182d9c 1259Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1260{
33756530 1261 const U8 * first_variant;
9f2abfde
KW
1262
1263 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1264
33756530
KW
1265 if (len == 0) {
1266 len = strlen((const char *) s);
1267 }
1268
1269 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1270 if (el)
1271 *el = len;
1272
1273 if (ep) {
1274 *ep = s + len;
1275 }
1276
1277 return TRUE;
1278 }
1279
1280 {
1281 const U8* const send = s + len;
1282 const U8* x = first_variant;
1283 STRLEN outlen = first_variant - s;
1284
a0d7f935
KW
1285 while (x < send) {
1286 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1287 if (UNLIKELY(! cur_len)) {
1288 break;
1289 }
1290 x += cur_len;
1291 outlen++;
9f2abfde 1292 }
9f2abfde 1293
a0d7f935
KW
1294 if (el)
1295 *el = outlen;
9f2abfde 1296
a0d7f935
KW
1297 if (ep) {
1298 *ep = x;
1299 }
9f2abfde 1300
a0d7f935 1301 return (x == send);
33756530 1302 }
9f2abfde
KW
1303}
1304
1305/*
1306
1307=for apidoc is_c9strict_utf8_string_loc
1308
1309Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1310the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1311"utf8ness success") in the C<ep> pointer.
1312
1313See also C<L</is_c9strict_utf8_string_loclen>>.
1314
1315=cut
1316*/
1317
1318#define is_c9strict_utf8_string_loc(s, len, ep) \
1319 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1320
1321/*
1322
1323=for apidoc is_c9strict_utf8_string_loclen
1324
1325Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1326the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1327"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1328characters in the C<el> pointer.
1329
1330See also C<L</is_c9strict_utf8_string_loc>>.
1331
1332=cut
1333*/
1334
1335PERL_STATIC_INLINE bool
c9182d9c 1336Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1337{
33756530 1338 const U8 * first_variant;
9f2abfde
KW
1339
1340 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1341
33756530
KW
1342 if (len == 0) {
1343 len = strlen((const char *) s);
1344 }
1345
1346 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1347 if (el)
1348 *el = len;
1349
1350 if (ep) {
1351 *ep = s + len;
1352 }
1353
1354 return TRUE;
1355 }
1356
1357 {
1358 const U8* const send = s + len;
1359 const U8* x = first_variant;
1360 STRLEN outlen = first_variant - s;
1361
a0d7f935
KW
1362 while (x < send) {
1363 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1364 if (UNLIKELY(! cur_len)) {
1365 break;
1366 }
1367 x += cur_len;
1368 outlen++;
9f2abfde 1369 }
9f2abfde 1370
a0d7f935
KW
1371 if (el)
1372 *el = outlen;
9f2abfde 1373
a0d7f935
KW
1374 if (ep) {
1375 *ep = x;
1376 }
9f2abfde 1377
a0d7f935 1378 return (x == send);
33756530 1379 }
9f2abfde
KW
1380}
1381
1382/*
1383
1384=for apidoc is_utf8_string_loc_flags
1385
1386Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1387case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1388"utf8ness success") in the C<ep> pointer.
1389
1390See also C<L</is_utf8_string_loclen_flags>>.
1391
1392=cut
1393*/
1394
1395#define is_utf8_string_loc_flags(s, len, ep, flags) \
1396 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1397
1398
1399/* The above 3 actual functions could have been moved into the more general one
1400 * just below, and made #defines that call it with the right 'flags'. They are
1401 * currently kept separate to increase their chances of getting inlined */
1402
1403/*
1404
1405=for apidoc is_utf8_string_loclen_flags
1406
1407Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1408case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1409"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1410encoded characters in the C<el> pointer.
1411
1412See also C<L</is_utf8_string_loc_flags>>.
1413
1414=cut
1415*/
1416
1417PERL_STATIC_INLINE bool
c9182d9c 1418Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1419{
33756530 1420 const U8 * first_variant;
9f2abfde
KW
1421
1422 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1423 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1424 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1425
f60f61fd 1426 if (len == 0) {
a0d7f935 1427 len = strlen((const char *) s);
f60f61fd
KW
1428 }
1429
9f2abfde
KW
1430 if (flags == 0) {
1431 return is_utf8_string_loclen(s, len, ep, el);
1432 }
1433
d044b7a7 1434 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1435 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1436 {
1437 return is_strict_utf8_string_loclen(s, len, ep, el);
1438 }
1439
d044b7a7 1440 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1441 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1442 {
1443 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1444 }
1445
33756530
KW
1446 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1447 if (el)
1448 *el = len;
1449
1450 if (ep) {
1451 *ep = s + len;
1452 }
1453
1454 return TRUE;
1455 }
1456
1457 {
1458 const U8* send = s + len;
1459 const U8* x = first_variant;
1460 STRLEN outlen = first_variant - s;
1461
a0d7f935
KW
1462 while (x < send) {
1463 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1464 if (UNLIKELY(! cur_len)) {
1465 break;
1466 }
1467 x += cur_len;
1468 outlen++;
9f2abfde 1469 }
9f2abfde 1470
a0d7f935
KW
1471 if (el)
1472 *el = outlen;
9f2abfde 1473
a0d7f935
KW
1474 if (ep) {
1475 *ep = x;
1476 }
9f2abfde 1477
a0d7f935 1478 return (x == send);
33756530 1479 }
9f2abfde
KW
1480}
1481
1482/*
7c93d8f0
KW
1483=for apidoc utf8_distance
1484
1485Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1486and C<b>.
1487
1488WARNING: use only if you *know* that the pointers point inside the
1489same UTF-8 buffer.
1490
1491=cut
1492*/
1493
1494PERL_STATIC_INLINE IV
1495Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1496{
1497 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1498
1499 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1500}
1501
1502/*
1503=for apidoc utf8_hop
1504
1505Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1506forward or backward.
1507
1508WARNING: do not use the following unless you *know* C<off> is within
1509the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1510on the first byte of character or just after the last byte of a character.
1511
1512=cut
1513*/
1514
1515PERL_STATIC_INLINE U8 *
1516Perl_utf8_hop(const U8 *s, SSize_t off)
1517{
1518 PERL_ARGS_ASSERT_UTF8_HOP;
1519
1520 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1521 * the bitops (especially ~) can create illegal UTF-8.
1522 * In other words: in Perl UTF-8 is not just for Unicode. */
1523
1524 if (off >= 0) {
1525 while (off--)
1526 s += UTF8SKIP(s);
1527 }
1528 else {
1529 while (off++) {
1530 s--;
1531 while (UTF8_IS_CONTINUATION(*s))
1532 s--;
1533 }
1534 }
e099ea69 1535 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1536 return (U8 *)s;
e099ea69 1537 GCC_DIAG_RESTORE
7c93d8f0
KW
1538}
1539
4dab108f 1540/*
65df57a8
TC
1541=for apidoc utf8_hop_forward
1542
1543Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1544forward.
1545
1546C<off> must be non-negative.
1547
1548C<s> must be before or equal to C<end>.
1549
1550When moving forward it will not move beyond C<end>.
1551
1552Will not exceed this limit even if the string is not valid "UTF-8".
1553
1554=cut
1555*/
1556
1557PERL_STATIC_INLINE U8 *
1558Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1559{
1560 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1561
1562 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1563 * the bitops (especially ~) can create illegal UTF-8.
1564 * In other words: in Perl UTF-8 is not just for Unicode. */
1565
1566 assert(s <= end);
1567 assert(off >= 0);
1568
1569 while (off--) {
1570 STRLEN skip = UTF8SKIP(s);
de979548 1571 if ((STRLEN)(end - s) <= skip) {
e099ea69 1572 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1573 return (U8 *)end;
e099ea69 1574 GCC_DIAG_RESTORE
de979548 1575 }
65df57a8
TC
1576 s += skip;
1577 }
1578
e099ea69 1579 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1580 return (U8 *)s;
e099ea69 1581 GCC_DIAG_RESTORE
65df57a8
TC
1582}
1583
1584/*
1585=for apidoc utf8_hop_back
1586
1587Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1588backward.
1589
1590C<off> must be non-positive.
1591
1592C<s> must be after or equal to C<start>.
1593
1594When moving backward it will not move before C<start>.
1595
1596Will not exceed this limit even if the string is not valid "UTF-8".
1597
1598=cut
1599*/
1600
1601PERL_STATIC_INLINE U8 *
1602Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1603{
1604 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1605
1606 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1607 * the bitops (especially ~) can create illegal UTF-8.
1608 * In other words: in Perl UTF-8 is not just for Unicode. */
1609
1610 assert(start <= s);
1611 assert(off <= 0);
1612
1613 while (off++ && s > start) {
e7185695 1614 do {
65df57a8 1615 s--;
e7185695 1616 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 1617 }
f6521f7c 1618
e099ea69 1619 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1620 return (U8 *)s;
e099ea69 1621 GCC_DIAG_RESTORE
65df57a8
TC
1622}
1623
1624/*
1625=for apidoc utf8_hop_safe
1626
1627Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1628either forward or backward.
1629
1630When moving backward it will not move before C<start>.
1631
1632When moving forward it will not move beyond C<end>.
1633
1634Will not exceed those limits even if the string is not valid "UTF-8".
1635
1636=cut
1637*/
1638
1639PERL_STATIC_INLINE U8 *
1640Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1641{
1642 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1643
1644 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1645 * the bitops (especially ~) can create illegal UTF-8.
1646 * In other words: in Perl UTF-8 is not just for Unicode. */
1647
1648 assert(start <= s && s <= end);
1649
1650 if (off >= 0) {
1651 return utf8_hop_forward(s, off, end);
1652 }
1653 else {
1654 return utf8_hop_back(s, off, start);
1655 }
1656}
1657
1658/*
4dab108f
KW
1659
1660=for apidoc is_utf8_valid_partial_char
1661
6cbb9248
KW
1662Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1663S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1664points. Otherwise, it returns 1 if there exists at least one non-empty
1665sequence of bytes that when appended to sequence C<s>, starting at position
1666C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1667otherwise returns 0.
1668
1669In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1670point.
1671
1672This is useful when a fixed-length buffer is being tested for being well-formed
1673UTF-8, but the final few bytes in it don't comprise a full character; that is,
1674it is split somewhere in the middle of the final code point's UTF-8
1675representation. (Presumably when the buffer is refreshed with the next chunk
1676of data, the new first bytes will complete the partial code point.) This
1677function is used to verify that the final bytes in the current buffer are in
1678fact the legal beginning of some code point, so that if they aren't, the
1679failure can be signalled without having to wait for the next read.
4dab108f
KW
1680
1681=cut
1682*/
2717076a
KW
1683#define is_utf8_valid_partial_char(s, e) \
1684 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1685
1686/*
1687
1688=for apidoc is_utf8_valid_partial_char_flags
1689
1690Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1691or not the input is a valid UTF-8 encoded partial character, but it takes an
1692extra parameter, C<flags>, which can further restrict which code points are
1693considered valid.
1694
1695If C<flags> is 0, this behaves identically to
1696C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1697of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1698there is any sequence of bytes that can complete the input partial character in
1699such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1700TRUE; otherwise FALSE. Non character code points cannot be determined based on
1701partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1702determined from just the first one or two bytes.
1703
1704=cut
1705 */
1706
56e4cf64 1707PERL_STATIC_INLINE bool
c9182d9c 1708Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1709{
f1c999a7 1710 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1711
f1c999a7 1712 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1713 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1714
8875bd48 1715 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1716 return FALSE;
1717 }
1718
1376b35c 1719 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1720}
1721
8bc127bf
KW
1722/*
1723
1724=for apidoc is_utf8_fixed_width_buf_flags
1725
1726Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1727is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1728otherwise it returns FALSE.
1729
1730If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1731without restriction. If the final few bytes of the buffer do not form a
1732complete code point, this will return TRUE anyway, provided that
1733C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1734
1735If C<flags> in non-zero, it can be any combination of the
1736C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1737same meanings.
1738
1739This function differs from C<L</is_utf8_string_flags>> only in that the latter
1740returns FALSE if the final few bytes of the string don't form a complete code
1741point.
1742
1743=cut
1744 */
1745#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1746 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1747
1748/*
1749
1750=for apidoc is_utf8_fixed_width_buf_loc_flags
1751
1752Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1753failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1754to the beginning of any partial character at the end of the buffer; if there is
1755no partial character C<*ep> will contain C<s>+C<len>.
1756
1757See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1758
1759=cut
1760*/
1761
1762#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1763 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1764
1765/*
1766
1767=for apidoc is_utf8_fixed_width_buf_loclen_flags
1768
1769Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1770complete, valid characters found in the C<el> pointer.
1771
1772=cut
1773*/
1774
1775PERL_STATIC_INLINE bool
c9182d9c 1776Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1777 STRLEN len,
8bc127bf
KW
1778 const U8 **ep,
1779 STRLEN *el,
1780 const U32 flags)
1781{
1782 const U8 * maybe_partial;
1783
1784 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1785
1786 if (! ep) {
1787 ep = &maybe_partial;
1788 }
1789
1790 /* If it's entirely valid, return that; otherwise see if the only error is
1791 * that the final few bytes are for a partial character */
1792 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1793 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1794}
1795
e6a4ffc3 1796PERL_STATIC_INLINE UV
c9182d9c 1797Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1798 STRLEN curlen,
1799 STRLEN *retlen,
1800 const U32 flags,
1801 U32 * errors,
1802 AV ** msgs)
1803{
1804 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1805 * simple cases, and, if necessary calls a helper function to deal with the
1806 * more complex ones. Almost all well-formed non-problematic code points
1807 * are considered simple, so that it's unlikely that the helper function
1808 * will need to be called.
1809 *
1810 * This is an adaptation of the tables and algorithm given in
f6521f7c 1811 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
1812 * comprehensive documentation of the original version. A copyright notice
1813 * for the original version is given at the beginning of this file. The
71525f77 1814 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1815 */
1816
1817 const U8 * const s0 = s;
1818 const U8 * send = s0 + curlen;
f73cfe7d 1819 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1820 UV state = 0;
1821
1822 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1823
1824 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1825 * non-problematic code point, which can be returned immediately.
1826 * Otherwise we call a helper function to figure out the more complicated
1827 * cases. */
1828
1829 while (s < send && LIKELY(state != 1)) {
71525f77 1830 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1831
1832 uv = (state == 0)
1833 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1834 : UTF8_ACCUMULATE(uv, *s);
71525f77 1835 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1836
1837 if (state != 0) {
1838 s++;
1839 continue;
1840 }
1841
1842 if (retlen) {
1843 *retlen = s - s0 + 1;
1844 }
1845 if (errors) {
1846 *errors = 0;
1847 }
1848 if (msgs) {
1849 *msgs = NULL;
1850 }
1851
cfebc7aa 1852 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1853 }
1854
1855 /* Here is potentially problematic. Use the full mechanism */
1856 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1857}
1858
82651abe 1859PERL_STATIC_INLINE UV
9a9a6c98 1860Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1861{
9a9a6c98 1862 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1863
1864 assert(s < send);
1865
1866 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1867
1868 /* EMPTY is not really allowed, and asserts on debugging builds. But
1869 * on non-debugging we have to deal with it, and this causes it to
1870 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1871 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1872 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1873 }
1874 else {
1875 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1876 if (retlen && ret == 0 && *s != '\0') {
1877 *retlen = (STRLEN) -1;
1878 }
1879
1880 return ret;
1881 }
1882}
1883
c8028aa6
TC
1884/* ------------------------------- perl.h ----------------------------- */
1885
1886/*
51b56f5c 1887=for apidoc_section Utility Functions
dcccc8ff 1888
44170c9a 1889=for apidoc is_safe_syscall
c8028aa6 1890
1a0efc9a
KW
1891Test that the given C<pv> (with length C<len>) doesn't contain any internal
1892C<NUL> characters.
1893If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1894category, and return FALSE.
c8028aa6
TC
1895
1896Return TRUE if the name is safe.
1897
1a0efc9a
KW
1898C<what> and C<op_name> are used in any warning.
1899
796b6530 1900Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1901
1902=cut
1903*/
1904
1905PERL_STATIC_INLINE bool
ffd62fc2
KW
1906Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1907{
c8028aa6
TC
1908 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1909 * perl itself uses xce*() functions which accept 8-bit strings.
1910 */
1911
1912 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1913
6c4650b3 1914 if (len > 1) {
c8028aa6 1915 char *null_at;
41188aa0 1916 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1917 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1918 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1919 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1920 what, op_name, pv, null_at+1);
c8028aa6
TC
1921 return FALSE;
1922 }
1923 }
1924
1925 return TRUE;
1926}
1927
1928/*
7cb3f959
TC
1929
1930Return true if the supplied filename has a newline character
fa6c7d00 1931immediately before the first (hopefully only) NUL.
7cb3f959
TC
1932
1933My original look at this incorrectly used the len from SvPV(), but
1934that's incorrect, since we allow for a NUL in pv[len-1].
1935
1936So instead, strlen() and work from there.
1937
1938This allow for the user reading a filename, forgetting to chomp it,
1939then calling:
1940
1941 open my $foo, "$file\0";
1942
1943*/
1944
1945#ifdef PERL_CORE
1946
1947PERL_STATIC_INLINE bool
ffd62fc2
KW
1948S_should_warn_nl(const char *pv)
1949{
7cb3f959
TC
1950 STRLEN len;
1951
1952 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1953
1954 len = strlen(pv);
1955
1956 return len > 0 && pv[len-1] == '\n';
1957}
1958
1959#endif
1960
3a019afd
KW
1961#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1962
1963PERL_STATIC_INLINE bool
1964S_lossless_NV_to_IV(const NV nv, IV *ivp)
1965{
1966 /* This function determines if the input NV 'nv' may be converted without
1967 * loss of data to an IV. If not, it returns FALSE taking no other action.
1968 * But if it is possible, it does the conversion, returning TRUE, and
1969 * storing the converted result in '*ivp' */
1970
1971 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1972
cd304e76
DM
1973# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
1974 /* Normally any comparison with a NaN returns false; if we can't rely
1975 * on that behaviour, check explicitly */
3a019afd
KW
1976 if (UNLIKELY(Perl_isnan(nv))) {
1977 return FALSE;
1978 }
3a019afd
KW
1979# endif
1980
cd304e76
DM
1981 /* Written this way so that with an always-false NaN comparison we
1982 * return false */
1983 if (!(LIKELY(nv >= IV_MIN) && LIKELY(nv <= IV_MAX))) {
3a019afd
KW
1984 return FALSE;
1985 }
1986
1987 if ((IV) nv != nv) {
1988 return FALSE;
1989 }
1990
1991 *ivp = (IV) nv;
1992 return TRUE;
1993}
1994
1995#endif
1996
14fe5f8a
KW
1997/* ------------------ regcomp.c, toke.c ------------ */
1998
1999#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
2000
2001/*
2002 - regcurly - a little FSA that accepts {\d+,?\d*}
2003 Pulled from reg.c.
2004 */
6bbb95d4 2005PERL_STATIC_INLINE bool
14fe5f8a
KW
2006S_regcurly(const char *s)
2007{
2008 PERL_ARGS_ASSERT_REGCURLY;
2009
2010 if (*s++ != '{')
2011 return FALSE;
2012 if (!isDIGIT(*s))
2013 return FALSE;
2014 while (isDIGIT(*s))
2015 s++;
2016 if (*s == ',') {
2017 s++;
2018 while (isDIGIT(*s))
2019 s++;
2020 }
2021
2022 return *s == '}';
2023}
2024
2025#endif
2026
81d52ecd
JH
2027/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2028
94b0cb42
KW
2029#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2030
81d52ecd
JH
2031#define MAX_CHARSET_NAME_LENGTH 2
2032
2033PERL_STATIC_INLINE const char *
94b0cb42 2034S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2035{
94b0cb42
KW
2036 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2037
81d52ecd
JH
2038 /* Returns a string that corresponds to the name of the regex character set
2039 * given by 'flags', and *lenp is set the length of that string, which
2040 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2041
2042 *lenp = 1;
2043 switch (get_regex_charset(flags)) {
2044 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2045 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2046 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2047 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2048 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2049 *lenp = 2;
2050 return ASCII_MORE_RESTRICT_PAT_MODS;
2051 }
2052 /* The NOT_REACHED; hides an assert() which has a rather complex
2053 * definition in perl.h. */
2054 NOT_REACHED; /* NOTREACHED */
2055 return "?"; /* Unknown */
2056}
2057
94b0cb42
KW
2058#endif
2059
7cb3f959 2060/*
ed382232
TC
2061
2062Return false if any get magic is on the SV other than taint magic.
2063
2064*/
2065
2066PERL_STATIC_INLINE bool
ffd62fc2
KW
2067Perl_sv_only_taint_gmagic(SV *sv)
2068{
ed382232
TC
2069 MAGIC *mg = SvMAGIC(sv);
2070
2071 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2072
2073 while (mg) {
2074 if (mg->mg_type != PERL_MAGIC_taint
2075 && !(mg->mg_flags & MGf_GSKIP)
2076 && mg->mg_virtual->svt_get) {
2077 return FALSE;
2078 }
2079 mg = mg->mg_moremagic;
2080 }
2081
2082 return TRUE;
2083}
2084
ed8ff0f3
DM
2085/* ------------------ cop.h ------------------------------------------- */
2086
5b6f7443
DM
2087/* implement GIMME_V() macro */
2088
2089PERL_STATIC_INLINE U8
2090Perl_gimme_V(pTHX)
2091{
2092 I32 cxix;
2093 U8 gimme = (PL_op->op_flags & OPf_WANT);
2094
2095 if (gimme)
2096 return gimme;
2097 cxix = PL_curstackinfo->si_cxsubix;
2098 if (cxix < 0)
390fe0c0 2099 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
5b6f7443
DM
2100 assert(cxstack[cxix].blk_gimme & G_WANT);
2101 return (cxstack[cxix].blk_gimme & G_WANT);
2102}
2103
ed8ff0f3
DM
2104
2105/* Enter a block. Push a new base context and return its address. */
2106
2107PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2108Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2109{
2110 PERL_CONTEXT * cx;
2111
2112 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2113
2114 CXINC;
2115 cx = CX_CUR();
2116 cx->cx_type = type;
2117 cx->blk_gimme = gimme;
2118 cx->blk_oldsaveix = saveix;
4caf7d8c 2119 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2120 cx->blk_oldcop = PL_curcop;
4caf7d8c 2121 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2122 cx->blk_oldscopesp = PL_scopestack_ix;
2123 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2124 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2125
2126 PL_tmps_floor = PL_tmps_ix;
2127 CX_DEBUG(cx, "PUSH");
2128 return cx;
2129}
2130
2131
2132/* Exit a block (RETURN and LAST). */
2133
2134PERL_STATIC_INLINE void
c9182d9c 2135Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2136{
2137 PERL_ARGS_ASSERT_CX_POPBLOCK;
2138
2139 CX_DEBUG(cx, "POP");
2140 /* these 3 are common to cx_popblock and cx_topblock */
2141 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2142 PL_scopestack_ix = cx->blk_oldscopesp;
2143 PL_curpm = cx->blk_oldpm;
2144
2145 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2146 * and leaves a CX entry lying around for repeated use, so
2147 * skip for multicall */ \
2148 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2149 || PL_savestack_ix == cx->blk_oldsaveix);
2150 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2151 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2152}
2153
2154/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2155 * Whereas cx_popblock() restores the state to the point just before
2156 * cx_pushblock() was called, cx_topblock() restores it to the point just
2157 * *after* cx_pushblock() was called. */
2158
2159PERL_STATIC_INLINE void
c9182d9c 2160Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2161{
2162 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2163
2164 CX_DEBUG(cx, "TOP");
2165 /* these 3 are common to cx_popblock and cx_topblock */
2166 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2167 PL_scopestack_ix = cx->blk_oldscopesp;
2168 PL_curpm = cx->blk_oldpm;
2169
2170 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2171}
2172
2173
a73d8813 2174PERL_STATIC_INLINE void
c9182d9c 2175Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2176{
2177 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2178
2179 PERL_ARGS_ASSERT_CX_PUSHSUB;
2180
3f6bd23a 2181 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2182 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2183 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2184 cx->blk_sub.cv = cv;
2185 cx->blk_sub.olddepth = CvDEPTH(cv);
2186 cx->blk_sub.prevcomppad = PL_comppad;
2187 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2188 cx->blk_sub.retop = retop;
2189 SvREFCNT_inc_simple_void_NN(cv);
2190 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2191}
2192
2193
2194/* subsets of cx_popsub() */
2195
2196PERL_STATIC_INLINE void
c9182d9c 2197Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2198{
2199 CV *cv;
2200
2201 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2202 assert(CxTYPE(cx) == CXt_SUB);
2203
2204 PL_comppad = cx->blk_sub.prevcomppad;
2205 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2206 cv = cx->blk_sub.cv;
2207 CvDEPTH(cv) = cx->blk_sub.olddepth;
2208 cx->blk_sub.cv = NULL;
2209 SvREFCNT_dec(cv);
5b6f7443 2210 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2211}
2212
2213
2214/* handle the @_ part of leaving a sub */
2215
2216PERL_STATIC_INLINE void
c9182d9c 2217Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2218{
2219 AV *av;
2220
2221 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2222 assert(CxTYPE(cx) == CXt_SUB);
2223 assert(AvARRAY(MUTABLE_AV(
2224 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2225 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2226
2227 CX_POP_SAVEARRAY(cx);
2228 av = MUTABLE_AV(PAD_SVl(0));
2229 if (UNLIKELY(AvREAL(av)))
2230 /* abandon @_ if it got reified */
2231 clear_defarray(av, 0);
2232 else {
2233 CLEAR_ARGARRAY(av);
2234 }
2235}
2236
2237
2238PERL_STATIC_INLINE void
c9182d9c 2239Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2240{
2241 PERL_ARGS_ASSERT_CX_POPSUB;
2242 assert(CxTYPE(cx) == CXt_SUB);
2243
3f6bd23a 2244 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2245
2246 if (CxHASARGS(cx))
2247 cx_popsub_args(cx);
2248 cx_popsub_common(cx);
2249}
2250
2251
6a7d52cc 2252PERL_STATIC_INLINE void
c9182d9c 2253Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2254{
2255 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2256
5b6f7443
DM
2257 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2258 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2259 cx->blk_format.cv = cv;
2260 cx->blk_format.retop = retop;
2261 cx->blk_format.gv = gv;
2262 cx->blk_format.dfoutgv = PL_defoutgv;
2263 cx->blk_format.prevcomppad = PL_comppad;
2264 cx->blk_u16 = 0;
2265
2266 SvREFCNT_inc_simple_void_NN(cv);
2267 CvDEPTH(cv)++;
2268 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2269}
2270
2271
2272PERL_STATIC_INLINE void
c9182d9c 2273Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2274{
2275 CV *cv;
2276 GV *dfout;
2277
2278 PERL_ARGS_ASSERT_CX_POPFORMAT;
2279 assert(CxTYPE(cx) == CXt_FORMAT);
2280
2281 dfout = cx->blk_format.dfoutgv;
2282 setdefout(dfout);
2283 cx->blk_format.dfoutgv = NULL;
2284 SvREFCNT_dec_NN(dfout);
2285
2286 PL_comppad = cx->blk_format.prevcomppad;
2287 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2288 cv = cx->blk_format.cv;
2289 cx->blk_format.cv = NULL;
2290 --CvDEPTH(cv);
2291 SvREFCNT_dec_NN(cv);
5b6f7443 2292 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2293}
2294
2295
13febba5 2296PERL_STATIC_INLINE void
c9182d9c 2297Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2298{
2299 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2300
5b6f7443
DM
2301 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2302 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
13febba5
DM
2303 cx->blk_eval.retop = retop;
2304 cx->blk_eval.old_namesv = namesv;
2305 cx->blk_eval.old_eval_root = PL_eval_root;
2306 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2307 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2308 cx->blk_eval.cur_top_env = PL_top_env;
2309
4c57ced5 2310 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2311 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2312 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2313}
2314
2315
2316PERL_STATIC_INLINE void
c9182d9c 2317Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2318{
2319 SV *sv;
2320
2321 PERL_ARGS_ASSERT_CX_POPEVAL;
2322 assert(CxTYPE(cx) == CXt_EVAL);
2323
2324 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2325 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2326 PL_eval_root = cx->blk_eval.old_eval_root;
2327 sv = cx->blk_eval.cur_text;
4c57ced5 2328 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2329 cx->blk_eval.cur_text = NULL;
2330 SvREFCNT_dec_NN(sv);
2331 }
2332
2333 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2334 if (sv) {
2335 cx->blk_eval.old_namesv = NULL;
2336 SvREFCNT_dec_NN(sv);
2337 }
5b6f7443 2338 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2339}
6a7d52cc 2340
a73d8813 2341
d1b6bf72
DM
2342/* push a plain loop, i.e.
2343 * { block }
2344 * while (cond) { block }
2345 * for (init;cond;continue) { block }
2346 * This loop can be last/redo'ed etc.
2347 */
2348
2349PERL_STATIC_INLINE void
c9182d9c 2350Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2351{
2352 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2353 cx->blk_loop.my_op = cLOOP;
2354}
2355
2356
2357/* push a true for loop, i.e.
2358 * for var (list) { block }
2359 */
2360
2361PERL_STATIC_INLINE void
c9182d9c 2362Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2363{
2364 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2365
2366 /* this one line is common with cx_pushloop_plain */
2367 cx->blk_loop.my_op = cLOOP;
2368
2369 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2370 cx->blk_loop.itersave = itersave;
2371#ifdef USE_ITHREADS
2372 cx->blk_loop.oldcomppad = PL_comppad;
2373#endif
2374}
2375
2376
2377/* pop all loop types, including plain */
2378
2379PERL_STATIC_INLINE void
c9182d9c 2380Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2381{
2382 PERL_ARGS_ASSERT_CX_POPLOOP;
2383
2384 assert(CxTYPE_is_LOOP(cx));
2385 if ( CxTYPE(cx) == CXt_LOOP_ARY
2386 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2387 {
2388 /* Free ary or cur. This assumes that state_u.ary.ary
2389 * aligns with state_u.lazysv.cur. See cx_dup() */
2390 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2391 cx->blk_loop.state_u.lazysv.cur = NULL;
2392 SvREFCNT_dec_NN(sv);
2393 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2394 sv = cx->blk_loop.state_u.lazysv.end;
2395 cx->blk_loop.state_u.lazysv.end = NULL;
2396 SvREFCNT_dec_NN(sv);
2397 }
2398 }
2399 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2400 SV *cursv;
2401 SV **svp = (cx)->blk_loop.itervar_u.svp;
2402 if ((cx->cx_type & CXp_FOR_GV))
2403 svp = &GvSV((GV*)svp);
2404 cursv = *svp;
2405 *svp = cx->blk_loop.itersave;
2406 cx->blk_loop.itersave = NULL;
2407 SvREFCNT_dec(cursv);
2408 }
2409}
2410
2a7b7c61
DM
2411
2412PERL_STATIC_INLINE void
c9182d9c 2413Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2414{
7896dde7 2415 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2416
7896dde7 2417 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2418}
2419
2420
2421PERL_STATIC_INLINE void
c9182d9c 2422Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2423{
7896dde7
Z
2424 PERL_ARGS_ASSERT_CX_POPWHEN;
2425 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2426
2427 PERL_UNUSED_ARG(cx);
59a14f30 2428 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2429 /* currently NOOP */
2430}
2431
2432
7896dde7 2433PERL_STATIC_INLINE void
c9182d9c 2434Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2435{
2436 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2437
2438 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2439 cx->blk_givwhen.defsv_save = orig_defsv;
2440}
2441
2442
2443PERL_STATIC_INLINE void
c9182d9c 2444Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2445{
2446 SV *sv;
2447
2448 PERL_ARGS_ASSERT_CX_POPGIVEN;
2449 assert(CxTYPE(cx) == CXt_GIVEN);
2450
2451 sv = GvSV(PL_defgv);
2452 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2453 cx->blk_givwhen.defsv_save = NULL;
2454 SvREFCNT_dec(sv);
2455}
2456
ec2c235b
KW
2457/* ------------------ util.h ------------------------------------------- */
2458
2459/*
51b56f5c 2460=for apidoc_section String Handling
ec2c235b
KW
2461
2462=for apidoc foldEQ
2463
2464Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2465same
2466case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2467match themselves and their opposite case counterparts. Non-cased and non-ASCII
2468range bytes match only themselves.
2469
2470=cut
2471*/
2472
2473PERL_STATIC_INLINE I32
2474Perl_foldEQ(const char *s1, const char *s2, I32 len)
2475{
2476 const U8 *a = (const U8 *)s1;
2477 const U8 *b = (const U8 *)s2;
2478
2479 PERL_ARGS_ASSERT_FOLDEQ;
2480
2481 assert(len >= 0);
2482
2483 while (len--) {
2484 if (*a != *b && *a != PL_fold[*b])
2485 return 0;
2486 a++,b++;
2487 }
2488 return 1;
2489}
2490
0f9cb40c 2491PERL_STATIC_INLINE I32
ec2c235b
KW
2492Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2493{
79a1fabd
KW
2494 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2495 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2496 * does not check for this. Nor does it check that the strings each have
2497 * at least 'len' characters. */
ec2c235b
KW
2498
2499 const U8 *a = (const U8 *)s1;
2500 const U8 *b = (const U8 *)s2;
2501
2502 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2503
2504 assert(len >= 0);
2505
2506 while (len--) {
2507 if (*a != *b && *a != PL_fold_latin1[*b]) {
2508 return 0;
2509 }
2510 a++, b++;
2511 }
2512 return 1;
2513}
2514
2515/*
51b56f5c 2516=for apidoc_section Locales
ec2c235b
KW
2517=for apidoc foldEQ_locale
2518
2519Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2520same case-insensitively in the current locale; false otherwise.
2521
2522=cut
2523*/
2524
0f9cb40c 2525PERL_STATIC_INLINE I32
ec2c235b
KW
2526Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2527{
ec2c235b
KW
2528 const U8 *a = (const U8 *)s1;
2529 const U8 *b = (const U8 *)s2;
2530
2531 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2532
2533 assert(len >= 0);
2534
2535 while (len--) {
2536 if (*a != *b && *a != PL_fold_locale[*b])
2537 return 0;
2538 a++,b++;
2539 }
2540 return 1;
2541}
2542
1ab100a8 2543/*
51b56f5c 2544=for apidoc_section String Handling
1ab100a8
KW
2545=for apidoc my_strnlen
2546
2547The C library C<strnlen> if available, or a Perl implementation of it.
2548
2549C<my_strnlen()> computes the length of the string, up to C<maxlen>
a3815e44 2550characters. It will never attempt to address more than C<maxlen>
1ab100a8
KW
2551characters, making it suitable for use with strings that are not
2552guaranteed to be NUL-terminated.
2553
2554=cut
2555
2556Description stolen from http://man.openbsd.org/strnlen.3,
2557implementation stolen from PostgreSQL.
2558*/
2559#ifndef HAS_STRNLEN
2560
2561PERL_STATIC_INLINE Size_t
2562Perl_my_strnlen(const char *str, Size_t maxlen)
2563{
2564 const char *end = (char *) memchr(str, '\0', maxlen);
2565
2566 PERL_ARGS_ASSERT_MY_STRNLEN;
2567
2568 if (end == NULL) return maxlen;
2569 return end - str;
2570}
2571
2572#endif
2573
6dba01e2
KW
2574#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2575
2576PERL_STATIC_INLINE void *
2577S_my_memrchr(const char * s, const char c, const STRLEN len)
2578{
2579 /* memrchr(), since many platforms lack it */
2580
2581 const char * t = s + len - 1;
2582
2583 PERL_ARGS_ASSERT_MY_MEMRCHR;
2584
2585 while (t >= s) {
2586 if (*t == c) {
2587 return (void *) t;
2588 }
2589 t--;
2590 }
2591
2592 return NULL;
2593}
2594
2595#endif
2596
24f3e849
KW
2597PERL_STATIC_INLINE char *
2598Perl_mortal_getenv(const char * str)
2599{
2600 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
2601 *
2602 * It's (mostly) thread-safe because it uses a mutex to prevent
2603 * simultaneous access from other threads that use the same mutex, and
2604 * makes a copy of the result before releasing that mutex. All of the Perl
2605 * core uses that mutex, but, like all mutexes, everything has to cooperate
2606 * for it to completely work. It is possible for code from, say XS, to not
2607 * use this mutex, defeating the safety.
2608 *
2609 * On some platforms, getenv() is not sequential-call-safe, because
2610 * subsequent calls destroy the static storage inside the C library
2611 * returned by an earlier call. The result must be copied or completely
2612 * acted upon before a subsequent getenv call. Those calls could come from
2613 * another thread. Again, making a copy while controlling the mutex
2614 * prevents these problems..
2615 *
2616 * To prevent leaks, the copy is made by creating a new SV containing it,
2617 * mortalizing the SV, and returning the SV's string (the copy). Thus this
2618 * is a drop-in replacement for getenv().
2619 *
2620 * A complication is that this can be called during phases where the
2621 * mortalization process isn't available. These are in interpreter
2622 * destruction or early in construction. khw believes that at these times
2623 * there shouldn't be anything else going on, so plain getenv is safe AS
2624 * LONG AS the caller acts on the return before calling it again. */
2625
2626 char * ret;
2627 dTHX;
2628
2629 PERL_ARGS_ASSERT_MORTAL_GETENV;
2630
2631 /* Can't mortalize without stacks. khw believes that no other threads
2632 * should be running, so no need to lock things, and this may be during a
2633 * phase when locking isn't even available */
2634 if (UNLIKELY(PL_scopestack_ix == 0)) {
2635 return getenv(str);
2636 }
2637
2638 ENV_LOCK;
2639
2640 ret = getenv(str);
2641
2642 if (ret != NULL) {
2643 ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
2644 }
2645
2646 ENV_UNLOCK;
2647 return ret;
2648}
2649
ed382232 2650/*
c8028aa6
TC
2651 * ex: set ts=8 sts=4 sw=4 et:
2652 */