This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add thread safety to some environment accesses
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
c70927a6 42PERL_STATIC_INLINE SSize_t
c9182d9c 43Perl_av_top_index(pTHX_ AV *av)
be3a7a5d
KW
44{
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
47
48 return AvFILL(av);
49}
50
1afe1db1
FC
51/* ------------------------------- cv.h ------------------------------- */
52
ae77754a 53PERL_STATIC_INLINE GV *
c9182d9c 54Perl_CvGV(pTHX_ CV *sv)
ae77754a 55{
74804ad1
KW
56 PERL_ARGS_ASSERT_CVGV;
57
ae77754a
FC
58 return CvNAMED(sv)
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
61}
62
1afe1db1 63PERL_STATIC_INLINE I32 *
74804ad1 64Perl_CvDEPTH(const CV * const sv)
1afe1db1 65{
74804ad1 66 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 68
8de47657 69 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
70}
71
d16269d8
PM
72/*
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
81 */
82
5b67adb8 83#ifdef PERL_CORE
d16269d8
PM
84PERL_STATIC_INLINE char *
85S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
86{
87 SV * tmpsv;
88 char * tmps;
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
90 tmps = SvPVX(tmpsv);
91 while ((*len)--) {
92 if (!isSPACE(*orig))
93 *tmps++ = *orig;
94 orig++;
95 }
96 *tmps = '\0';
97 *len = tmps - SvPVX(tmpsv);
98 return SvPVX(tmpsv);
99}
5b67adb8 100#endif
d16269d8 101
25fdce4a
FC
102/* ------------------------------- mg.h ------------------------------- */
103
104#if defined(PERL_CORE) || defined(PERL_EXT)
105/* assumes get-magic and stringification have already occurred */
106PERL_STATIC_INLINE STRLEN
107S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
108{
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
113 else {
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
118 }
119}
120#endif
121
03414f05
FC
122/* ------------------------------- pad.h ------------------------------ */
123
124#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125PERL_STATIC_INLINE bool
b9d5702c 126S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 127{
b9d5702c
KW
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
129
03414f05
FC
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
135
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
138 if (
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
142 )
143 return TRUE;
144 }
145 else if (
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
147 ?
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
150
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
153 )
154 return TRUE;
155 return FALSE;
156}
157#endif
158
33a4312b
FC
159/* ------------------------------- pp.h ------------------------------- */
160
161PERL_STATIC_INLINE I32
c9182d9c 162Perl_TOPMARK(pTHX)
33a4312b
FC
163{
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 165 "MARK top %p %" IVdf "\n",
33a4312b
FC
166 PL_markstack_ptr,
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
169}
170
171PERL_STATIC_INLINE I32
c9182d9c 172Perl_POPMARK(pTHX)
33a4312b
FC
173{
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 175 "MARK pop %p %" IVdf "\n",
33a4312b
FC
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
180}
181
8d919b0a
FC
182/* ----------------------------- regexp.h ----------------------------- */
183
184PERL_STATIC_INLINE struct regexp *
c9182d9c 185Perl_ReANY(const REGEXP * const re)
8d919b0a 186{
df6b4bd5 187 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
188
189 PERL_ARGS_ASSERT_REANY;
8d919b0a 190 assert(isREGEXP(re));
bdef45de 191
df6b4bd5
DM
192 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
193 : (struct regexp *)p;
8d919b0a
FC
194}
195
27669aa4
FC
196/* ------------------------------- sv.h ------------------------------- */
197
a887b094 198PERL_STATIC_INLINE bool
9d0469db
PLE
199Perl_SvTRUE(pTHX_ SV *sv) {
200 if (!LIKELY(sv))
201 return FALSE;
7b9b9e32
PLE
202 SvGETMAGIC(sv);
203 return SvTRUE_nomg_NN(sv);
a887b094
PLE
204}
205
27669aa4 206PERL_STATIC_INLINE SV *
c9182d9c 207Perl_SvREFCNT_inc(SV *sv)
27669aa4 208{
2439e033 209 if (LIKELY(sv != NULL))
27669aa4
FC
210 SvREFCNT(sv)++;
211 return sv;
212}
213PERL_STATIC_INLINE SV *
c9182d9c 214Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 215{
3f2f854a
KW
216 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
217
27669aa4
FC
218 SvREFCNT(sv)++;
219 return sv;
220}
221PERL_STATIC_INLINE void
c9182d9c 222Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 223{
2439e033 224 if (LIKELY(sv != NULL))
27669aa4
FC
225 SvREFCNT(sv)++;
226}
75e16a44 227PERL_STATIC_INLINE void
c9182d9c 228Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 229{
2439e033 230 if (LIKELY(sv != NULL)) {
75a9bf96 231 U32 rc = SvREFCNT(sv);
79e2a32a 232 if (LIKELY(rc > 1))
75a9bf96
DM
233 SvREFCNT(sv) = rc - 1;
234 else
235 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
236 }
237}
541377b1
FC
238
239PERL_STATIC_INLINE void
c9182d9c 240Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
241{
242 U32 rc = SvREFCNT(sv);
3f2f854a
KW
243
244 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
245
79e2a32a 246 if (LIKELY(rc > 1))
4a9a56a7
DM
247 SvREFCNT(sv) = rc - 1;
248 else
249 Perl_sv_free2(aTHX_ sv, rc);
250}
251
252PERL_STATIC_INLINE void
1bd041dc 253Perl_SvAMAGIC_on(SV *sv)
541377b1 254{
1bd041dc 255 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 256 assert(SvROK(sv));
1bd041dc 257
541377b1
FC
258 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
259}
260PERL_STATIC_INLINE void
1bd041dc 261Perl_SvAMAGIC_off(SV *sv)
541377b1 262{
1bd041dc
KW
263 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
264
541377b1
FC
265 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
266 HvAMAGIC_off(SvSTASH(SvRV(sv)));
267}
268
269PERL_STATIC_INLINE U32
c9182d9c 270Perl_SvPADSTALE_on(SV *sv)
541377b1 271{
c0683843 272 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
273 return SvFLAGS(sv) |= SVs_PADSTALE;
274}
275PERL_STATIC_INLINE U32
c9182d9c 276Perl_SvPADSTALE_off(SV *sv)
541377b1 277{
c0683843 278 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
279 return SvFLAGS(sv) &= ~SVs_PADSTALE;
280}
25fdce4a 281#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 282PERL_STATIC_INLINE STRLEN
6964422a 283S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 284{
25fdce4a 285 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
286 if (SvGAMAGIC(sv)) {
287 U8 *hopped = utf8_hop((U8 *)pv, pos);
288 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
289 return (STRLEN)(hopped - (U8 *)pv);
290 }
291 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
292}
293#endif
f019c49e 294
d1decf2b
TC
295/* ------------------------------- handy.h ------------------------------- */
296
297/* saves machine code for a common noreturn idiom typically used in Newx*() */
7347ee54 298GCC_DIAG_IGNORE_DECL(-Wunused-function);
d1decf2b 299static void
c9182d9c 300Perl_croak_memory_wrap(void)
d1decf2b
TC
301{
302 Perl_croak_nocontext("%s",PL_memory_wrap);
303}
7347ee54 304GCC_DIAG_RESTORE_DECL;
d1decf2b 305
a8a2ceaa
KW
306/* ------------------------------- utf8.h ------------------------------- */
307
2fe720e2
KW
308/*
309=head1 Unicode Support
310*/
311
55d09dc8 312PERL_STATIC_INLINE void
c9182d9c 313Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
314{
315 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
316 * encoded string at '*dest', updating '*dest' to include it */
317
55d09dc8
KW
318 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
319
6f2d5cbc 320 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 321 *((*dest)++) = byte;
55d09dc8 322 else {
a09ec51a
KW
323 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
324 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
325 }
326}
327
e123187a 328/*
2fe720e2 329=for apidoc valid_utf8_to_uvchr
09232555
KW
330Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
331known that the next character in the input UTF-8 string C<s> is well-formed
332(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
333points, and non-Unicode code points are allowed.
2fe720e2
KW
334
335=cut
336
337 */
338
339PERL_STATIC_INLINE UV
340Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
341{
c41b2540 342 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
343 const U8* send = s + expectlen;
344 UV uv = *s;
345
346 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
347
348 if (retlen) {
349 *retlen = expectlen;
350 }
351
352 /* An invariant is trivially returned */
353 if (expectlen == 1) {
354 return uv;
355 }
356
357 /* Remove the leading bits that indicate the number of bytes, leaving just
358 * the bits that are part of the value */
359 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
360
361 /* Now, loop through the remaining bytes, accumulating each into the
362 * working total as we go. (I khw tried unrolling the loop for up to 4
363 * bytes, but there was no performance improvement) */
364 for (++s; s < send; s++) {
365 uv = UTF8_ACCUMULATE(uv, *s);
366 }
367
368 return UNI_TO_NATIVE(uv);
369
370}
371
1e599354
KW
372/*
373=for apidoc is_utf8_invariant_string
374
82c5d941 375Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 376regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
377EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
378are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
379the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
380characters are invariant, but so also are the C1 controls.
1e599354
KW
381
382If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
383use this option, that C<s> can't have embedded C<NUL> characters and has to
384have a terminating C<NUL> byte).
385
9f2abfde
KW
386See also
387C<L</is_utf8_string>>,
388C<L</is_utf8_string_flags>>,
389C<L</is_utf8_string_loc>>,
390C<L</is_utf8_string_loc_flags>>,
391C<L</is_utf8_string_loclen>>,
392C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
393C<L</is_utf8_fixed_width_buf_flags>>,
394C<L</is_utf8_fixed_width_buf_loc_flags>>,
395C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
396C<L</is_strict_utf8_string>>,
397C<L</is_strict_utf8_string_loc>>,
398C<L</is_strict_utf8_string_loclen>>,
399C<L</is_c9strict_utf8_string>>,
400C<L</is_c9strict_utf8_string_loc>>,
401and
402C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
403
404=cut
0cbf5865
KW
405
406*/
407
408#define is_utf8_invariant_string(s, len) \
409 is_utf8_invariant_string_loc(s, len, NULL)
410
411/*
412=for apidoc is_utf8_invariant_string_loc
413
414Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
415the first UTF-8 variant character in the C<ep> pointer; if all characters are
416UTF-8 invariant, this function does not change the contents of C<*ep>.
417
418=cut
419
1e599354
KW
420*/
421
422PERL_STATIC_INLINE bool
c9182d9c 423Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 424{
e17544a6 425 const U8* send;
1e599354
KW
426 const U8* x = s;
427
0cbf5865
KW
428 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
429
e17544a6
KW
430 if (len == 0) {
431 len = strlen((const char *)s);
432 }
433
434 send = s + len;
435
4ab2fd9b 436/* This looks like 0x010101... */
2c5c8af5 437# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
438
439/* This looks like 0x808080... */
2c5c8af5 440# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 441# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 442# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 443
099e59a4
KW
444/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
445 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
446 * optimized out completely on a 32-bit system, and its mask gets optimized out
447 * on a 64-bit system */
2c5c8af5 448# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
449 | ( PTR2nat(x) >> 1) \
450 | ( ( (PTR2nat(x) \
451 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 452
3f515a2e
KW
453#ifndef EBCDIC
454
099e59a4
KW
455 /* Do the word-at-a-time iff there is at least one usable full word. That
456 * means that after advancing to a word boundary, there still is at least a
457 * full word left. The number of bytes needed to advance is 'wordsize -
458 * offset' unless offset is 0. */
459 if ((STRLEN) (send - x) >= PERL_WORDSIZE
460
461 /* This term is wordsize if subword; 0 if not */
462 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
463
464 /* 'offset' */
465 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
466 {
b40579ff 467
46bb68f6
KW
468 /* Process per-byte until reach word boundary. XXX This loop could be
469 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 470 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
471 if (! UTF8_IS_INVARIANT(*x)) {
472 if (ep) {
473 *ep = x;
474 }
e17544a6 475
46bb68f6
KW
476 return FALSE;
477 }
478 x++;
e17544a6 479 }
e17544a6 480
099e59a4
KW
481 /* Here, we know we have at least one full word to process. Process
482 * per-word as long as we have at least a full word left */
483 do {
4ab2fd9b 484 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 485
46bb68f6
KW
486 /* Found a variant. Just return if caller doesn't want its
487 * exact position */
488 if (! ep) {
489 return FALSE;
490 }
e17544a6 491
2c5c8af5
KW
492# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
493 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 494
73f0a2eb 495 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
496 assert(*ep >= s && *ep < send);
497
498 return FALSE;
499
2c5c8af5 500# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
501 checks. */
502
46bb68f6 503 break;
2c5c8af5 504# endif
46bb68f6 505 }
1d2af574 506
46bb68f6 507 x += PERL_WORDSIZE;
1d2af574 508
099e59a4 509 } while (x + PERL_WORDSIZE <= send);
b40579ff 510 }
e17544a6 511
0b08cab0 512#endif /* End of ! EBCDIC */
e17544a6
KW
513
514 /* Process per-byte */
515 while (x < send) {
516 if (! UTF8_IS_INVARIANT(*x)) {
517 if (ep) {
518 *ep = x;
519 }
0cbf5865 520
e17544a6 521 return FALSE;
0cbf5865 522 }
1e599354 523
e17544a6 524 x++;
1e599354
KW
525 }
526
527 return TRUE;
528}
529
23a7ee81
KW
530#ifndef EBCDIC
531
1d2af574 532PERL_STATIC_INLINE unsigned int
73f0a2eb 533Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
534{
535
536 /* This returns the position in a word (0..7) of the first variant byte in
537 * it. This is a helper function. Note that there are no branches */
538
539 assert(word);
540
541 /* Get just the msb bits of each byte */
542 word &= PERL_VARIANTS_WORD_MASK;
543
7adf2470 544# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
545
546 /* Bytes are stored like
547 * Byte8 ... Byte2 Byte1
548 * 63..56...15...8 7...0
549 *
550 * Isolate the lsb;
551 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
552 *
553 * The word will look this this, with a rightmost set bit in position 's':
554 * ('x's are don't cares)
555 * s
556 * x..x100..0
557 * x..xx10..0 Right shift (rightmost 0 is shifted off)
558 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
559 * the 1 just to their left into a 0; the remainder is
560 * untouched
4fa92663
KW
561 * 0..0011..1 The xor with the original, x..xx10..0, clears that
562 * remainder, sets the bottom to all 1
1d2af574
KW
563 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
564 *
565 * Another method is to do 'word &= -word'; but it generates a compiler
566 * message on some platforms about taking the negative of an unsigned */
567
568 word >>= 1;
569 word = 1 + (word ^ (word - 1));
570
571# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
572
573 /* Bytes are stored like
574 * Byte1 Byte2 ... Byte8
575 * 63..56 55..47 ... 7...0
576 *
577 * Isolate the msb; http://codeforces.com/blog/entry/10330
578 *
579 * Only the most significant set bit matters. Or'ing word with its right
580 * shift of 1 makes that bit and the next one to its right both 1. Then
581 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
582 * msb and all to the right being 1. */
583 word |= word >> 1;
584 word |= word >> 2;
585 word |= word >> 4;
586 word |= word >> 8;
587 word |= word >> 16;
588 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
589
590 /* Then subtracting the right shift by 1 clears all but the left-most of
591 * the 1 bits, which is our desired result */
592 word -= (word >> 1);
593
594# else
595# error Unexpected byte order
596# endif
597
7cf2d6c7
KW
598 /* Here 'word' has a single bit set: the msb of the first byte in which it
599 * is set. Calculate that position in the word. We can use this
1d2af574 600 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
601 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
602 * just get shifted off at compile time) */
603 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
604 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
605 | (39 << 24) | (47 << 16)
606 | (55 << 8) | (63 << 0));
1d2af574
KW
607 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
608
609 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
610 word = ((word + 1) >> 3) - 1;
611
612# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
613
614 /* And invert the result */
615 word = CHARBITS - word - 1;
616
617# endif
618
619 return (unsigned int) word;
620}
621
23a7ee81 622#endif
03c1e4ab
KW
623#if defined(PERL_CORE) || defined(PERL_EXT)
624
625/*
626=for apidoc variant_under_utf8_count
627
628This function looks at the sequence of bytes between C<s> and C<e>, which are
629assumed to be encoded in ASCII/Latin1, and returns how many of them would
630change should the string be translated into UTF-8. Due to the nature of UTF-8,
631each of these would occupy two bytes instead of the single one in the input
632string. Thus, this function returns the precise number of bytes the string
633would expand by when translated to UTF-8.
634
635Unlike most of the other functions that have C<utf8> in their name, the input
636to this function is NOT a UTF-8-encoded string. The function name is slightly
637I<odd> to emphasize this.
638
639This function is internal to Perl because khw thinks that any XS code that
640would want this is probably operating too close to the internals. Presenting a
641valid use case could change that.
642
643See also
644C<L<perlapi/is_utf8_invariant_string>>
645and
646C<L<perlapi/is_utf8_invariant_string_loc>>,
647
648=cut
649
650*/
651
652PERL_STATIC_INLINE Size_t
653S_variant_under_utf8_count(const U8* const s, const U8* const e)
654{
655 const U8* x = s;
656 Size_t count = 0;
657
658 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
659
660# ifndef EBCDIC
661
5d0379de
KW
662 /* Test if the string is long enough to use word-at-a-time. (Logic is the
663 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
664 if ((STRLEN) (e - x) >= PERL_WORDSIZE
665 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
666 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
667 {
668
669 /* Process per-byte until reach word boundary. XXX This loop could be
670 * eliminated if we knew that this platform had fast unaligned reads */
671 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
672 count += ! UTF8_IS_INVARIANT(*x++);
673 }
674
675 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
676 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
677 explanation of how this works */
e5863284
KW
678 PERL_UINTMAX_T increment
679 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
680 * PERL_COUNT_MULTIPLIER)
681 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 682 count += (Size_t) increment;
03c1e4ab
KW
683 x += PERL_WORDSIZE;
684 } while (x + PERL_WORDSIZE <= e);
685 }
686
687# endif
688
689 /* Process per-byte */
690 while (x < e) {
691 if (! UTF8_IS_INVARIANT(*x)) {
692 count++;
693 }
694
695 x++;
696 }
697
698 return count;
699}
700
701#endif
702
aff4cafe
KW
703#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
704# undef PERL_WORDSIZE
705# undef PERL_COUNT_MULTIPLIER
706# undef PERL_WORD_BOUNDARY_MASK
707# undef PERL_VARIANTS_WORD_MASK
708#endif
03c1e4ab 709
7c93d8f0 710/*
5ff889fb
KW
711=for apidoc is_utf8_string
712
82c5d941
KW
713Returns TRUE if the first C<len> bytes of string C<s> form a valid
714Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
715be calculated using C<strlen(s)> (which means if you use this option, that C<s>
716can't have embedded C<NUL> characters and has to have a terminating C<NUL>
717byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
718
2717076a
KW
719This function considers Perl's extended UTF-8 to be valid. That means that
720code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
721considered valid by this function. Use C<L</is_strict_utf8_string>>,
722C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
723code points are considered valid.
5ff889fb 724
9f2abfde
KW
725See also
726C<L</is_utf8_invariant_string>>,
0cbf5865 727C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
728C<L</is_utf8_string_loc>>,
729C<L</is_utf8_string_loclen>>,
8bc127bf
KW
730C<L</is_utf8_fixed_width_buf_flags>>,
731C<L</is_utf8_fixed_width_buf_loc_flags>>,
732C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
733
734=cut
735*/
736
dd237e82 737#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 738
c9cd936b
KW
739#if defined(PERL_CORE) || defined (PERL_EXT)
740
741/*
742=for apidoc is_utf8_non_invariant_string
743
744Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
745C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
746UTF-8; otherwise returns FALSE.
747
748A TRUE return means that at least one code point represented by the sequence
749either is a wide character not representable as a single byte, or the
750representation differs depending on whether the sequence is encoded in UTF-8 or
751not.
752
753See also
754C<L<perlapi/is_utf8_invariant_string>>,
755C<L<perlapi/is_utf8_string>>
756
757=cut
758
759This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
760It generally needn't be if its string is entirely UTF-8 invariant, and it
761shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
762
763It is an internal function because khw thinks that XS code shouldn't be working
764at this low a level. A valid use case could change that.
765
766*/
767
768PERL_STATIC_INLINE bool
86a87e17 769Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
770{
771 const U8 * first_variant;
772
773 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
774
775 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
776 return FALSE;
777 }
778
779 return is_utf8_string(first_variant, len - (first_variant - s));
780}
781
782#endif
783
5ff889fb 784/*
9f2abfde
KW
785=for apidoc is_strict_utf8_string
786
787Returns TRUE if the first C<len> bytes of string C<s> form a valid
788UTF-8-encoded string that is fully interchangeable by any application using
789Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
790calculated using C<strlen(s)> (which means if you use this option, that C<s>
791can't have embedded C<NUL> characters and has to have a terminating C<NUL>
792byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
793
794This function returns FALSE for strings containing any
795code points above the Unicode max of 0x10FFFF, surrogate code points, or
796non-character code points.
797
798See also
799C<L</is_utf8_invariant_string>>,
0cbf5865 800C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
801C<L</is_utf8_string>>,
802C<L</is_utf8_string_flags>>,
803C<L</is_utf8_string_loc>>,
804C<L</is_utf8_string_loc_flags>>,
805C<L</is_utf8_string_loclen>>,
806C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
807C<L</is_utf8_fixed_width_buf_flags>>,
808C<L</is_utf8_fixed_width_buf_loc_flags>>,
809C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
810C<L</is_strict_utf8_string_loc>>,
811C<L</is_strict_utf8_string_loclen>>,
812C<L</is_c9strict_utf8_string>>,
813C<L</is_c9strict_utf8_string_loc>>,
814and
815C<L</is_c9strict_utf8_string_loclen>>.
816
817=cut
818*/
819
dd237e82 820#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
821
822/*
823=for apidoc is_c9strict_utf8_string
824
825Returns TRUE if the first C<len> bytes of string C<s> form a valid
826UTF-8-encoded string that conforms to
827L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
828otherwise it returns FALSE. If C<len> is 0, it will be calculated using
829C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
830C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
831characters being ASCII constitute 'a valid UTF-8 string'.
832
833This function returns FALSE for strings containing any code points above the
834Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
835code points per
836L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
837
838See also
839C<L</is_utf8_invariant_string>>,
0cbf5865 840C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
841C<L</is_utf8_string>>,
842C<L</is_utf8_string_flags>>,
843C<L</is_utf8_string_loc>>,
844C<L</is_utf8_string_loc_flags>>,
845C<L</is_utf8_string_loclen>>,
846C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
847C<L</is_utf8_fixed_width_buf_flags>>,
848C<L</is_utf8_fixed_width_buf_loc_flags>>,
849C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
850C<L</is_strict_utf8_string>>,
851C<L</is_strict_utf8_string_loc>>,
852C<L</is_strict_utf8_string_loclen>>,
853C<L</is_c9strict_utf8_string_loc>>,
854and
855C<L</is_c9strict_utf8_string_loclen>>.
856
857=cut
858*/
859
dd237e82 860#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
861
862/*
863=for apidoc is_utf8_string_flags
864
865Returns TRUE if the first C<len> bytes of string C<s> form a valid
866UTF-8 string, subject to the restrictions imposed by C<flags>;
867returns FALSE otherwise. If C<len> is 0, it will be calculated
868using C<strlen(s)> (which means if you use this option, that C<s> can't have
869embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
870that all characters being ASCII constitute 'a valid UTF-8 string'.
871
872If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
873C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
874as C<L</is_strict_utf8_string>>; and if C<flags> is
875C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
876C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
877combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
878C<L</utf8n_to_uvchr>>, with the same meanings.
879
880See also
881C<L</is_utf8_invariant_string>>,
0cbf5865 882C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
883C<L</is_utf8_string>>,
884C<L</is_utf8_string_loc>>,
885C<L</is_utf8_string_loc_flags>>,
886C<L</is_utf8_string_loclen>>,
887C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
888C<L</is_utf8_fixed_width_buf_flags>>,
889C<L</is_utf8_fixed_width_buf_loc_flags>>,
890C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
891C<L</is_strict_utf8_string>>,
892C<L</is_strict_utf8_string_loc>>,
893C<L</is_strict_utf8_string_loclen>>,
894C<L</is_c9strict_utf8_string>>,
895C<L</is_c9strict_utf8_string_loc>>,
896and
897C<L</is_c9strict_utf8_string_loclen>>.
898
899=cut
900*/
901
902PERL_STATIC_INLINE bool
c9182d9c 903Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 904{
33756530 905 const U8 * first_variant;
9f2abfde
KW
906
907 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
908 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 909 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 910
f60f61fd
KW
911 if (len == 0) {
912 len = strlen((const char *)s);
913 }
914
9f2abfde
KW
915 if (flags == 0) {
916 return is_utf8_string(s, len);
917 }
918
d044b7a7 919 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
920 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
921 {
922 return is_strict_utf8_string(s, len);
923 }
924
d044b7a7 925 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
926 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
927 {
928 return is_c9strict_utf8_string(s, len);
929 }
930
33756530
KW
931 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
932 const U8* const send = s + len;
933 const U8* x = first_variant;
934
a0d7f935
KW
935 while (x < send) {
936 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
937 if (UNLIKELY(! cur_len)) {
938 return FALSE;
939 }
940 x += cur_len;
9f2abfde 941 }
33756530 942 }
9f2abfde
KW
943
944 return TRUE;
945}
946
947/*
5ff889fb
KW
948
949=for apidoc is_utf8_string_loc
950
2717076a 951Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 952case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 953"utf8ness success") in the C<ep> pointer.
5ff889fb 954
2717076a 955See also C<L</is_utf8_string_loclen>>.
5ff889fb 956
3964c812
KW
957=cut
958*/
959
960#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
961
962/*
963
5ff889fb
KW
964=for apidoc is_utf8_string_loclen
965
2717076a 966Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 967case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 968"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 969encoded characters in the C<el> pointer.
5ff889fb 970
2717076a 971See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
972
973=cut
974*/
975
56e4cf64 976PERL_STATIC_INLINE bool
33756530 977Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 978{
33756530 979 const U8 * first_variant;
5ff889fb
KW
980
981 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
982
33756530
KW
983 if (len == 0) {
984 len = strlen((const char *) s);
985 }
986
987 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
988 if (el)
989 *el = len;
990
991 if (ep) {
992 *ep = s + len;
993 }
994
995 return TRUE;
996 }
997
998 {
999 const U8* const send = s + len;
1000 const U8* x = first_variant;
1001 STRLEN outlen = first_variant - s;
1002
a0d7f935
KW
1003 while (x < send) {
1004 const STRLEN cur_len = isUTF8_CHAR(x, send);
1005 if (UNLIKELY(! cur_len)) {
1006 break;
1007 }
1008 x += cur_len;
1009 outlen++;
5ff889fb 1010 }
5ff889fb 1011
a0d7f935
KW
1012 if (el)
1013 *el = outlen;
5ff889fb 1014
a0d7f935
KW
1015 if (ep) {
1016 *ep = x;
1017 }
5ff889fb 1018
a0d7f935 1019 return (x == send);
33756530 1020 }
5ff889fb
KW
1021}
1022
1023/*
9f2abfde 1024
44170c9a 1025=for apidoc isUTF8_CHAR
8ed185f9
KW
1026
1027Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1028looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1029that represents some code point; otherwise it evaluates to 0. If non-zero, the
1030value gives how many bytes starting at C<s> comprise the code point's
1031representation. Any bytes remaining before C<e>, but beyond the ones needed to
1032form the first code point in C<s>, are not examined.
1033
13aab5dd 1034The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1035extension to official UTF-8 to represent those higher than the Unicode maximum
1036of 0x10FFFF. That means that this macro is used to efficiently decide if the
1037next few bytes in C<s> is legal UTF-8 for a single character.
1038
1039Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1040defined by Unicode to be fully interchangeable across applications;
1041C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1042#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1043code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1044
1045Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1046C<L</is_utf8_string_loclen>> to check entire strings.
1047
13aab5dd
KW
1048Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1049machines) is a valid UTF-8 character.
8ed185f9
KW
1050
1051=cut
1052
1053This uses an adaptation of the table and algorithm given in
f6521f7c 1054https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1055documentation of the original version. A copyright notice for the original
1056version is given at the beginning of this file. The Perl adapation is
71525f77 1057documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1058
1059*/
1060
1061PERL_STATIC_INLINE Size_t
c9182d9c 1062Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1063{
1064 const U8 * s = s0;
1065 UV state = 0;
1066
1067 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1068
1069 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1070 * code point, which can be returned immediately. Otherwise, it is either
1071 * malformed, or for the start byte FF which the dfa doesn't handle (except
1072 * on 32-bit ASCII platforms where it trivially is an error). Call a
1073 * helper function for the other platforms. */
1074
1075 while (s < e && LIKELY(state != 1)) {
71525f77 1076 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1077 + state
71525f77 1078 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1079 if (state != 0) {
1080 s++;
1081 continue;
1082 }
1083
1084 return s - s0 + 1;
1085 }
1086
1087#if defined(UV_IS_QUAD) || defined(EBCDIC)
1088
1089 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1090 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1091 }
1092
1093#endif
1094
1095 return 0;
1096}
1097
1098/*
1099
67049a5f
KW
1100=for apidoc isSTRICT_UTF8_CHAR
1101
1102Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1103looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1104Unicode code point completely acceptable for open interchange between all
1105applications; otherwise it evaluates to 0. If non-zero, the value gives how
1106many bytes starting at C<s> comprise the code point's representation. Any
1107bytes remaining before C<e>, but beyond the ones needed to form the first code
1108point in C<s>, are not examined.
1109
1110The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1111be a surrogate nor a non-character code point. Thus this excludes any code
1112point from Perl's extended UTF-8.
1113
1114This is used to efficiently decide if the next few bytes in C<s> is
1115legal Unicode-acceptable UTF-8 for a single character.
1116
1117Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1118#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1119code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1120and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1121
1122Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1123C<L</is_strict_utf8_string_loclen>> to check entire strings.
1124
1125=cut
1126
1127This uses an adaptation of the tables and algorithm given in
f6521f7c 1128https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1129documentation of the original version. A copyright notice for the original
1130version is given at the beginning of this file. The Perl adapation is
1131documented at the definition of strict_extended_utf8_dfa_tab[].
1132
1133*/
1134
1135PERL_STATIC_INLINE Size_t
c9182d9c 1136Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1137{
1138 const U8 * s = s0;
1139 UV state = 0;
1140
1141 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1142
1143 while (s < e && LIKELY(state != 1)) {
71525f77 1144 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1145
1146 if (state != 0) {
1147 s++;
1148 continue;
1149 }
1150
1151 return s - s0 + 1;
1152 }
1153
1154#ifndef EBCDIC
1155
1156 /* The dfa above drops out for certain Hanguls; handle them specially */
1157 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1158 return 3;
1159 }
1160
1161#endif
1162
1163 return 0;
1164}
1165
1166/*
1167
44170c9a 1168=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1169
1170Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1171looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1172Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1173the value gives how many bytes starting at C<s> comprise the code point's
1174representation. Any bytes remaining before C<e>, but beyond the ones needed to
1175form the first code point in C<s>, are not examined.
1176
1177The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1178differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1179code points. This corresponds to
1180L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1181which said that non-character code points are merely discouraged rather than
1182completely forbidden in open interchange. See
1183L<perlunicode/Noncharacter code points>.
1184
1185Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1186C<L</isUTF8_CHAR_flags>> for a more customized definition.
1187
1188Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1189C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1190
1191=cut
1192
1193This uses an adaptation of the tables and algorithm given in
f6521f7c 1194https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1195documentation of the original version. A copyright notice for the original
1196version is given at the beginning of this file. The Perl adapation is
71525f77 1197documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1198
1199*/
1200
1201PERL_STATIC_INLINE Size_t
c9182d9c 1202Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1203{
1204 const U8 * s = s0;
1205 UV state = 0;
1206
1207 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1208
1209 while (s < e && LIKELY(state != 1)) {
71525f77 1210 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1211
1212 if (state != 0) {
1213 s++;
1214 continue;
1215 }
1216
1217 return s - s0 + 1;
1218 }
1219
1220 return 0;
1221}
1222
1223/*
1224
9f2abfde
KW
1225=for apidoc is_strict_utf8_string_loc
1226
1227Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1228case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1229"utf8ness success") in the C<ep> pointer.
1230
1231See also C<L</is_strict_utf8_string_loclen>>.
1232
1233=cut
1234*/
1235
1236#define is_strict_utf8_string_loc(s, len, ep) \
1237 is_strict_utf8_string_loclen(s, len, ep, 0)
1238
1239/*
1240
1241=for apidoc is_strict_utf8_string_loclen
1242
1243Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1244case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1245"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1246encoded characters in the C<el> pointer.
1247
1248See also C<L</is_strict_utf8_string_loc>>.
1249
1250=cut
1251*/
1252
1253PERL_STATIC_INLINE bool
c9182d9c 1254Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1255{
33756530 1256 const U8 * first_variant;
9f2abfde
KW
1257
1258 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1259
33756530
KW
1260 if (len == 0) {
1261 len = strlen((const char *) s);
1262 }
1263
1264 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1265 if (el)
1266 *el = len;
1267
1268 if (ep) {
1269 *ep = s + len;
1270 }
1271
1272 return TRUE;
1273 }
1274
1275 {
1276 const U8* const send = s + len;
1277 const U8* x = first_variant;
1278 STRLEN outlen = first_variant - s;
1279
a0d7f935
KW
1280 while (x < send) {
1281 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1282 if (UNLIKELY(! cur_len)) {
1283 break;
1284 }
1285 x += cur_len;
1286 outlen++;
9f2abfde 1287 }
9f2abfde 1288
a0d7f935
KW
1289 if (el)
1290 *el = outlen;
9f2abfde 1291
a0d7f935
KW
1292 if (ep) {
1293 *ep = x;
1294 }
9f2abfde 1295
a0d7f935 1296 return (x == send);
33756530 1297 }
9f2abfde
KW
1298}
1299
1300/*
1301
1302=for apidoc is_c9strict_utf8_string_loc
1303
1304Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1305the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1306"utf8ness success") in the C<ep> pointer.
1307
1308See also C<L</is_c9strict_utf8_string_loclen>>.
1309
1310=cut
1311*/
1312
1313#define is_c9strict_utf8_string_loc(s, len, ep) \
1314 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1315
1316/*
1317
1318=for apidoc is_c9strict_utf8_string_loclen
1319
1320Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1321the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1322"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1323characters in the C<el> pointer.
1324
1325See also C<L</is_c9strict_utf8_string_loc>>.
1326
1327=cut
1328*/
1329
1330PERL_STATIC_INLINE bool
c9182d9c 1331Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1332{
33756530 1333 const U8 * first_variant;
9f2abfde
KW
1334
1335 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1336
33756530
KW
1337 if (len == 0) {
1338 len = strlen((const char *) s);
1339 }
1340
1341 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1342 if (el)
1343 *el = len;
1344
1345 if (ep) {
1346 *ep = s + len;
1347 }
1348
1349 return TRUE;
1350 }
1351
1352 {
1353 const U8* const send = s + len;
1354 const U8* x = first_variant;
1355 STRLEN outlen = first_variant - s;
1356
a0d7f935
KW
1357 while (x < send) {
1358 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1359 if (UNLIKELY(! cur_len)) {
1360 break;
1361 }
1362 x += cur_len;
1363 outlen++;
9f2abfde 1364 }
9f2abfde 1365
a0d7f935
KW
1366 if (el)
1367 *el = outlen;
9f2abfde 1368
a0d7f935
KW
1369 if (ep) {
1370 *ep = x;
1371 }
9f2abfde 1372
a0d7f935 1373 return (x == send);
33756530 1374 }
9f2abfde
KW
1375}
1376
1377/*
1378
1379=for apidoc is_utf8_string_loc_flags
1380
1381Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1382case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1383"utf8ness success") in the C<ep> pointer.
1384
1385See also C<L</is_utf8_string_loclen_flags>>.
1386
1387=cut
1388*/
1389
1390#define is_utf8_string_loc_flags(s, len, ep, flags) \
1391 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1392
1393
1394/* The above 3 actual functions could have been moved into the more general one
1395 * just below, and made #defines that call it with the right 'flags'. They are
1396 * currently kept separate to increase their chances of getting inlined */
1397
1398/*
1399
1400=for apidoc is_utf8_string_loclen_flags
1401
1402Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1403case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1404"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1405encoded characters in the C<el> pointer.
1406
1407See also C<L</is_utf8_string_loc_flags>>.
1408
1409=cut
1410*/
1411
1412PERL_STATIC_INLINE bool
c9182d9c 1413Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1414{
33756530 1415 const U8 * first_variant;
9f2abfde
KW
1416
1417 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1418 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1419 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1420
f60f61fd 1421 if (len == 0) {
a0d7f935 1422 len = strlen((const char *) s);
f60f61fd
KW
1423 }
1424
9f2abfde
KW
1425 if (flags == 0) {
1426 return is_utf8_string_loclen(s, len, ep, el);
1427 }
1428
d044b7a7 1429 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1430 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1431 {
1432 return is_strict_utf8_string_loclen(s, len, ep, el);
1433 }
1434
d044b7a7 1435 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1436 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1437 {
1438 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1439 }
1440
33756530
KW
1441 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1442 if (el)
1443 *el = len;
1444
1445 if (ep) {
1446 *ep = s + len;
1447 }
1448
1449 return TRUE;
1450 }
1451
1452 {
1453 const U8* send = s + len;
1454 const U8* x = first_variant;
1455 STRLEN outlen = first_variant - s;
1456
a0d7f935
KW
1457 while (x < send) {
1458 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1459 if (UNLIKELY(! cur_len)) {
1460 break;
1461 }
1462 x += cur_len;
1463 outlen++;
9f2abfde 1464 }
9f2abfde 1465
a0d7f935
KW
1466 if (el)
1467 *el = outlen;
9f2abfde 1468
a0d7f935
KW
1469 if (ep) {
1470 *ep = x;
1471 }
9f2abfde 1472
a0d7f935 1473 return (x == send);
33756530 1474 }
9f2abfde
KW
1475}
1476
1477/*
7c93d8f0
KW
1478=for apidoc utf8_distance
1479
1480Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1481and C<b>.
1482
1483WARNING: use only if you *know* that the pointers point inside the
1484same UTF-8 buffer.
1485
1486=cut
1487*/
1488
1489PERL_STATIC_INLINE IV
1490Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1491{
1492 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1493
1494 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1495}
1496
1497/*
1498=for apidoc utf8_hop
1499
1500Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1501forward or backward.
1502
1503WARNING: do not use the following unless you *know* C<off> is within
1504the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1505on the first byte of character or just after the last byte of a character.
1506
1507=cut
1508*/
1509
1510PERL_STATIC_INLINE U8 *
1511Perl_utf8_hop(const U8 *s, SSize_t off)
1512{
1513 PERL_ARGS_ASSERT_UTF8_HOP;
1514
1515 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1516 * the bitops (especially ~) can create illegal UTF-8.
1517 * In other words: in Perl UTF-8 is not just for Unicode. */
1518
1519 if (off >= 0) {
1520 while (off--)
1521 s += UTF8SKIP(s);
1522 }
1523 else {
1524 while (off++) {
1525 s--;
1526 while (UTF8_IS_CONTINUATION(*s))
1527 s--;
1528 }
1529 }
e099ea69 1530 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1531 return (U8 *)s;
e099ea69 1532 GCC_DIAG_RESTORE
7c93d8f0
KW
1533}
1534
4dab108f 1535/*
65df57a8
TC
1536=for apidoc utf8_hop_forward
1537
1538Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1539forward.
1540
1541C<off> must be non-negative.
1542
1543C<s> must be before or equal to C<end>.
1544
1545When moving forward it will not move beyond C<end>.
1546
1547Will not exceed this limit even if the string is not valid "UTF-8".
1548
1549=cut
1550*/
1551
1552PERL_STATIC_INLINE U8 *
1553Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1554{
1555 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1556
1557 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1558 * the bitops (especially ~) can create illegal UTF-8.
1559 * In other words: in Perl UTF-8 is not just for Unicode. */
1560
1561 assert(s <= end);
1562 assert(off >= 0);
1563
1564 while (off--) {
1565 STRLEN skip = UTF8SKIP(s);
de979548 1566 if ((STRLEN)(end - s) <= skip) {
e099ea69 1567 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1568 return (U8 *)end;
e099ea69 1569 GCC_DIAG_RESTORE
de979548 1570 }
65df57a8
TC
1571 s += skip;
1572 }
1573
e099ea69 1574 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1575 return (U8 *)s;
e099ea69 1576 GCC_DIAG_RESTORE
65df57a8
TC
1577}
1578
1579/*
1580=for apidoc utf8_hop_back
1581
1582Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1583backward.
1584
1585C<off> must be non-positive.
1586
1587C<s> must be after or equal to C<start>.
1588
1589When moving backward it will not move before C<start>.
1590
1591Will not exceed this limit even if the string is not valid "UTF-8".
1592
1593=cut
1594*/
1595
1596PERL_STATIC_INLINE U8 *
1597Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1598{
1599 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1600
1601 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1602 * the bitops (especially ~) can create illegal UTF-8.
1603 * In other words: in Perl UTF-8 is not just for Unicode. */
1604
1605 assert(start <= s);
1606 assert(off <= 0);
1607
1608 while (off++ && s > start) {
e7185695 1609 do {
65df57a8 1610 s--;
e7185695 1611 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 1612 }
f6521f7c 1613
e099ea69 1614 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1615 return (U8 *)s;
e099ea69 1616 GCC_DIAG_RESTORE
65df57a8
TC
1617}
1618
1619/*
1620=for apidoc utf8_hop_safe
1621
1622Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1623either forward or backward.
1624
1625When moving backward it will not move before C<start>.
1626
1627When moving forward it will not move beyond C<end>.
1628
1629Will not exceed those limits even if the string is not valid "UTF-8".
1630
1631=cut
1632*/
1633
1634PERL_STATIC_INLINE U8 *
1635Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1636{
1637 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1638
1639 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1640 * the bitops (especially ~) can create illegal UTF-8.
1641 * In other words: in Perl UTF-8 is not just for Unicode. */
1642
1643 assert(start <= s && s <= end);
1644
1645 if (off >= 0) {
1646 return utf8_hop_forward(s, off, end);
1647 }
1648 else {
1649 return utf8_hop_back(s, off, start);
1650 }
1651}
1652
1653/*
4dab108f
KW
1654
1655=for apidoc is_utf8_valid_partial_char
1656
6cbb9248
KW
1657Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1658S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1659points. Otherwise, it returns 1 if there exists at least one non-empty
1660sequence of bytes that when appended to sequence C<s>, starting at position
1661C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1662otherwise returns 0.
1663
1664In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1665point.
1666
1667This is useful when a fixed-length buffer is being tested for being well-formed
1668UTF-8, but the final few bytes in it don't comprise a full character; that is,
1669it is split somewhere in the middle of the final code point's UTF-8
1670representation. (Presumably when the buffer is refreshed with the next chunk
1671of data, the new first bytes will complete the partial code point.) This
1672function is used to verify that the final bytes in the current buffer are in
1673fact the legal beginning of some code point, so that if they aren't, the
1674failure can be signalled without having to wait for the next read.
4dab108f
KW
1675
1676=cut
1677*/
2717076a
KW
1678#define is_utf8_valid_partial_char(s, e) \
1679 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1680
1681/*
1682
1683=for apidoc is_utf8_valid_partial_char_flags
1684
1685Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1686or not the input is a valid UTF-8 encoded partial character, but it takes an
1687extra parameter, C<flags>, which can further restrict which code points are
1688considered valid.
1689
1690If C<flags> is 0, this behaves identically to
1691C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1692of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1693there is any sequence of bytes that can complete the input partial character in
1694such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1695TRUE; otherwise FALSE. Non character code points cannot be determined based on
1696partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1697determined from just the first one or two bytes.
1698
1699=cut
1700 */
1701
56e4cf64 1702PERL_STATIC_INLINE bool
c9182d9c 1703Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1704{
f1c999a7 1705 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1706
f1c999a7 1707 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1708 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1709
8875bd48 1710 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1711 return FALSE;
1712 }
1713
1376b35c 1714 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1715}
1716
8bc127bf
KW
1717/*
1718
1719=for apidoc is_utf8_fixed_width_buf_flags
1720
1721Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1722is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1723otherwise it returns FALSE.
1724
1725If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1726without restriction. If the final few bytes of the buffer do not form a
1727complete code point, this will return TRUE anyway, provided that
1728C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1729
1730If C<flags> in non-zero, it can be any combination of the
1731C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1732same meanings.
1733
1734This function differs from C<L</is_utf8_string_flags>> only in that the latter
1735returns FALSE if the final few bytes of the string don't form a complete code
1736point.
1737
1738=cut
1739 */
1740#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1741 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1742
1743/*
1744
1745=for apidoc is_utf8_fixed_width_buf_loc_flags
1746
1747Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1748failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1749to the beginning of any partial character at the end of the buffer; if there is
1750no partial character C<*ep> will contain C<s>+C<len>.
1751
1752See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1753
1754=cut
1755*/
1756
1757#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1758 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1759
1760/*
1761
1762=for apidoc is_utf8_fixed_width_buf_loclen_flags
1763
1764Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1765complete, valid characters found in the C<el> pointer.
1766
1767=cut
1768*/
1769
1770PERL_STATIC_INLINE bool
c9182d9c 1771Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1772 STRLEN len,
8bc127bf
KW
1773 const U8 **ep,
1774 STRLEN *el,
1775 const U32 flags)
1776{
1777 const U8 * maybe_partial;
1778
1779 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1780
1781 if (! ep) {
1782 ep = &maybe_partial;
1783 }
1784
1785 /* If it's entirely valid, return that; otherwise see if the only error is
1786 * that the final few bytes are for a partial character */
1787 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1788 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1789}
1790
e6a4ffc3 1791PERL_STATIC_INLINE UV
c9182d9c 1792Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1793 STRLEN curlen,
1794 STRLEN *retlen,
1795 const U32 flags,
1796 U32 * errors,
1797 AV ** msgs)
1798{
1799 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1800 * simple cases, and, if necessary calls a helper function to deal with the
1801 * more complex ones. Almost all well-formed non-problematic code points
1802 * are considered simple, so that it's unlikely that the helper function
1803 * will need to be called.
1804 *
1805 * This is an adaptation of the tables and algorithm given in
f6521f7c 1806 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
1807 * comprehensive documentation of the original version. A copyright notice
1808 * for the original version is given at the beginning of this file. The
71525f77 1809 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1810 */
1811
1812 const U8 * const s0 = s;
1813 const U8 * send = s0 + curlen;
f73cfe7d 1814 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1815 UV state = 0;
1816
1817 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1818
1819 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1820 * non-problematic code point, which can be returned immediately.
1821 * Otherwise we call a helper function to figure out the more complicated
1822 * cases. */
1823
1824 while (s < send && LIKELY(state != 1)) {
71525f77 1825 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1826
1827 uv = (state == 0)
1828 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1829 : UTF8_ACCUMULATE(uv, *s);
71525f77 1830 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1831
1832 if (state != 0) {
1833 s++;
1834 continue;
1835 }
1836
1837 if (retlen) {
1838 *retlen = s - s0 + 1;
1839 }
1840 if (errors) {
1841 *errors = 0;
1842 }
1843 if (msgs) {
1844 *msgs = NULL;
1845 }
1846
cfebc7aa 1847 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1848 }
1849
1850 /* Here is potentially problematic. Use the full mechanism */
1851 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1852}
1853
82651abe 1854PERL_STATIC_INLINE UV
9a9a6c98 1855Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1856{
9a9a6c98 1857 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1858
1859 assert(s < send);
1860
1861 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1862
1863 /* EMPTY is not really allowed, and asserts on debugging builds. But
1864 * on non-debugging we have to deal with it, and this causes it to
1865 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1866 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1867 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1868 }
1869 else {
1870 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1871 if (retlen && ret == 0 && *s != '\0') {
1872 *retlen = (STRLEN) -1;
1873 }
1874
1875 return ret;
1876 }
1877}
1878
c8028aa6
TC
1879/* ------------------------------- perl.h ----------------------------- */
1880
1881/*
dcccc8ff
KW
1882=head1 Miscellaneous Functions
1883
44170c9a 1884=for apidoc is_safe_syscall
c8028aa6 1885
1a0efc9a
KW
1886Test that the given C<pv> (with length C<len>) doesn't contain any internal
1887C<NUL> characters.
1888If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1889category, and return FALSE.
c8028aa6
TC
1890
1891Return TRUE if the name is safe.
1892
1a0efc9a
KW
1893C<what> and C<op_name> are used in any warning.
1894
796b6530 1895Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1896
1897=cut
1898*/
1899
1900PERL_STATIC_INLINE bool
ffd62fc2
KW
1901Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1902{
c8028aa6
TC
1903 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1904 * perl itself uses xce*() functions which accept 8-bit strings.
1905 */
1906
1907 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1908
6c4650b3 1909 if (len > 1) {
c8028aa6 1910 char *null_at;
41188aa0 1911 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1912 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1913 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1914 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1915 what, op_name, pv, null_at+1);
c8028aa6
TC
1916 return FALSE;
1917 }
1918 }
1919
1920 return TRUE;
1921}
1922
1923/*
7cb3f959
TC
1924
1925Return true if the supplied filename has a newline character
fa6c7d00 1926immediately before the first (hopefully only) NUL.
7cb3f959
TC
1927
1928My original look at this incorrectly used the len from SvPV(), but
1929that's incorrect, since we allow for a NUL in pv[len-1].
1930
1931So instead, strlen() and work from there.
1932
1933This allow for the user reading a filename, forgetting to chomp it,
1934then calling:
1935
1936 open my $foo, "$file\0";
1937
1938*/
1939
1940#ifdef PERL_CORE
1941
1942PERL_STATIC_INLINE bool
ffd62fc2
KW
1943S_should_warn_nl(const char *pv)
1944{
7cb3f959
TC
1945 STRLEN len;
1946
1947 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1948
1949 len = strlen(pv);
1950
1951 return len > 0 && pv[len-1] == '\n';
1952}
1953
1954#endif
1955
3a019afd
KW
1956#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1957
1958PERL_STATIC_INLINE bool
1959S_lossless_NV_to_IV(const NV nv, IV *ivp)
1960{
1961 /* This function determines if the input NV 'nv' may be converted without
1962 * loss of data to an IV. If not, it returns FALSE taking no other action.
1963 * But if it is possible, it does the conversion, returning TRUE, and
1964 * storing the converted result in '*ivp' */
1965
1966 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1967
1968# if defined(Perl_isnan)
1969
1970 if (UNLIKELY(Perl_isnan(nv))) {
1971 return FALSE;
1972 }
1973
1974# endif
1975
1976 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1977 return FALSE;
1978 }
1979
1980 if ((IV) nv != nv) {
1981 return FALSE;
1982 }
1983
1984 *ivp = (IV) nv;
1985 return TRUE;
1986}
1987
1988#endif
1989
14fe5f8a
KW
1990/* ------------------ regcomp.c, toke.c ------------ */
1991
1992#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
1993
1994/*
1995 - regcurly - a little FSA that accepts {\d+,?\d*}
1996 Pulled from reg.c.
1997 */
6bbb95d4 1998PERL_STATIC_INLINE bool
14fe5f8a
KW
1999S_regcurly(const char *s)
2000{
2001 PERL_ARGS_ASSERT_REGCURLY;
2002
2003 if (*s++ != '{')
2004 return FALSE;
2005 if (!isDIGIT(*s))
2006 return FALSE;
2007 while (isDIGIT(*s))
2008 s++;
2009 if (*s == ',') {
2010 s++;
2011 while (isDIGIT(*s))
2012 s++;
2013 }
2014
2015 return *s == '}';
2016}
2017
2018#endif
2019
81d52ecd
JH
2020/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2021
94b0cb42
KW
2022#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2023
81d52ecd
JH
2024#define MAX_CHARSET_NAME_LENGTH 2
2025
2026PERL_STATIC_INLINE const char *
94b0cb42 2027S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2028{
94b0cb42
KW
2029 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2030
81d52ecd
JH
2031 /* Returns a string that corresponds to the name of the regex character set
2032 * given by 'flags', and *lenp is set the length of that string, which
2033 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2034
2035 *lenp = 1;
2036 switch (get_regex_charset(flags)) {
2037 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2038 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2039 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2040 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2041 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2042 *lenp = 2;
2043 return ASCII_MORE_RESTRICT_PAT_MODS;
2044 }
2045 /* The NOT_REACHED; hides an assert() which has a rather complex
2046 * definition in perl.h. */
2047 NOT_REACHED; /* NOTREACHED */
2048 return "?"; /* Unknown */
2049}
2050
94b0cb42
KW
2051#endif
2052
7cb3f959 2053/*
ed382232
TC
2054
2055Return false if any get magic is on the SV other than taint magic.
2056
2057*/
2058
2059PERL_STATIC_INLINE bool
ffd62fc2
KW
2060Perl_sv_only_taint_gmagic(SV *sv)
2061{
ed382232
TC
2062 MAGIC *mg = SvMAGIC(sv);
2063
2064 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2065
2066 while (mg) {
2067 if (mg->mg_type != PERL_MAGIC_taint
2068 && !(mg->mg_flags & MGf_GSKIP)
2069 && mg->mg_virtual->svt_get) {
2070 return FALSE;
2071 }
2072 mg = mg->mg_moremagic;
2073 }
2074
2075 return TRUE;
2076}
2077
ed8ff0f3
DM
2078/* ------------------ cop.h ------------------------------------------- */
2079
5b6f7443
DM
2080/* implement GIMME_V() macro */
2081
2082PERL_STATIC_INLINE U8
2083Perl_gimme_V(pTHX)
2084{
2085 I32 cxix;
2086 U8 gimme = (PL_op->op_flags & OPf_WANT);
2087
2088 if (gimme)
2089 return gimme;
2090 cxix = PL_curstackinfo->si_cxsubix;
2091 if (cxix < 0)
2092 return G_VOID;
2093 assert(cxstack[cxix].blk_gimme & G_WANT);
2094 return (cxstack[cxix].blk_gimme & G_WANT);
2095}
2096
ed8ff0f3
DM
2097
2098/* Enter a block. Push a new base context and return its address. */
2099
2100PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2101Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2102{
2103 PERL_CONTEXT * cx;
2104
2105 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2106
2107 CXINC;
2108 cx = CX_CUR();
2109 cx->cx_type = type;
2110 cx->blk_gimme = gimme;
2111 cx->blk_oldsaveix = saveix;
4caf7d8c 2112 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2113 cx->blk_oldcop = PL_curcop;
4caf7d8c 2114 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2115 cx->blk_oldscopesp = PL_scopestack_ix;
2116 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2117 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2118
2119 PL_tmps_floor = PL_tmps_ix;
2120 CX_DEBUG(cx, "PUSH");
2121 return cx;
2122}
2123
2124
2125/* Exit a block (RETURN and LAST). */
2126
2127PERL_STATIC_INLINE void
c9182d9c 2128Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2129{
2130 PERL_ARGS_ASSERT_CX_POPBLOCK;
2131
2132 CX_DEBUG(cx, "POP");
2133 /* these 3 are common to cx_popblock and cx_topblock */
2134 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2135 PL_scopestack_ix = cx->blk_oldscopesp;
2136 PL_curpm = cx->blk_oldpm;
2137
2138 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2139 * and leaves a CX entry lying around for repeated use, so
2140 * skip for multicall */ \
2141 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2142 || PL_savestack_ix == cx->blk_oldsaveix);
2143 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2144 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2145}
2146
2147/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2148 * Whereas cx_popblock() restores the state to the point just before
2149 * cx_pushblock() was called, cx_topblock() restores it to the point just
2150 * *after* cx_pushblock() was called. */
2151
2152PERL_STATIC_INLINE void
c9182d9c 2153Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2154{
2155 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2156
2157 CX_DEBUG(cx, "TOP");
2158 /* these 3 are common to cx_popblock and cx_topblock */
2159 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2160 PL_scopestack_ix = cx->blk_oldscopesp;
2161 PL_curpm = cx->blk_oldpm;
2162
2163 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2164}
2165
2166
a73d8813 2167PERL_STATIC_INLINE void
c9182d9c 2168Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2169{
2170 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2171
2172 PERL_ARGS_ASSERT_CX_PUSHSUB;
2173
3f6bd23a 2174 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2175 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2176 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2177 cx->blk_sub.cv = cv;
2178 cx->blk_sub.olddepth = CvDEPTH(cv);
2179 cx->blk_sub.prevcomppad = PL_comppad;
2180 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2181 cx->blk_sub.retop = retop;
2182 SvREFCNT_inc_simple_void_NN(cv);
2183 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2184}
2185
2186
2187/* subsets of cx_popsub() */
2188
2189PERL_STATIC_INLINE void
c9182d9c 2190Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2191{
2192 CV *cv;
2193
2194 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2195 assert(CxTYPE(cx) == CXt_SUB);
2196
2197 PL_comppad = cx->blk_sub.prevcomppad;
2198 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2199 cv = cx->blk_sub.cv;
2200 CvDEPTH(cv) = cx->blk_sub.olddepth;
2201 cx->blk_sub.cv = NULL;
2202 SvREFCNT_dec(cv);
5b6f7443 2203 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2204}
2205
2206
2207/* handle the @_ part of leaving a sub */
2208
2209PERL_STATIC_INLINE void
c9182d9c 2210Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2211{
2212 AV *av;
2213
2214 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2215 assert(CxTYPE(cx) == CXt_SUB);
2216 assert(AvARRAY(MUTABLE_AV(
2217 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2218 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2219
2220 CX_POP_SAVEARRAY(cx);
2221 av = MUTABLE_AV(PAD_SVl(0));
2222 if (UNLIKELY(AvREAL(av)))
2223 /* abandon @_ if it got reified */
2224 clear_defarray(av, 0);
2225 else {
2226 CLEAR_ARGARRAY(av);
2227 }
2228}
2229
2230
2231PERL_STATIC_INLINE void
c9182d9c 2232Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2233{
2234 PERL_ARGS_ASSERT_CX_POPSUB;
2235 assert(CxTYPE(cx) == CXt_SUB);
2236
3f6bd23a 2237 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2238
2239 if (CxHASARGS(cx))
2240 cx_popsub_args(cx);
2241 cx_popsub_common(cx);
2242}
2243
2244
6a7d52cc 2245PERL_STATIC_INLINE void
c9182d9c 2246Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2247{
2248 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2249
5b6f7443
DM
2250 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2251 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2252 cx->blk_format.cv = cv;
2253 cx->blk_format.retop = retop;
2254 cx->blk_format.gv = gv;
2255 cx->blk_format.dfoutgv = PL_defoutgv;
2256 cx->blk_format.prevcomppad = PL_comppad;
2257 cx->blk_u16 = 0;
2258
2259 SvREFCNT_inc_simple_void_NN(cv);
2260 CvDEPTH(cv)++;
2261 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2262}
2263
2264
2265PERL_STATIC_INLINE void
c9182d9c 2266Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2267{
2268 CV *cv;
2269 GV *dfout;
2270
2271 PERL_ARGS_ASSERT_CX_POPFORMAT;
2272 assert(CxTYPE(cx) == CXt_FORMAT);
2273
2274 dfout = cx->blk_format.dfoutgv;
2275 setdefout(dfout);
2276 cx->blk_format.dfoutgv = NULL;
2277 SvREFCNT_dec_NN(dfout);
2278
2279 PL_comppad = cx->blk_format.prevcomppad;
2280 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2281 cv = cx->blk_format.cv;
2282 cx->blk_format.cv = NULL;
2283 --CvDEPTH(cv);
2284 SvREFCNT_dec_NN(cv);
5b6f7443 2285 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2286}
2287
2288
13febba5 2289PERL_STATIC_INLINE void
c9182d9c 2290Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2291{
2292 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2293
5b6f7443
DM
2294 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2295 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
13febba5
DM
2296 cx->blk_eval.retop = retop;
2297 cx->blk_eval.old_namesv = namesv;
2298 cx->blk_eval.old_eval_root = PL_eval_root;
2299 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2300 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2301 cx->blk_eval.cur_top_env = PL_top_env;
2302
4c57ced5 2303 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2304 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2305 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2306}
2307
2308
2309PERL_STATIC_INLINE void
c9182d9c 2310Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2311{
2312 SV *sv;
2313
2314 PERL_ARGS_ASSERT_CX_POPEVAL;
2315 assert(CxTYPE(cx) == CXt_EVAL);
2316
2317 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2318 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2319 PL_eval_root = cx->blk_eval.old_eval_root;
2320 sv = cx->blk_eval.cur_text;
4c57ced5 2321 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2322 cx->blk_eval.cur_text = NULL;
2323 SvREFCNT_dec_NN(sv);
2324 }
2325
2326 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2327 if (sv) {
2328 cx->blk_eval.old_namesv = NULL;
2329 SvREFCNT_dec_NN(sv);
2330 }
5b6f7443 2331 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2332}
6a7d52cc 2333
a73d8813 2334
d1b6bf72
DM
2335/* push a plain loop, i.e.
2336 * { block }
2337 * while (cond) { block }
2338 * for (init;cond;continue) { block }
2339 * This loop can be last/redo'ed etc.
2340 */
2341
2342PERL_STATIC_INLINE void
c9182d9c 2343Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2344{
2345 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2346 cx->blk_loop.my_op = cLOOP;
2347}
2348
2349
2350/* push a true for loop, i.e.
2351 * for var (list) { block }
2352 */
2353
2354PERL_STATIC_INLINE void
c9182d9c 2355Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2356{
2357 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2358
2359 /* this one line is common with cx_pushloop_plain */
2360 cx->blk_loop.my_op = cLOOP;
2361
2362 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2363 cx->blk_loop.itersave = itersave;
2364#ifdef USE_ITHREADS
2365 cx->blk_loop.oldcomppad = PL_comppad;
2366#endif
2367}
2368
2369
2370/* pop all loop types, including plain */
2371
2372PERL_STATIC_INLINE void
c9182d9c 2373Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2374{
2375 PERL_ARGS_ASSERT_CX_POPLOOP;
2376
2377 assert(CxTYPE_is_LOOP(cx));
2378 if ( CxTYPE(cx) == CXt_LOOP_ARY
2379 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2380 {
2381 /* Free ary or cur. This assumes that state_u.ary.ary
2382 * aligns with state_u.lazysv.cur. See cx_dup() */
2383 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2384 cx->blk_loop.state_u.lazysv.cur = NULL;
2385 SvREFCNT_dec_NN(sv);
2386 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2387 sv = cx->blk_loop.state_u.lazysv.end;
2388 cx->blk_loop.state_u.lazysv.end = NULL;
2389 SvREFCNT_dec_NN(sv);
2390 }
2391 }
2392 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2393 SV *cursv;
2394 SV **svp = (cx)->blk_loop.itervar_u.svp;
2395 if ((cx->cx_type & CXp_FOR_GV))
2396 svp = &GvSV((GV*)svp);
2397 cursv = *svp;
2398 *svp = cx->blk_loop.itersave;
2399 cx->blk_loop.itersave = NULL;
2400 SvREFCNT_dec(cursv);
2401 }
2402}
2403
2a7b7c61
DM
2404
2405PERL_STATIC_INLINE void
c9182d9c 2406Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2407{
7896dde7 2408 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2409
7896dde7 2410 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2411}
2412
2413
2414PERL_STATIC_INLINE void
c9182d9c 2415Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2416{
7896dde7
Z
2417 PERL_ARGS_ASSERT_CX_POPWHEN;
2418 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2419
2420 PERL_UNUSED_ARG(cx);
59a14f30 2421 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2422 /* currently NOOP */
2423}
2424
2425
7896dde7 2426PERL_STATIC_INLINE void
c9182d9c 2427Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2428{
2429 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2430
2431 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2432 cx->blk_givwhen.defsv_save = orig_defsv;
2433}
2434
2435
2436PERL_STATIC_INLINE void
c9182d9c 2437Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2438{
2439 SV *sv;
2440
2441 PERL_ARGS_ASSERT_CX_POPGIVEN;
2442 assert(CxTYPE(cx) == CXt_GIVEN);
2443
2444 sv = GvSV(PL_defgv);
2445 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2446 cx->blk_givwhen.defsv_save = NULL;
2447 SvREFCNT_dec(sv);
2448}
2449
ec2c235b
KW
2450/* ------------------ util.h ------------------------------------------- */
2451
2452/*
2453=head1 Miscellaneous Functions
2454
2455=for apidoc foldEQ
2456
2457Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2458same
2459case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2460match themselves and their opposite case counterparts. Non-cased and non-ASCII
2461range bytes match only themselves.
2462
2463=cut
2464*/
2465
2466PERL_STATIC_INLINE I32
2467Perl_foldEQ(const char *s1, const char *s2, I32 len)
2468{
2469 const U8 *a = (const U8 *)s1;
2470 const U8 *b = (const U8 *)s2;
2471
2472 PERL_ARGS_ASSERT_FOLDEQ;
2473
2474 assert(len >= 0);
2475
2476 while (len--) {
2477 if (*a != *b && *a != PL_fold[*b])
2478 return 0;
2479 a++,b++;
2480 }
2481 return 1;
2482}
2483
0f9cb40c 2484PERL_STATIC_INLINE I32
ec2c235b
KW
2485Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2486{
79a1fabd
KW
2487 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2488 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2489 * does not check for this. Nor does it check that the strings each have
2490 * at least 'len' characters. */
ec2c235b
KW
2491
2492 const U8 *a = (const U8 *)s1;
2493 const U8 *b = (const U8 *)s2;
2494
2495 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2496
2497 assert(len >= 0);
2498
2499 while (len--) {
2500 if (*a != *b && *a != PL_fold_latin1[*b]) {
2501 return 0;
2502 }
2503 a++, b++;
2504 }
2505 return 1;
2506}
2507
2508/*
2509=for apidoc foldEQ_locale
2510
2511Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2512same case-insensitively in the current locale; false otherwise.
2513
2514=cut
2515*/
2516
0f9cb40c 2517PERL_STATIC_INLINE I32
ec2c235b
KW
2518Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2519{
2520 dVAR;
2521 const U8 *a = (const U8 *)s1;
2522 const U8 *b = (const U8 *)s2;
2523
2524 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2525
2526 assert(len >= 0);
2527
2528 while (len--) {
2529 if (*a != *b && *a != PL_fold_locale[*b])
2530 return 0;
2531 a++,b++;
2532 }
2533 return 1;
2534}
2535
1ab100a8
KW
2536/*
2537=for apidoc my_strnlen
2538
2539The C library C<strnlen> if available, or a Perl implementation of it.
2540
2541C<my_strnlen()> computes the length of the string, up to C<maxlen>
2542characters. It will will never attempt to address more than C<maxlen>
2543characters, making it suitable for use with strings that are not
2544guaranteed to be NUL-terminated.
2545
2546=cut
2547
2548Description stolen from http://man.openbsd.org/strnlen.3,
2549implementation stolen from PostgreSQL.
2550*/
2551#ifndef HAS_STRNLEN
2552
2553PERL_STATIC_INLINE Size_t
2554Perl_my_strnlen(const char *str, Size_t maxlen)
2555{
2556 const char *end = (char *) memchr(str, '\0', maxlen);
2557
2558 PERL_ARGS_ASSERT_MY_STRNLEN;
2559
2560 if (end == NULL) return maxlen;
2561 return end - str;
2562}
2563
2564#endif
2565
6dba01e2
KW
2566#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2567
2568PERL_STATIC_INLINE void *
2569S_my_memrchr(const char * s, const char c, const STRLEN len)
2570{
2571 /* memrchr(), since many platforms lack it */
2572
2573 const char * t = s + len - 1;
2574
2575 PERL_ARGS_ASSERT_MY_MEMRCHR;
2576
2577 while (t >= s) {
2578 if (*t == c) {
2579 return (void *) t;
2580 }
2581 t--;
2582 }
2583
2584 return NULL;
2585}
2586
2587#endif
2588
24f3e849
KW
2589PERL_STATIC_INLINE char *
2590Perl_mortal_getenv(const char * str)
2591{
2592 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
2593 *
2594 * It's (mostly) thread-safe because it uses a mutex to prevent
2595 * simultaneous access from other threads that use the same mutex, and
2596 * makes a copy of the result before releasing that mutex. All of the Perl
2597 * core uses that mutex, but, like all mutexes, everything has to cooperate
2598 * for it to completely work. It is possible for code from, say XS, to not
2599 * use this mutex, defeating the safety.
2600 *
2601 * On some platforms, getenv() is not sequential-call-safe, because
2602 * subsequent calls destroy the static storage inside the C library
2603 * returned by an earlier call. The result must be copied or completely
2604 * acted upon before a subsequent getenv call. Those calls could come from
2605 * another thread. Again, making a copy while controlling the mutex
2606 * prevents these problems..
2607 *
2608 * To prevent leaks, the copy is made by creating a new SV containing it,
2609 * mortalizing the SV, and returning the SV's string (the copy). Thus this
2610 * is a drop-in replacement for getenv().
2611 *
2612 * A complication is that this can be called during phases where the
2613 * mortalization process isn't available. These are in interpreter
2614 * destruction or early in construction. khw believes that at these times
2615 * there shouldn't be anything else going on, so plain getenv is safe AS
2616 * LONG AS the caller acts on the return before calling it again. */
2617
2618 char * ret;
2619 dTHX;
2620
2621 PERL_ARGS_ASSERT_MORTAL_GETENV;
2622
2623 /* Can't mortalize without stacks. khw believes that no other threads
2624 * should be running, so no need to lock things, and this may be during a
2625 * phase when locking isn't even available */
2626 if (UNLIKELY(PL_scopestack_ix == 0)) {
2627 return getenv(str);
2628 }
2629
2630 ENV_LOCK;
2631
2632 ret = getenv(str);
2633
2634 if (ret != NULL) {
2635 ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
2636 }
2637
2638 ENV_UNLOCK;
2639 return ret;
2640}
2641
ed382232 2642/*
c8028aa6
TC
2643 * ex: set ts=8 sts=4 sw=4 et:
2644 */