This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Revert "op.h: Restrict to core certain internal symbols"
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
87306e06 42/*
3f620621 43=for apidoc_section $AV
87306e06
KW
44=for apidoc av_count
45Returns the number of elements in the array C<av>. This is the true length of
46the array, including any undefined elements. It is always the same as
47S<C<av_top_index(av) + 1>>.
48
49=cut
50*/
51PERL_STATIC_INLINE Size_t
52Perl_av_count(pTHX_ AV *av)
be3a7a5d 53{
87306e06 54 PERL_ARGS_ASSERT_AV_COUNT;
be3a7a5d
KW
55 assert(SvTYPE(av) == SVt_PVAV);
56
87306e06 57 return AvFILL(av) + 1;
be3a7a5d
KW
58}
59
1afe1db1
FC
60/* ------------------------------- cv.h ------------------------------- */
61
fa3e44c0 62/*
3f620621 63=for apidoc_section $CV
fa3e44c0
KW
64=for apidoc CvGV
65Returns the GV associated with the CV C<sv>, reifying it if necessary.
66
67=cut
68*/
ae77754a 69PERL_STATIC_INLINE GV *
c9182d9c 70Perl_CvGV(pTHX_ CV *sv)
ae77754a 71{
74804ad1
KW
72 PERL_ARGS_ASSERT_CVGV;
73
ae77754a
FC
74 return CvNAMED(sv)
75 ? Perl_cvgv_from_hek(aTHX_ sv)
76 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
77}
78
1afe1db1 79PERL_STATIC_INLINE I32 *
74804ad1 80Perl_CvDEPTH(const CV * const sv)
1afe1db1 81{
74804ad1 82 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 83 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 84
8de47657 85 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
86}
87
d16269d8
PM
88/*
89 CvPROTO returns the prototype as stored, which is not necessarily what
90 the interpreter should be using. Specifically, the interpreter assumes
91 that spaces have been stripped, which has been the case if the prototype
92 was added by toke.c, but is generally not the case if it was added elsewhere.
93 Since we can't enforce the spacelessness at assignment time, this routine
94 provides a temporary copy at parse time with spaces removed.
95 I<orig> is the start of the original buffer, I<len> is the length of the
96 prototype and will be updated when this returns.
97 */
98
5b67adb8 99#ifdef PERL_CORE
d16269d8
PM
100PERL_STATIC_INLINE char *
101S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
102{
103 SV * tmpsv;
104 char * tmps;
105 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
106 tmps = SvPVX(tmpsv);
107 while ((*len)--) {
108 if (!isSPACE(*orig))
109 *tmps++ = *orig;
110 orig++;
111 }
112 *tmps = '\0';
113 *len = tmps - SvPVX(tmpsv);
114 return SvPVX(tmpsv);
115}
5b67adb8 116#endif
d16269d8 117
25fdce4a
FC
118/* ------------------------------- mg.h ------------------------------- */
119
120#if defined(PERL_CORE) || defined(PERL_EXT)
121/* assumes get-magic and stringification have already occurred */
122PERL_STATIC_INLINE STRLEN
123S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
124{
125 assert(mg->mg_type == PERL_MAGIC_regex_global);
126 assert(mg->mg_len != -1);
127 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
128 return (STRLEN)mg->mg_len;
129 else {
130 const STRLEN pos = (STRLEN)mg->mg_len;
131 /* Without this check, we may read past the end of the buffer: */
132 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
133 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
134 }
135}
136#endif
137
03414f05
FC
138/* ------------------------------- pad.h ------------------------------ */
139
140#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
141PERL_STATIC_INLINE bool
b9d5702c 142S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 143{
b9d5702c
KW
144 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
145
03414f05
FC
146 /* is seq within the range _LOW to _HIGH ?
147 * This is complicated by the fact that PL_cop_seqmax
148 * may have wrapped around at some point */
149 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
150 return FALSE; /* not yet introduced */
151
152 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
153 /* in compiling scope */
154 if (
155 (seq > COP_SEQ_RANGE_LOW(pn))
156 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
157 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
158 )
159 return TRUE;
160 }
161 else if (
162 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
163 ?
164 ( seq > COP_SEQ_RANGE_LOW(pn)
165 || seq <= COP_SEQ_RANGE_HIGH(pn))
166
167 : ( seq > COP_SEQ_RANGE_LOW(pn)
168 && seq <= COP_SEQ_RANGE_HIGH(pn))
169 )
170 return TRUE;
171 return FALSE;
172}
173#endif
174
33a4312b
FC
175/* ------------------------------- pp.h ------------------------------- */
176
177PERL_STATIC_INLINE I32
c9182d9c 178Perl_TOPMARK(pTHX)
33a4312b
FC
179{
180 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 181 "MARK top %p %" IVdf "\n",
33a4312b
FC
182 PL_markstack_ptr,
183 (IV)*PL_markstack_ptr)));
184 return *PL_markstack_ptr;
185}
186
187PERL_STATIC_INLINE I32
c9182d9c 188Perl_POPMARK(pTHX)
33a4312b
FC
189{
190 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 191 "MARK pop %p %" IVdf "\n",
33a4312b
FC
192 (PL_markstack_ptr-1),
193 (IV)*(PL_markstack_ptr-1))));
194 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
195 return *PL_markstack_ptr--;
196}
197
8d919b0a
FC
198/* ----------------------------- regexp.h ----------------------------- */
199
200PERL_STATIC_INLINE struct regexp *
c9182d9c 201Perl_ReANY(const REGEXP * const re)
8d919b0a 202{
df6b4bd5 203 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
204
205 PERL_ARGS_ASSERT_REANY;
8d919b0a 206 assert(isREGEXP(re));
bdef45de 207
df6b4bd5
DM
208 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
209 : (struct regexp *)p;
8d919b0a
FC
210}
211
27669aa4
FC
212/* ------------------------------- sv.h ------------------------------- */
213
a887b094 214PERL_STATIC_INLINE bool
9d0469db 215Perl_SvTRUE(pTHX_ SV *sv) {
6aa30f5e 216 if (UNLIKELY(sv == NULL))
9d0469db 217 return FALSE;
7b9b9e32
PE
218 SvGETMAGIC(sv);
219 return SvTRUE_nomg_NN(sv);
a887b094
PE
220}
221
27669aa4 222PERL_STATIC_INLINE SV *
c9182d9c 223Perl_SvREFCNT_inc(SV *sv)
27669aa4 224{
2439e033 225 if (LIKELY(sv != NULL))
27669aa4
FC
226 SvREFCNT(sv)++;
227 return sv;
228}
229PERL_STATIC_INLINE SV *
c9182d9c 230Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 231{
3f2f854a
KW
232 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
233
27669aa4
FC
234 SvREFCNT(sv)++;
235 return sv;
236}
237PERL_STATIC_INLINE void
c9182d9c 238Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 239{
2439e033 240 if (LIKELY(sv != NULL))
27669aa4
FC
241 SvREFCNT(sv)++;
242}
75e16a44 243PERL_STATIC_INLINE void
c9182d9c 244Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 245{
2439e033 246 if (LIKELY(sv != NULL)) {
75a9bf96 247 U32 rc = SvREFCNT(sv);
79e2a32a 248 if (LIKELY(rc > 1))
75a9bf96
DM
249 SvREFCNT(sv) = rc - 1;
250 else
251 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
252 }
253}
541377b1
FC
254
255PERL_STATIC_INLINE void
c9182d9c 256Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
257{
258 U32 rc = SvREFCNT(sv);
3f2f854a
KW
259
260 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
261
79e2a32a 262 if (LIKELY(rc > 1))
4a9a56a7
DM
263 SvREFCNT(sv) = rc - 1;
264 else
265 Perl_sv_free2(aTHX_ sv, rc);
266}
267
268PERL_STATIC_INLINE void
1bd041dc 269Perl_SvAMAGIC_on(SV *sv)
541377b1 270{
1bd041dc 271 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 272 assert(SvROK(sv));
1bd041dc 273
541377b1
FC
274 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
275}
276PERL_STATIC_INLINE void
1bd041dc 277Perl_SvAMAGIC_off(SV *sv)
541377b1 278{
1bd041dc
KW
279 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
280
541377b1
FC
281 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
282 HvAMAGIC_off(SvSTASH(SvRV(sv)));
283}
284
285PERL_STATIC_INLINE U32
c9182d9c 286Perl_SvPADSTALE_on(SV *sv)
541377b1 287{
c0683843 288 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
289 return SvFLAGS(sv) |= SVs_PADSTALE;
290}
291PERL_STATIC_INLINE U32
c9182d9c 292Perl_SvPADSTALE_off(SV *sv)
541377b1 293{
c0683843 294 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
295 return SvFLAGS(sv) &= ~SVs_PADSTALE;
296}
25fdce4a 297#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 298PERL_STATIC_INLINE STRLEN
6964422a 299S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 300{
25fdce4a 301 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
302 if (SvGAMAGIC(sv)) {
303 U8 *hopped = utf8_hop((U8 *)pv, pos);
304 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
305 return (STRLEN)(hopped - (U8 *)pv);
306 }
307 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
308}
309#endif
f019c49e 310
a8a2ceaa
KW
311/* ------------------------------- utf8.h ------------------------------- */
312
2fe720e2 313/*
3f620621 314=for apidoc_section $unicode
2fe720e2
KW
315*/
316
55d09dc8 317PERL_STATIC_INLINE void
c9182d9c 318Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
319{
320 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
321 * encoded string at '*dest', updating '*dest' to include it */
322
55d09dc8
KW
323 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
324
6f2d5cbc 325 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 326 *((*dest)++) = byte;
55d09dc8 327 else {
a09ec51a
KW
328 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
329 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
330 }
331}
332
e123187a 333/*
2fe720e2 334=for apidoc valid_utf8_to_uvchr
09232555
KW
335Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
336known that the next character in the input UTF-8 string C<s> is well-formed
337(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
338points, and non-Unicode code points are allowed.
2fe720e2
KW
339
340=cut
341
342 */
343
344PERL_STATIC_INLINE UV
345Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
346{
c41b2540 347 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
348 const U8* send = s + expectlen;
349 UV uv = *s;
350
351 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
352
353 if (retlen) {
354 *retlen = expectlen;
355 }
356
357 /* An invariant is trivially returned */
358 if (expectlen == 1) {
359 return uv;
360 }
361
362 /* Remove the leading bits that indicate the number of bytes, leaving just
363 * the bits that are part of the value */
364 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
365
366 /* Now, loop through the remaining bytes, accumulating each into the
367 * working total as we go. (I khw tried unrolling the loop for up to 4
368 * bytes, but there was no performance improvement) */
369 for (++s; s < send; s++) {
370 uv = UTF8_ACCUMULATE(uv, *s);
371 }
372
373 return UNI_TO_NATIVE(uv);
374
375}
376
1e599354
KW
377/*
378=for apidoc is_utf8_invariant_string
379
82c5d941 380Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 381regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
382EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
383are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
384the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
385characters are invariant, but so also are the C1 controls.
1e599354
KW
386
387If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
388use this option, that C<s> can't have embedded C<NUL> characters and has to
389have a terminating C<NUL> byte).
390
9f2abfde
KW
391See also
392C<L</is_utf8_string>>,
393C<L</is_utf8_string_flags>>,
394C<L</is_utf8_string_loc>>,
395C<L</is_utf8_string_loc_flags>>,
396C<L</is_utf8_string_loclen>>,
397C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
398C<L</is_utf8_fixed_width_buf_flags>>,
399C<L</is_utf8_fixed_width_buf_loc_flags>>,
400C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
401C<L</is_strict_utf8_string>>,
402C<L</is_strict_utf8_string_loc>>,
403C<L</is_strict_utf8_string_loclen>>,
404C<L</is_c9strict_utf8_string>>,
405C<L</is_c9strict_utf8_string_loc>>,
406and
407C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
408
409=cut
0cbf5865
KW
410
411*/
412
413#define is_utf8_invariant_string(s, len) \
414 is_utf8_invariant_string_loc(s, len, NULL)
415
416/*
417=for apidoc is_utf8_invariant_string_loc
418
419Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
420the first UTF-8 variant character in the C<ep> pointer; if all characters are
421UTF-8 invariant, this function does not change the contents of C<*ep>.
422
423=cut
424
1e599354
KW
425*/
426
427PERL_STATIC_INLINE bool
c9182d9c 428Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 429{
e17544a6 430 const U8* send;
1e599354
KW
431 const U8* x = s;
432
0cbf5865
KW
433 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
434
e17544a6
KW
435 if (len == 0) {
436 len = strlen((const char *)s);
437 }
438
439 send = s + len;
440
4ab2fd9b 441/* This looks like 0x010101... */
2c5c8af5 442# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
443
444/* This looks like 0x808080... */
2c5c8af5 445# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 446# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 447# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 448
099e59a4
KW
449/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
450 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
451 * optimized out completely on a 32-bit system, and its mask gets optimized out
452 * on a 64-bit system */
2c5c8af5 453# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
454 | ( PTR2nat(x) >> 1) \
455 | ( ( (PTR2nat(x) \
456 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 457
3f515a2e
KW
458#ifndef EBCDIC
459
099e59a4
KW
460 /* Do the word-at-a-time iff there is at least one usable full word. That
461 * means that after advancing to a word boundary, there still is at least a
462 * full word left. The number of bytes needed to advance is 'wordsize -
463 * offset' unless offset is 0. */
464 if ((STRLEN) (send - x) >= PERL_WORDSIZE
465
466 /* This term is wordsize if subword; 0 if not */
467 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
468
469 /* 'offset' */
470 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
471 {
b40579ff 472
46bb68f6
KW
473 /* Process per-byte until reach word boundary. XXX This loop could be
474 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 475 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
476 if (! UTF8_IS_INVARIANT(*x)) {
477 if (ep) {
478 *ep = x;
479 }
e17544a6 480
46bb68f6
KW
481 return FALSE;
482 }
483 x++;
e17544a6 484 }
e17544a6 485
099e59a4
KW
486 /* Here, we know we have at least one full word to process. Process
487 * per-word as long as we have at least a full word left */
488 do {
4ab2fd9b 489 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 490
46bb68f6
KW
491 /* Found a variant. Just return if caller doesn't want its
492 * exact position */
493 if (! ep) {
494 return FALSE;
495 }
e17544a6 496
2c5c8af5
KW
497# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
498 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 499
73f0a2eb 500 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
501 assert(*ep >= s && *ep < send);
502
503 return FALSE;
504
2c5c8af5 505# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
506 checks. */
507
46bb68f6 508 break;
2c5c8af5 509# endif
46bb68f6 510 }
1d2af574 511
46bb68f6 512 x += PERL_WORDSIZE;
1d2af574 513
099e59a4 514 } while (x + PERL_WORDSIZE <= send);
b40579ff 515 }
e17544a6 516
0b08cab0 517#endif /* End of ! EBCDIC */
e17544a6
KW
518
519 /* Process per-byte */
520 while (x < send) {
521 if (! UTF8_IS_INVARIANT(*x)) {
522 if (ep) {
523 *ep = x;
524 }
0cbf5865 525
e17544a6 526 return FALSE;
0cbf5865 527 }
1e599354 528
e17544a6 529 x++;
1e599354
KW
530 }
531
532 return TRUE;
533}
534
23a7ee81
KW
535#ifndef EBCDIC
536
1d2af574 537PERL_STATIC_INLINE unsigned int
73f0a2eb 538Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
539{
540
541 /* This returns the position in a word (0..7) of the first variant byte in
542 * it. This is a helper function. Note that there are no branches */
543
544 assert(word);
545
546 /* Get just the msb bits of each byte */
547 word &= PERL_VARIANTS_WORD_MASK;
548
7adf2470 549# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
550
551 /* Bytes are stored like
552 * Byte8 ... Byte2 Byte1
553 * 63..56...15...8 7...0
554 *
555 * Isolate the lsb;
556 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
557 *
a3815e44 558 * The word will look like this, with a rightmost set bit in position 's':
1d2af574
KW
559 * ('x's are don't cares)
560 * s
561 * x..x100..0
562 * x..xx10..0 Right shift (rightmost 0 is shifted off)
563 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
564 * the 1 just to their left into a 0; the remainder is
565 * untouched
4fa92663
KW
566 * 0..0011..1 The xor with the original, x..xx10..0, clears that
567 * remainder, sets the bottom to all 1
1d2af574
KW
568 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
569 *
570 * Another method is to do 'word &= -word'; but it generates a compiler
571 * message on some platforms about taking the negative of an unsigned */
572
573 word >>= 1;
574 word = 1 + (word ^ (word - 1));
575
576# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
577
578 /* Bytes are stored like
579 * Byte1 Byte2 ... Byte8
580 * 63..56 55..47 ... 7...0
581 *
582 * Isolate the msb; http://codeforces.com/blog/entry/10330
583 *
584 * Only the most significant set bit matters. Or'ing word with its right
585 * shift of 1 makes that bit and the next one to its right both 1. Then
586 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
587 * msb and all to the right being 1. */
588 word |= word >> 1;
589 word |= word >> 2;
590 word |= word >> 4;
591 word |= word >> 8;
592 word |= word >> 16;
593 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
594
595 /* Then subtracting the right shift by 1 clears all but the left-most of
596 * the 1 bits, which is our desired result */
597 word -= (word >> 1);
598
599# else
600# error Unexpected byte order
601# endif
602
7cf2d6c7
KW
603 /* Here 'word' has a single bit set: the msb of the first byte in which it
604 * is set. Calculate that position in the word. We can use this
1d2af574 605 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
606 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
607 * just get shifted off at compile time) */
608 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
609 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
610 | (39 << 24) | (47 << 16)
611 | (55 << 8) | (63 << 0));
1d2af574
KW
612 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
613
614 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
615 word = ((word + 1) >> 3) - 1;
616
617# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
618
619 /* And invert the result */
620 word = CHARBITS - word - 1;
621
622# endif
623
624 return (unsigned int) word;
625}
626
23a7ee81 627#endif
03c1e4ab
KW
628#if defined(PERL_CORE) || defined(PERL_EXT)
629
630/*
631=for apidoc variant_under_utf8_count
632
633This function looks at the sequence of bytes between C<s> and C<e>, which are
634assumed to be encoded in ASCII/Latin1, and returns how many of them would
635change should the string be translated into UTF-8. Due to the nature of UTF-8,
636each of these would occupy two bytes instead of the single one in the input
637string. Thus, this function returns the precise number of bytes the string
638would expand by when translated to UTF-8.
639
640Unlike most of the other functions that have C<utf8> in their name, the input
641to this function is NOT a UTF-8-encoded string. The function name is slightly
642I<odd> to emphasize this.
643
644This function is internal to Perl because khw thinks that any XS code that
645would want this is probably operating too close to the internals. Presenting a
646valid use case could change that.
647
648See also
649C<L<perlapi/is_utf8_invariant_string>>
650and
651C<L<perlapi/is_utf8_invariant_string_loc>>,
652
653=cut
654
655*/
656
657PERL_STATIC_INLINE Size_t
658S_variant_under_utf8_count(const U8* const s, const U8* const e)
659{
660 const U8* x = s;
661 Size_t count = 0;
662
663 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
664
665# ifndef EBCDIC
666
5d0379de
KW
667 /* Test if the string is long enough to use word-at-a-time. (Logic is the
668 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
669 if ((STRLEN) (e - x) >= PERL_WORDSIZE
670 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
671 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
672 {
673
674 /* Process per-byte until reach word boundary. XXX This loop could be
675 * eliminated if we knew that this platform had fast unaligned reads */
676 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
677 count += ! UTF8_IS_INVARIANT(*x++);
678 }
679
680 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
681 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
682 explanation of how this works */
e5863284
KW
683 PERL_UINTMAX_T increment
684 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
685 * PERL_COUNT_MULTIPLIER)
686 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 687 count += (Size_t) increment;
03c1e4ab
KW
688 x += PERL_WORDSIZE;
689 } while (x + PERL_WORDSIZE <= e);
690 }
691
692# endif
693
694 /* Process per-byte */
695 while (x < e) {
696 if (! UTF8_IS_INVARIANT(*x)) {
697 count++;
698 }
699
700 x++;
701 }
702
703 return count;
704}
705
706#endif
707
aff4cafe
KW
708#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
709# undef PERL_WORDSIZE
710# undef PERL_COUNT_MULTIPLIER
711# undef PERL_WORD_BOUNDARY_MASK
712# undef PERL_VARIANTS_WORD_MASK
713#endif
03c1e4ab 714
7c93d8f0 715/*
5ff889fb
KW
716=for apidoc is_utf8_string
717
82c5d941
KW
718Returns TRUE if the first C<len> bytes of string C<s> form a valid
719Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
720be calculated using C<strlen(s)> (which means if you use this option, that C<s>
721can't have embedded C<NUL> characters and has to have a terminating C<NUL>
722byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
723
2717076a
KW
724This function considers Perl's extended UTF-8 to be valid. That means that
725code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
726considered valid by this function. Use C<L</is_strict_utf8_string>>,
727C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
728code points are considered valid.
5ff889fb 729
9f2abfde
KW
730See also
731C<L</is_utf8_invariant_string>>,
0cbf5865 732C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
733C<L</is_utf8_string_loc>>,
734C<L</is_utf8_string_loclen>>,
8bc127bf
KW
735C<L</is_utf8_fixed_width_buf_flags>>,
736C<L</is_utf8_fixed_width_buf_loc_flags>>,
737C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
738
739=cut
740*/
741
dd237e82 742#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 743
c9cd936b
KW
744#if defined(PERL_CORE) || defined (PERL_EXT)
745
746/*
747=for apidoc is_utf8_non_invariant_string
748
749Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
750C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
751UTF-8; otherwise returns FALSE.
752
753A TRUE return means that at least one code point represented by the sequence
754either is a wide character not representable as a single byte, or the
755representation differs depending on whether the sequence is encoded in UTF-8 or
756not.
757
758See also
759C<L<perlapi/is_utf8_invariant_string>>,
760C<L<perlapi/is_utf8_string>>
761
762=cut
763
764This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
765It generally needn't be if its string is entirely UTF-8 invariant, and it
766shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
767
768It is an internal function because khw thinks that XS code shouldn't be working
769at this low a level. A valid use case could change that.
770
771*/
772
773PERL_STATIC_INLINE bool
86a87e17 774Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
775{
776 const U8 * first_variant;
777
778 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
779
780 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
781 return FALSE;
782 }
783
784 return is_utf8_string(first_variant, len - (first_variant - s));
785}
786
787#endif
788
5ff889fb 789/*
9f2abfde
KW
790=for apidoc is_strict_utf8_string
791
792Returns TRUE if the first C<len> bytes of string C<s> form a valid
793UTF-8-encoded string that is fully interchangeable by any application using
794Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
795calculated using C<strlen(s)> (which means if you use this option, that C<s>
796can't have embedded C<NUL> characters and has to have a terminating C<NUL>
797byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
798
799This function returns FALSE for strings containing any
800code points above the Unicode max of 0x10FFFF, surrogate code points, or
801non-character code points.
802
803See also
804C<L</is_utf8_invariant_string>>,
0cbf5865 805C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
806C<L</is_utf8_string>>,
807C<L</is_utf8_string_flags>>,
808C<L</is_utf8_string_loc>>,
809C<L</is_utf8_string_loc_flags>>,
810C<L</is_utf8_string_loclen>>,
811C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
812C<L</is_utf8_fixed_width_buf_flags>>,
813C<L</is_utf8_fixed_width_buf_loc_flags>>,
814C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
815C<L</is_strict_utf8_string_loc>>,
816C<L</is_strict_utf8_string_loclen>>,
817C<L</is_c9strict_utf8_string>>,
818C<L</is_c9strict_utf8_string_loc>>,
819and
820C<L</is_c9strict_utf8_string_loclen>>.
821
822=cut
823*/
824
dd237e82 825#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
826
827/*
828=for apidoc is_c9strict_utf8_string
829
830Returns TRUE if the first C<len> bytes of string C<s> form a valid
831UTF-8-encoded string that conforms to
832L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
833otherwise it returns FALSE. If C<len> is 0, it will be calculated using
834C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
835C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
836characters being ASCII constitute 'a valid UTF-8 string'.
837
838This function returns FALSE for strings containing any code points above the
839Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
840code points per
841L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
842
843See also
844C<L</is_utf8_invariant_string>>,
0cbf5865 845C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
846C<L</is_utf8_string>>,
847C<L</is_utf8_string_flags>>,
848C<L</is_utf8_string_loc>>,
849C<L</is_utf8_string_loc_flags>>,
850C<L</is_utf8_string_loclen>>,
851C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
852C<L</is_utf8_fixed_width_buf_flags>>,
853C<L</is_utf8_fixed_width_buf_loc_flags>>,
854C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
855C<L</is_strict_utf8_string>>,
856C<L</is_strict_utf8_string_loc>>,
857C<L</is_strict_utf8_string_loclen>>,
858C<L</is_c9strict_utf8_string_loc>>,
859and
860C<L</is_c9strict_utf8_string_loclen>>.
861
862=cut
863*/
864
dd237e82 865#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
866
867/*
868=for apidoc is_utf8_string_flags
869
870Returns TRUE if the first C<len> bytes of string C<s> form a valid
871UTF-8 string, subject to the restrictions imposed by C<flags>;
872returns FALSE otherwise. If C<len> is 0, it will be calculated
873using C<strlen(s)> (which means if you use this option, that C<s> can't have
874embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
875that all characters being ASCII constitute 'a valid UTF-8 string'.
876
877If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
878C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
879as C<L</is_strict_utf8_string>>; and if C<flags> is
880C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
881C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
882combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
883C<L</utf8n_to_uvchr>>, with the same meanings.
884
885See also
886C<L</is_utf8_invariant_string>>,
0cbf5865 887C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
888C<L</is_utf8_string>>,
889C<L</is_utf8_string_loc>>,
890C<L</is_utf8_string_loc_flags>>,
891C<L</is_utf8_string_loclen>>,
892C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
893C<L</is_utf8_fixed_width_buf_flags>>,
894C<L</is_utf8_fixed_width_buf_loc_flags>>,
895C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
896C<L</is_strict_utf8_string>>,
897C<L</is_strict_utf8_string_loc>>,
898C<L</is_strict_utf8_string_loclen>>,
899C<L</is_c9strict_utf8_string>>,
900C<L</is_c9strict_utf8_string_loc>>,
901and
902C<L</is_c9strict_utf8_string_loclen>>.
903
904=cut
905*/
906
907PERL_STATIC_INLINE bool
c9182d9c 908Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 909{
33756530 910 const U8 * first_variant;
9f2abfde
KW
911
912 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
913 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 914 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 915
f60f61fd
KW
916 if (len == 0) {
917 len = strlen((const char *)s);
918 }
919
9f2abfde
KW
920 if (flags == 0) {
921 return is_utf8_string(s, len);
922 }
923
d044b7a7 924 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
925 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
926 {
927 return is_strict_utf8_string(s, len);
928 }
929
d044b7a7 930 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
931 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
932 {
933 return is_c9strict_utf8_string(s, len);
934 }
935
33756530
KW
936 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
937 const U8* const send = s + len;
938 const U8* x = first_variant;
939
a0d7f935
KW
940 while (x < send) {
941 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
942 if (UNLIKELY(! cur_len)) {
943 return FALSE;
944 }
945 x += cur_len;
9f2abfde 946 }
33756530 947 }
9f2abfde
KW
948
949 return TRUE;
950}
951
952/*
5ff889fb
KW
953
954=for apidoc is_utf8_string_loc
955
2717076a 956Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 957case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 958"utf8ness success") in the C<ep> pointer.
5ff889fb 959
2717076a 960See also C<L</is_utf8_string_loclen>>.
5ff889fb 961
3964c812
KW
962=cut
963*/
964
965#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
966
967/*
968
5ff889fb
KW
969=for apidoc is_utf8_string_loclen
970
2717076a 971Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 972case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 973"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 974encoded characters in the C<el> pointer.
5ff889fb 975
2717076a 976See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
977
978=cut
979*/
980
56e4cf64 981PERL_STATIC_INLINE bool
33756530 982Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 983{
33756530 984 const U8 * first_variant;
5ff889fb
KW
985
986 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
987
33756530
KW
988 if (len == 0) {
989 len = strlen((const char *) s);
990 }
991
992 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
993 if (el)
994 *el = len;
995
996 if (ep) {
997 *ep = s + len;
998 }
999
1000 return TRUE;
1001 }
1002
1003 {
1004 const U8* const send = s + len;
1005 const U8* x = first_variant;
1006 STRLEN outlen = first_variant - s;
1007
a0d7f935
KW
1008 while (x < send) {
1009 const STRLEN cur_len = isUTF8_CHAR(x, send);
1010 if (UNLIKELY(! cur_len)) {
1011 break;
1012 }
1013 x += cur_len;
1014 outlen++;
5ff889fb 1015 }
5ff889fb 1016
a0d7f935
KW
1017 if (el)
1018 *el = outlen;
5ff889fb 1019
a0d7f935
KW
1020 if (ep) {
1021 *ep = x;
1022 }
5ff889fb 1023
a0d7f935 1024 return (x == send);
33756530 1025 }
5ff889fb
KW
1026}
1027
1028/*
9f2abfde 1029
44170c9a 1030=for apidoc isUTF8_CHAR
8ed185f9
KW
1031
1032Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1033looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1034that represents some code point; otherwise it evaluates to 0. If non-zero, the
1035value gives how many bytes starting at C<s> comprise the code point's
1036representation. Any bytes remaining before C<e>, but beyond the ones needed to
1037form the first code point in C<s>, are not examined.
1038
13aab5dd 1039The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1040extension to official UTF-8 to represent those higher than the Unicode maximum
1041of 0x10FFFF. That means that this macro is used to efficiently decide if the
1042next few bytes in C<s> is legal UTF-8 for a single character.
1043
1044Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1045defined by Unicode to be fully interchangeable across applications;
1046C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1047#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1048code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1049
1050Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1051C<L</is_utf8_string_loclen>> to check entire strings.
1052
13aab5dd
KW
1053Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1054machines) is a valid UTF-8 character.
8ed185f9
KW
1055
1056=cut
1057
1058This uses an adaptation of the table and algorithm given in
f6521f7c 1059https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1060documentation of the original version. A copyright notice for the original
1061version is given at the beginning of this file. The Perl adapation is
71525f77 1062documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1063
1064*/
1065
1066PERL_STATIC_INLINE Size_t
c9182d9c 1067Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1068{
1069 const U8 * s = s0;
1070 UV state = 0;
1071
1072 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1073
1074 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1075 * code point, which can be returned immediately. Otherwise, it is either
1076 * malformed, or for the start byte FF which the dfa doesn't handle (except
1077 * on 32-bit ASCII platforms where it trivially is an error). Call a
1078 * helper function for the other platforms. */
1079
1080 while (s < e && LIKELY(state != 1)) {
71525f77 1081 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1082 + state
71525f77 1083 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1084 if (state != 0) {
1085 s++;
1086 continue;
1087 }
1088
1089 return s - s0 + 1;
1090 }
1091
1092#if defined(UV_IS_QUAD) || defined(EBCDIC)
1093
1094 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1095 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1096 }
1097
1098#endif
1099
1100 return 0;
1101}
1102
1103/*
1104
67049a5f
KW
1105=for apidoc isSTRICT_UTF8_CHAR
1106
1107Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1108looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1109Unicode code point completely acceptable for open interchange between all
1110applications; otherwise it evaluates to 0. If non-zero, the value gives how
1111many bytes starting at C<s> comprise the code point's representation. Any
1112bytes remaining before C<e>, but beyond the ones needed to form the first code
1113point in C<s>, are not examined.
1114
1115The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1116be a surrogate nor a non-character code point. Thus this excludes any code
1117point from Perl's extended UTF-8.
1118
1119This is used to efficiently decide if the next few bytes in C<s> is
1120legal Unicode-acceptable UTF-8 for a single character.
1121
1122Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1123#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1124code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1125and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1126
1127Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1128C<L</is_strict_utf8_string_loclen>> to check entire strings.
1129
1130=cut
1131
1132This uses an adaptation of the tables and algorithm given in
f6521f7c 1133https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1134documentation of the original version. A copyright notice for the original
1135version is given at the beginning of this file. The Perl adapation is
1136documented at the definition of strict_extended_utf8_dfa_tab[].
1137
1138*/
1139
1140PERL_STATIC_INLINE Size_t
c9182d9c 1141Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1142{
1143 const U8 * s = s0;
1144 UV state = 0;
1145
1146 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1147
1148 while (s < e && LIKELY(state != 1)) {
71525f77 1149 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1150
1151 if (state != 0) {
1152 s++;
1153 continue;
1154 }
1155
1156 return s - s0 + 1;
1157 }
1158
1159#ifndef EBCDIC
1160
1161 /* The dfa above drops out for certain Hanguls; handle them specially */
1162 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1163 return 3;
1164 }
1165
1166#endif
1167
1168 return 0;
1169}
1170
1171/*
1172
44170c9a 1173=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1174
1175Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1176looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1177Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1178the value gives how many bytes starting at C<s> comprise the code point's
1179representation. Any bytes remaining before C<e>, but beyond the ones needed to
1180form the first code point in C<s>, are not examined.
1181
1182The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1183differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1184code points. This corresponds to
1185L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1186which said that non-character code points are merely discouraged rather than
1187completely forbidden in open interchange. See
1188L<perlunicode/Noncharacter code points>.
1189
1190Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1191C<L</isUTF8_CHAR_flags>> for a more customized definition.
1192
1193Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1194C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1195
1196=cut
1197
1198This uses an adaptation of the tables and algorithm given in
f6521f7c 1199https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1200documentation of the original version. A copyright notice for the original
1201version is given at the beginning of this file. The Perl adapation is
71525f77 1202documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1203
1204*/
1205
1206PERL_STATIC_INLINE Size_t
c9182d9c 1207Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1208{
1209 const U8 * s = s0;
1210 UV state = 0;
1211
1212 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1213
1214 while (s < e && LIKELY(state != 1)) {
71525f77 1215 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1216
1217 if (state != 0) {
1218 s++;
1219 continue;
1220 }
1221
1222 return s - s0 + 1;
1223 }
1224
1225 return 0;
1226}
1227
1228/*
1229
9f2abfde
KW
1230=for apidoc is_strict_utf8_string_loc
1231
1232Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1233case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1234"utf8ness success") in the C<ep> pointer.
1235
1236See also C<L</is_strict_utf8_string_loclen>>.
1237
1238=cut
1239*/
1240
1241#define is_strict_utf8_string_loc(s, len, ep) \
1242 is_strict_utf8_string_loclen(s, len, ep, 0)
1243
1244/*
1245
1246=for apidoc is_strict_utf8_string_loclen
1247
1248Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1249case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1250"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1251encoded characters in the C<el> pointer.
1252
1253See also C<L</is_strict_utf8_string_loc>>.
1254
1255=cut
1256*/
1257
1258PERL_STATIC_INLINE bool
c9182d9c 1259Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1260{
33756530 1261 const U8 * first_variant;
9f2abfde
KW
1262
1263 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1264
33756530
KW
1265 if (len == 0) {
1266 len = strlen((const char *) s);
1267 }
1268
1269 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1270 if (el)
1271 *el = len;
1272
1273 if (ep) {
1274 *ep = s + len;
1275 }
1276
1277 return TRUE;
1278 }
1279
1280 {
1281 const U8* const send = s + len;
1282 const U8* x = first_variant;
1283 STRLEN outlen = first_variant - s;
1284
a0d7f935
KW
1285 while (x < send) {
1286 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1287 if (UNLIKELY(! cur_len)) {
1288 break;
1289 }
1290 x += cur_len;
1291 outlen++;
9f2abfde 1292 }
9f2abfde 1293
a0d7f935
KW
1294 if (el)
1295 *el = outlen;
9f2abfde 1296
a0d7f935
KW
1297 if (ep) {
1298 *ep = x;
1299 }
9f2abfde 1300
a0d7f935 1301 return (x == send);
33756530 1302 }
9f2abfde
KW
1303}
1304
1305/*
1306
1307=for apidoc is_c9strict_utf8_string_loc
1308
1309Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1310the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1311"utf8ness success") in the C<ep> pointer.
1312
1313See also C<L</is_c9strict_utf8_string_loclen>>.
1314
1315=cut
1316*/
1317
1318#define is_c9strict_utf8_string_loc(s, len, ep) \
1319 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1320
1321/*
1322
1323=for apidoc is_c9strict_utf8_string_loclen
1324
1325Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1326the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1327"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1328characters in the C<el> pointer.
1329
1330See also C<L</is_c9strict_utf8_string_loc>>.
1331
1332=cut
1333*/
1334
1335PERL_STATIC_INLINE bool
c9182d9c 1336Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1337{
33756530 1338 const U8 * first_variant;
9f2abfde
KW
1339
1340 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1341
33756530
KW
1342 if (len == 0) {
1343 len = strlen((const char *) s);
1344 }
1345
1346 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1347 if (el)
1348 *el = len;
1349
1350 if (ep) {
1351 *ep = s + len;
1352 }
1353
1354 return TRUE;
1355 }
1356
1357 {
1358 const U8* const send = s + len;
1359 const U8* x = first_variant;
1360 STRLEN outlen = first_variant - s;
1361
a0d7f935
KW
1362 while (x < send) {
1363 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1364 if (UNLIKELY(! cur_len)) {
1365 break;
1366 }
1367 x += cur_len;
1368 outlen++;
9f2abfde 1369 }
9f2abfde 1370
a0d7f935
KW
1371 if (el)
1372 *el = outlen;
9f2abfde 1373
a0d7f935
KW
1374 if (ep) {
1375 *ep = x;
1376 }
9f2abfde 1377
a0d7f935 1378 return (x == send);
33756530 1379 }
9f2abfde
KW
1380}
1381
1382/*
1383
1384=for apidoc is_utf8_string_loc_flags
1385
1386Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1387case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1388"utf8ness success") in the C<ep> pointer.
1389
1390See also C<L</is_utf8_string_loclen_flags>>.
1391
1392=cut
1393*/
1394
1395#define is_utf8_string_loc_flags(s, len, ep, flags) \
1396 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1397
1398
1399/* The above 3 actual functions could have been moved into the more general one
1400 * just below, and made #defines that call it with the right 'flags'. They are
1401 * currently kept separate to increase their chances of getting inlined */
1402
1403/*
1404
1405=for apidoc is_utf8_string_loclen_flags
1406
1407Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1408case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1409"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1410encoded characters in the C<el> pointer.
1411
1412See also C<L</is_utf8_string_loc_flags>>.
1413
1414=cut
1415*/
1416
1417PERL_STATIC_INLINE bool
c9182d9c 1418Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1419{
33756530 1420 const U8 * first_variant;
9f2abfde
KW
1421
1422 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1423 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1424 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1425
f60f61fd 1426 if (len == 0) {
a0d7f935 1427 len = strlen((const char *) s);
f60f61fd
KW
1428 }
1429
9f2abfde
KW
1430 if (flags == 0) {
1431 return is_utf8_string_loclen(s, len, ep, el);
1432 }
1433
d044b7a7 1434 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1435 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1436 {
1437 return is_strict_utf8_string_loclen(s, len, ep, el);
1438 }
1439
d044b7a7 1440 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1441 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1442 {
1443 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1444 }
1445
33756530
KW
1446 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1447 if (el)
1448 *el = len;
1449
1450 if (ep) {
1451 *ep = s + len;
1452 }
1453
1454 return TRUE;
1455 }
1456
1457 {
1458 const U8* send = s + len;
1459 const U8* x = first_variant;
1460 STRLEN outlen = first_variant - s;
1461
a0d7f935
KW
1462 while (x < send) {
1463 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1464 if (UNLIKELY(! cur_len)) {
1465 break;
1466 }
1467 x += cur_len;
1468 outlen++;
9f2abfde 1469 }
9f2abfde 1470
a0d7f935
KW
1471 if (el)
1472 *el = outlen;
9f2abfde 1473
a0d7f935
KW
1474 if (ep) {
1475 *ep = x;
1476 }
9f2abfde 1477
a0d7f935 1478 return (x == send);
33756530 1479 }
9f2abfde
KW
1480}
1481
1482/*
7c93d8f0
KW
1483=for apidoc utf8_distance
1484
1485Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1486and C<b>.
1487
1488WARNING: use only if you *know* that the pointers point inside the
1489same UTF-8 buffer.
1490
1491=cut
1492*/
1493
1494PERL_STATIC_INLINE IV
1495Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1496{
1497 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1498
1499 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1500}
1501
1502/*
1503=for apidoc utf8_hop
1504
1505Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1506forward or backward.
1507
1508WARNING: do not use the following unless you *know* C<off> is within
1509the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1510on the first byte of character or just after the last byte of a character.
1511
1512=cut
1513*/
1514
1515PERL_STATIC_INLINE U8 *
1516Perl_utf8_hop(const U8 *s, SSize_t off)
1517{
1518 PERL_ARGS_ASSERT_UTF8_HOP;
1519
1520 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1521 * the bitops (especially ~) can create illegal UTF-8.
1522 * In other words: in Perl UTF-8 is not just for Unicode. */
1523
1524 if (off >= 0) {
1525 while (off--)
1526 s += UTF8SKIP(s);
1527 }
1528 else {
1529 while (off++) {
1530 s--;
1531 while (UTF8_IS_CONTINUATION(*s))
1532 s--;
1533 }
1534 }
e099ea69 1535 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1536 return (U8 *)s;
e099ea69 1537 GCC_DIAG_RESTORE
7c93d8f0
KW
1538}
1539
4dab108f 1540/*
65df57a8
TC
1541=for apidoc utf8_hop_forward
1542
1543Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1544forward.
1545
1546C<off> must be non-negative.
1547
1548C<s> must be before or equal to C<end>.
1549
1550When moving forward it will not move beyond C<end>.
1551
1552Will not exceed this limit even if the string is not valid "UTF-8".
1553
1554=cut
1555*/
1556
1557PERL_STATIC_INLINE U8 *
1558Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1559{
1560 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1561
1562 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1563 * the bitops (especially ~) can create illegal UTF-8.
1564 * In other words: in Perl UTF-8 is not just for Unicode. */
1565
1566 assert(s <= end);
1567 assert(off >= 0);
1568
1569 while (off--) {
1570 STRLEN skip = UTF8SKIP(s);
de979548 1571 if ((STRLEN)(end - s) <= skip) {
e099ea69 1572 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1573 return (U8 *)end;
e099ea69 1574 GCC_DIAG_RESTORE
de979548 1575 }
65df57a8
TC
1576 s += skip;
1577 }
1578
e099ea69 1579 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1580 return (U8 *)s;
e099ea69 1581 GCC_DIAG_RESTORE
65df57a8
TC
1582}
1583
1584/*
1585=for apidoc utf8_hop_back
1586
1587Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1588backward.
1589
1590C<off> must be non-positive.
1591
1592C<s> must be after or equal to C<start>.
1593
1594When moving backward it will not move before C<start>.
1595
1596Will not exceed this limit even if the string is not valid "UTF-8".
1597
1598=cut
1599*/
1600
1601PERL_STATIC_INLINE U8 *
1602Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1603{
1604 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1605
1606 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1607 * the bitops (especially ~) can create illegal UTF-8.
1608 * In other words: in Perl UTF-8 is not just for Unicode. */
1609
1610 assert(start <= s);
1611 assert(off <= 0);
1612
1613 while (off++ && s > start) {
e7185695 1614 do {
65df57a8 1615 s--;
e7185695 1616 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 1617 }
f6521f7c 1618
e099ea69 1619 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1620 return (U8 *)s;
e099ea69 1621 GCC_DIAG_RESTORE
65df57a8
TC
1622}
1623
1624/*
1625=for apidoc utf8_hop_safe
1626
1627Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1628either forward or backward.
1629
1630When moving backward it will not move before C<start>.
1631
1632When moving forward it will not move beyond C<end>.
1633
1634Will not exceed those limits even if the string is not valid "UTF-8".
1635
1636=cut
1637*/
1638
1639PERL_STATIC_INLINE U8 *
1640Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1641{
1642 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1643
1644 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1645 * the bitops (especially ~) can create illegal UTF-8.
1646 * In other words: in Perl UTF-8 is not just for Unicode. */
1647
1648 assert(start <= s && s <= end);
1649
1650 if (off >= 0) {
1651 return utf8_hop_forward(s, off, end);
1652 }
1653 else {
1654 return utf8_hop_back(s, off, start);
1655 }
1656}
1657
1658/*
4dab108f
KW
1659
1660=for apidoc is_utf8_valid_partial_char
1661
6cbb9248
KW
1662Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1663S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1664points. Otherwise, it returns 1 if there exists at least one non-empty
1665sequence of bytes that when appended to sequence C<s>, starting at position
1666C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1667otherwise returns 0.
1668
1669In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1670point.
1671
1672This is useful when a fixed-length buffer is being tested for being well-formed
1673UTF-8, but the final few bytes in it don't comprise a full character; that is,
1674it is split somewhere in the middle of the final code point's UTF-8
1675representation. (Presumably when the buffer is refreshed with the next chunk
1676of data, the new first bytes will complete the partial code point.) This
1677function is used to verify that the final bytes in the current buffer are in
1678fact the legal beginning of some code point, so that if they aren't, the
1679failure can be signalled without having to wait for the next read.
4dab108f
KW
1680
1681=cut
1682*/
2717076a
KW
1683#define is_utf8_valid_partial_char(s, e) \
1684 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1685
1686/*
1687
1688=for apidoc is_utf8_valid_partial_char_flags
1689
1690Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1691or not the input is a valid UTF-8 encoded partial character, but it takes an
1692extra parameter, C<flags>, which can further restrict which code points are
1693considered valid.
1694
1695If C<flags> is 0, this behaves identically to
1696C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1697of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1698there is any sequence of bytes that can complete the input partial character in
1699such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1700TRUE; otherwise FALSE. Non character code points cannot be determined based on
1701partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1702determined from just the first one or two bytes.
1703
1704=cut
1705 */
1706
56e4cf64 1707PERL_STATIC_INLINE bool
c9182d9c 1708Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1709{
f1c999a7 1710 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1711
f1c999a7 1712 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1713 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1714
8875bd48 1715 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1716 return FALSE;
1717 }
1718
1376b35c 1719 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1720}
1721
8bc127bf
KW
1722/*
1723
1724=for apidoc is_utf8_fixed_width_buf_flags
1725
1726Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1727is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1728otherwise it returns FALSE.
1729
1730If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1731without restriction. If the final few bytes of the buffer do not form a
1732complete code point, this will return TRUE anyway, provided that
1733C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1734
1735If C<flags> in non-zero, it can be any combination of the
1736C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1737same meanings.
1738
1739This function differs from C<L</is_utf8_string_flags>> only in that the latter
1740returns FALSE if the final few bytes of the string don't form a complete code
1741point.
1742
1743=cut
1744 */
1745#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1746 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1747
1748/*
1749
1750=for apidoc is_utf8_fixed_width_buf_loc_flags
1751
1752Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1753failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1754to the beginning of any partial character at the end of the buffer; if there is
1755no partial character C<*ep> will contain C<s>+C<len>.
1756
1757See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1758
1759=cut
1760*/
1761
1762#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1763 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1764
1765/*
1766
1767=for apidoc is_utf8_fixed_width_buf_loclen_flags
1768
1769Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1770complete, valid characters found in the C<el> pointer.
1771
1772=cut
1773*/
1774
1775PERL_STATIC_INLINE bool
c9182d9c 1776Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1777 STRLEN len,
8bc127bf
KW
1778 const U8 **ep,
1779 STRLEN *el,
1780 const U32 flags)
1781{
1782 const U8 * maybe_partial;
1783
1784 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1785
1786 if (! ep) {
1787 ep = &maybe_partial;
1788 }
1789
1790 /* If it's entirely valid, return that; otherwise see if the only error is
1791 * that the final few bytes are for a partial character */
1792 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1793 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1794}
1795
e6a4ffc3 1796PERL_STATIC_INLINE UV
c9182d9c 1797Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1798 STRLEN curlen,
1799 STRLEN *retlen,
1800 const U32 flags,
1801 U32 * errors,
1802 AV ** msgs)
1803{
1804 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1805 * simple cases, and, if necessary calls a helper function to deal with the
1806 * more complex ones. Almost all well-formed non-problematic code points
1807 * are considered simple, so that it's unlikely that the helper function
1808 * will need to be called.
1809 *
1810 * This is an adaptation of the tables and algorithm given in
f6521f7c 1811 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
1812 * comprehensive documentation of the original version. A copyright notice
1813 * for the original version is given at the beginning of this file. The
71525f77 1814 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1815 */
1816
1817 const U8 * const s0 = s;
1818 const U8 * send = s0 + curlen;
f73cfe7d 1819 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1820 UV state = 0;
1821
1822 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1823
1824 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1825 * non-problematic code point, which can be returned immediately.
1826 * Otherwise we call a helper function to figure out the more complicated
1827 * cases. */
1828
1829 while (s < send && LIKELY(state != 1)) {
71525f77 1830 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1831
1832 uv = (state == 0)
1833 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1834 : UTF8_ACCUMULATE(uv, *s);
71525f77 1835 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1836
1837 if (state != 0) {
1838 s++;
1839 continue;
1840 }
1841
1842 if (retlen) {
1843 *retlen = s - s0 + 1;
1844 }
1845 if (errors) {
1846 *errors = 0;
1847 }
1848 if (msgs) {
1849 *msgs = NULL;
1850 }
1851
cfebc7aa 1852 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1853 }
1854
1855 /* Here is potentially problematic. Use the full mechanism */
1856 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1857}
1858
82651abe 1859PERL_STATIC_INLINE UV
9a9a6c98 1860Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1861{
9a9a6c98 1862 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1863
1864 assert(s < send);
1865
1866 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1867
1868 /* EMPTY is not really allowed, and asserts on debugging builds. But
1869 * on non-debugging we have to deal with it, and this causes it to
1870 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1871 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1872 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1873 }
1874 else {
1875 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1876 if (retlen && ret == 0 && *s != '\0') {
1877 *retlen = (STRLEN) -1;
1878 }
1879
1880 return ret;
1881 }
1882}
1883
c8028aa6
TC
1884/* ------------------------------- perl.h ----------------------------- */
1885
1886/*
3f620621 1887=for apidoc_section $utility
dcccc8ff 1888
44170c9a 1889=for apidoc is_safe_syscall
c8028aa6 1890
1a0efc9a
KW
1891Test that the given C<pv> (with length C<len>) doesn't contain any internal
1892C<NUL> characters.
1893If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1894category, and return FALSE.
c8028aa6
TC
1895
1896Return TRUE if the name is safe.
1897
1a0efc9a
KW
1898C<what> and C<op_name> are used in any warning.
1899
796b6530 1900Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1901
1902=cut
1903*/
1904
1905PERL_STATIC_INLINE bool
ffd62fc2
KW
1906Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1907{
c8028aa6
TC
1908 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1909 * perl itself uses xce*() functions which accept 8-bit strings.
1910 */
1911
1912 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1913
6c4650b3 1914 if (len > 1) {
c8028aa6 1915 char *null_at;
41188aa0 1916 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1917 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1918 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1919 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1920 what, op_name, pv, null_at+1);
c8028aa6
TC
1921 return FALSE;
1922 }
1923 }
1924
1925 return TRUE;
1926}
1927
1928/*
7cb3f959
TC
1929
1930Return true if the supplied filename has a newline character
fa6c7d00 1931immediately before the first (hopefully only) NUL.
7cb3f959
TC
1932
1933My original look at this incorrectly used the len from SvPV(), but
1934that's incorrect, since we allow for a NUL in pv[len-1].
1935
1936So instead, strlen() and work from there.
1937
1938This allow for the user reading a filename, forgetting to chomp it,
1939then calling:
1940
1941 open my $foo, "$file\0";
1942
1943*/
1944
1945#ifdef PERL_CORE
1946
1947PERL_STATIC_INLINE bool
ffd62fc2
KW
1948S_should_warn_nl(const char *pv)
1949{
7cb3f959
TC
1950 STRLEN len;
1951
1952 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1953
1954 len = strlen(pv);
1955
1956 return len > 0 && pv[len-1] == '\n';
1957}
1958
1959#endif
1960
3a019afd
KW
1961#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1962
1963PERL_STATIC_INLINE bool
1964S_lossless_NV_to_IV(const NV nv, IV *ivp)
1965{
1966 /* This function determines if the input NV 'nv' may be converted without
1967 * loss of data to an IV. If not, it returns FALSE taking no other action.
1968 * But if it is possible, it does the conversion, returning TRUE, and
1969 * storing the converted result in '*ivp' */
1970
1971 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1972
cd304e76
DM
1973# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
1974 /* Normally any comparison with a NaN returns false; if we can't rely
1975 * on that behaviour, check explicitly */
3a019afd
KW
1976 if (UNLIKELY(Perl_isnan(nv))) {
1977 return FALSE;
1978 }
3a019afd
KW
1979# endif
1980
cd304e76
DM
1981 /* Written this way so that with an always-false NaN comparison we
1982 * return false */
ef0a8475 1983 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
3a019afd
KW
1984 return FALSE;
1985 }
1986
1987 if ((IV) nv != nv) {
1988 return FALSE;
1989 }
1990
1991 *ivp = (IV) nv;
1992 return TRUE;
1993}
1994
1995#endif
1996
81d52ecd
JH
1997/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1998
94b0cb42
KW
1999#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2000
81d52ecd
JH
2001#define MAX_CHARSET_NAME_LENGTH 2
2002
2003PERL_STATIC_INLINE const char *
94b0cb42 2004S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2005{
94b0cb42
KW
2006 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2007
81d52ecd
JH
2008 /* Returns a string that corresponds to the name of the regex character set
2009 * given by 'flags', and *lenp is set the length of that string, which
2010 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2011
2012 *lenp = 1;
2013 switch (get_regex_charset(flags)) {
2014 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2015 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2016 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2017 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2018 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2019 *lenp = 2;
2020 return ASCII_MORE_RESTRICT_PAT_MODS;
2021 }
2022 /* The NOT_REACHED; hides an assert() which has a rather complex
2023 * definition in perl.h. */
2024 NOT_REACHED; /* NOTREACHED */
2025 return "?"; /* Unknown */
2026}
2027
94b0cb42
KW
2028#endif
2029
7cb3f959 2030/*
ed382232
TC
2031
2032Return false if any get magic is on the SV other than taint magic.
2033
2034*/
2035
2036PERL_STATIC_INLINE bool
ffd62fc2
KW
2037Perl_sv_only_taint_gmagic(SV *sv)
2038{
ed382232
TC
2039 MAGIC *mg = SvMAGIC(sv);
2040
2041 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2042
2043 while (mg) {
2044 if (mg->mg_type != PERL_MAGIC_taint
2045 && !(mg->mg_flags & MGf_GSKIP)
2046 && mg->mg_virtual->svt_get) {
2047 return FALSE;
2048 }
2049 mg = mg->mg_moremagic;
2050 }
2051
2052 return TRUE;
2053}
2054
ed8ff0f3
DM
2055/* ------------------ cop.h ------------------------------------------- */
2056
5b6f7443
DM
2057/* implement GIMME_V() macro */
2058
2059PERL_STATIC_INLINE U8
2060Perl_gimme_V(pTHX)
2061{
2062 I32 cxix;
2063 U8 gimme = (PL_op->op_flags & OPf_WANT);
2064
2065 if (gimme)
2066 return gimme;
2067 cxix = PL_curstackinfo->si_cxsubix;
2068 if (cxix < 0)
390fe0c0 2069 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
5b6f7443
DM
2070 assert(cxstack[cxix].blk_gimme & G_WANT);
2071 return (cxstack[cxix].blk_gimme & G_WANT);
2072}
2073
ed8ff0f3
DM
2074
2075/* Enter a block. Push a new base context and return its address. */
2076
2077PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2078Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2079{
2080 PERL_CONTEXT * cx;
2081
2082 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2083
2084 CXINC;
2085 cx = CX_CUR();
2086 cx->cx_type = type;
2087 cx->blk_gimme = gimme;
2088 cx->blk_oldsaveix = saveix;
4caf7d8c 2089 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2090 cx->blk_oldcop = PL_curcop;
4caf7d8c 2091 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2092 cx->blk_oldscopesp = PL_scopestack_ix;
2093 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2094 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2095
2096 PL_tmps_floor = PL_tmps_ix;
2097 CX_DEBUG(cx, "PUSH");
2098 return cx;
2099}
2100
2101
2102/* Exit a block (RETURN and LAST). */
2103
2104PERL_STATIC_INLINE void
c9182d9c 2105Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2106{
2107 PERL_ARGS_ASSERT_CX_POPBLOCK;
2108
2109 CX_DEBUG(cx, "POP");
2110 /* these 3 are common to cx_popblock and cx_topblock */
2111 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2112 PL_scopestack_ix = cx->blk_oldscopesp;
2113 PL_curpm = cx->blk_oldpm;
2114
2115 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2116 * and leaves a CX entry lying around for repeated use, so
2117 * skip for multicall */ \
2118 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2119 || PL_savestack_ix == cx->blk_oldsaveix);
2120 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2121 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2122}
2123
2124/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2125 * Whereas cx_popblock() restores the state to the point just before
2126 * cx_pushblock() was called, cx_topblock() restores it to the point just
2127 * *after* cx_pushblock() was called. */
2128
2129PERL_STATIC_INLINE void
c9182d9c 2130Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2131{
2132 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2133
2134 CX_DEBUG(cx, "TOP");
2135 /* these 3 are common to cx_popblock and cx_topblock */
2136 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2137 PL_scopestack_ix = cx->blk_oldscopesp;
2138 PL_curpm = cx->blk_oldpm;
2139
2140 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2141}
2142
2143
a73d8813 2144PERL_STATIC_INLINE void
c9182d9c 2145Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2146{
2147 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2148
2149 PERL_ARGS_ASSERT_CX_PUSHSUB;
2150
3f6bd23a 2151 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2152 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2153 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2154 cx->blk_sub.cv = cv;
2155 cx->blk_sub.olddepth = CvDEPTH(cv);
2156 cx->blk_sub.prevcomppad = PL_comppad;
2157 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2158 cx->blk_sub.retop = retop;
2159 SvREFCNT_inc_simple_void_NN(cv);
2160 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2161}
2162
2163
2164/* subsets of cx_popsub() */
2165
2166PERL_STATIC_INLINE void
c9182d9c 2167Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2168{
2169 CV *cv;
2170
2171 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2172 assert(CxTYPE(cx) == CXt_SUB);
2173
2174 PL_comppad = cx->blk_sub.prevcomppad;
2175 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2176 cv = cx->blk_sub.cv;
2177 CvDEPTH(cv) = cx->blk_sub.olddepth;
2178 cx->blk_sub.cv = NULL;
2179 SvREFCNT_dec(cv);
5b6f7443 2180 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2181}
2182
2183
2184/* handle the @_ part of leaving a sub */
2185
2186PERL_STATIC_INLINE void
c9182d9c 2187Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2188{
2189 AV *av;
2190
2191 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2192 assert(CxTYPE(cx) == CXt_SUB);
2193 assert(AvARRAY(MUTABLE_AV(
2194 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2195 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2196
2197 CX_POP_SAVEARRAY(cx);
2198 av = MUTABLE_AV(PAD_SVl(0));
2199 if (UNLIKELY(AvREAL(av)))
2200 /* abandon @_ if it got reified */
2201 clear_defarray(av, 0);
2202 else {
2203 CLEAR_ARGARRAY(av);
2204 }
2205}
2206
2207
2208PERL_STATIC_INLINE void
c9182d9c 2209Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2210{
2211 PERL_ARGS_ASSERT_CX_POPSUB;
2212 assert(CxTYPE(cx) == CXt_SUB);
2213
3f6bd23a 2214 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2215
2216 if (CxHASARGS(cx))
2217 cx_popsub_args(cx);
2218 cx_popsub_common(cx);
2219}
2220
2221
6a7d52cc 2222PERL_STATIC_INLINE void
c9182d9c 2223Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2224{
2225 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2226
5b6f7443
DM
2227 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2228 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2229 cx->blk_format.cv = cv;
2230 cx->blk_format.retop = retop;
2231 cx->blk_format.gv = gv;
2232 cx->blk_format.dfoutgv = PL_defoutgv;
2233 cx->blk_format.prevcomppad = PL_comppad;
2234 cx->blk_u16 = 0;
2235
2236 SvREFCNT_inc_simple_void_NN(cv);
2237 CvDEPTH(cv)++;
2238 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2239}
2240
2241
2242PERL_STATIC_INLINE void
c9182d9c 2243Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2244{
2245 CV *cv;
2246 GV *dfout;
2247
2248 PERL_ARGS_ASSERT_CX_POPFORMAT;
2249 assert(CxTYPE(cx) == CXt_FORMAT);
2250
2251 dfout = cx->blk_format.dfoutgv;
2252 setdefout(dfout);
2253 cx->blk_format.dfoutgv = NULL;
2254 SvREFCNT_dec_NN(dfout);
2255
2256 PL_comppad = cx->blk_format.prevcomppad;
2257 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2258 cv = cx->blk_format.cv;
2259 cx->blk_format.cv = NULL;
2260 --CvDEPTH(cv);
2261 SvREFCNT_dec_NN(cv);
5b6f7443 2262 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2263}
2264
2265
13febba5 2266PERL_STATIC_INLINE void
c9182d9c 2267Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2268{
2269 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2270
5b6f7443
DM
2271 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2272 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
13febba5
DM
2273 cx->blk_eval.retop = retop;
2274 cx->blk_eval.old_namesv = namesv;
2275 cx->blk_eval.old_eval_root = PL_eval_root;
2276 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2277 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2278 cx->blk_eval.cur_top_env = PL_top_env;
2279
4c57ced5 2280 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2281 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2282 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2283}
2284
2285
2286PERL_STATIC_INLINE void
c9182d9c 2287Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2288{
2289 SV *sv;
2290
2291 PERL_ARGS_ASSERT_CX_POPEVAL;
2292 assert(CxTYPE(cx) == CXt_EVAL);
2293
2294 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2295 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2296 PL_eval_root = cx->blk_eval.old_eval_root;
2297 sv = cx->blk_eval.cur_text;
4c57ced5 2298 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2299 cx->blk_eval.cur_text = NULL;
2300 SvREFCNT_dec_NN(sv);
2301 }
2302
2303 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2304 if (sv) {
2305 cx->blk_eval.old_namesv = NULL;
2306 SvREFCNT_dec_NN(sv);
2307 }
5b6f7443 2308 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2309}
6a7d52cc 2310
a73d8813 2311
d1b6bf72
DM
2312/* push a plain loop, i.e.
2313 * { block }
2314 * while (cond) { block }
2315 * for (init;cond;continue) { block }
2316 * This loop can be last/redo'ed etc.
2317 */
2318
2319PERL_STATIC_INLINE void
c9182d9c 2320Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2321{
2322 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2323 cx->blk_loop.my_op = cLOOP;
2324}
2325
2326
2327/* push a true for loop, i.e.
2328 * for var (list) { block }
2329 */
2330
2331PERL_STATIC_INLINE void
c9182d9c 2332Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2333{
2334 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2335
2336 /* this one line is common with cx_pushloop_plain */
2337 cx->blk_loop.my_op = cLOOP;
2338
2339 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2340 cx->blk_loop.itersave = itersave;
2341#ifdef USE_ITHREADS
2342 cx->blk_loop.oldcomppad = PL_comppad;
2343#endif
2344}
2345
2346
2347/* pop all loop types, including plain */
2348
2349PERL_STATIC_INLINE void
c9182d9c 2350Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2351{
2352 PERL_ARGS_ASSERT_CX_POPLOOP;
2353
2354 assert(CxTYPE_is_LOOP(cx));
2355 if ( CxTYPE(cx) == CXt_LOOP_ARY
2356 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2357 {
2358 /* Free ary or cur. This assumes that state_u.ary.ary
2359 * aligns with state_u.lazysv.cur. See cx_dup() */
2360 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2361 cx->blk_loop.state_u.lazysv.cur = NULL;
2362 SvREFCNT_dec_NN(sv);
2363 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2364 sv = cx->blk_loop.state_u.lazysv.end;
2365 cx->blk_loop.state_u.lazysv.end = NULL;
2366 SvREFCNT_dec_NN(sv);
2367 }
2368 }
2369 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2370 SV *cursv;
2371 SV **svp = (cx)->blk_loop.itervar_u.svp;
2372 if ((cx->cx_type & CXp_FOR_GV))
2373 svp = &GvSV((GV*)svp);
2374 cursv = *svp;
2375 *svp = cx->blk_loop.itersave;
2376 cx->blk_loop.itersave = NULL;
2377 SvREFCNT_dec(cursv);
2378 }
2379}
2380
2a7b7c61
DM
2381
2382PERL_STATIC_INLINE void
c9182d9c 2383Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2384{
7896dde7 2385 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2386
7896dde7 2387 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2388}
2389
2390
2391PERL_STATIC_INLINE void
c9182d9c 2392Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2393{
7896dde7
Z
2394 PERL_ARGS_ASSERT_CX_POPWHEN;
2395 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2396
2397 PERL_UNUSED_ARG(cx);
59a14f30 2398 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2399 /* currently NOOP */
2400}
2401
2402
7896dde7 2403PERL_STATIC_INLINE void
c9182d9c 2404Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2405{
2406 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2407
2408 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2409 cx->blk_givwhen.defsv_save = orig_defsv;
2410}
2411
2412
2413PERL_STATIC_INLINE void
c9182d9c 2414Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2415{
2416 SV *sv;
2417
2418 PERL_ARGS_ASSERT_CX_POPGIVEN;
2419 assert(CxTYPE(cx) == CXt_GIVEN);
2420
2421 sv = GvSV(PL_defgv);
2422 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2423 cx->blk_givwhen.defsv_save = NULL;
2424 SvREFCNT_dec(sv);
2425}
2426
ec2c235b
KW
2427/* ------------------ util.h ------------------------------------------- */
2428
2429/*
3f620621 2430=for apidoc_section $string
ec2c235b
KW
2431
2432=for apidoc foldEQ
2433
2434Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2435same
2436case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2437match themselves and their opposite case counterparts. Non-cased and non-ASCII
2438range bytes match only themselves.
2439
2440=cut
2441*/
2442
2443PERL_STATIC_INLINE I32
2444Perl_foldEQ(const char *s1, const char *s2, I32 len)
2445{
2446 const U8 *a = (const U8 *)s1;
2447 const U8 *b = (const U8 *)s2;
2448
2449 PERL_ARGS_ASSERT_FOLDEQ;
2450
2451 assert(len >= 0);
2452
2453 while (len--) {
2454 if (*a != *b && *a != PL_fold[*b])
2455 return 0;
2456 a++,b++;
2457 }
2458 return 1;
2459}
2460
0f9cb40c 2461PERL_STATIC_INLINE I32
ec2c235b
KW
2462Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2463{
79a1fabd
KW
2464 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2465 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2466 * does not check for this. Nor does it check that the strings each have
2467 * at least 'len' characters. */
ec2c235b
KW
2468
2469 const U8 *a = (const U8 *)s1;
2470 const U8 *b = (const U8 *)s2;
2471
2472 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2473
2474 assert(len >= 0);
2475
2476 while (len--) {
2477 if (*a != *b && *a != PL_fold_latin1[*b]) {
2478 return 0;
2479 }
2480 a++, b++;
2481 }
2482 return 1;
2483}
2484
2485/*
3f620621 2486=for apidoc_section $locale
ec2c235b
KW
2487=for apidoc foldEQ_locale
2488
2489Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2490same case-insensitively in the current locale; false otherwise.
2491
2492=cut
2493*/
2494
0f9cb40c 2495PERL_STATIC_INLINE I32
ec2c235b
KW
2496Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2497{
ec2c235b
KW
2498 const U8 *a = (const U8 *)s1;
2499 const U8 *b = (const U8 *)s2;
2500
2501 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2502
2503 assert(len >= 0);
2504
2505 while (len--) {
2506 if (*a != *b && *a != PL_fold_locale[*b])
2507 return 0;
2508 a++,b++;
2509 }
2510 return 1;
2511}
2512
1ab100a8 2513/*
3f620621 2514=for apidoc_section $string
1ab100a8
KW
2515=for apidoc my_strnlen
2516
2517The C library C<strnlen> if available, or a Perl implementation of it.
2518
2519C<my_strnlen()> computes the length of the string, up to C<maxlen>
a3815e44 2520characters. It will never attempt to address more than C<maxlen>
1ab100a8
KW
2521characters, making it suitable for use with strings that are not
2522guaranteed to be NUL-terminated.
2523
2524=cut
2525
2526Description stolen from http://man.openbsd.org/strnlen.3,
2527implementation stolen from PostgreSQL.
2528*/
2529#ifndef HAS_STRNLEN
2530
2531PERL_STATIC_INLINE Size_t
2532Perl_my_strnlen(const char *str, Size_t maxlen)
2533{
2534 const char *end = (char *) memchr(str, '\0', maxlen);
2535
2536 PERL_ARGS_ASSERT_MY_STRNLEN;
2537
2538 if (end == NULL) return maxlen;
2539 return end - str;
2540}
2541
2542#endif
2543
6dba01e2
KW
2544#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2545
2546PERL_STATIC_INLINE void *
2547S_my_memrchr(const char * s, const char c, const STRLEN len)
2548{
2549 /* memrchr(), since many platforms lack it */
2550
2551 const char * t = s + len - 1;
2552
2553 PERL_ARGS_ASSERT_MY_MEMRCHR;
2554
2555 while (t >= s) {
2556 if (*t == c) {
2557 return (void *) t;
2558 }
2559 t--;
2560 }
2561
2562 return NULL;
2563}
2564
2565#endif
2566
24f3e849
KW
2567PERL_STATIC_INLINE char *
2568Perl_mortal_getenv(const char * str)
2569{
2570 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
2571 *
2572 * It's (mostly) thread-safe because it uses a mutex to prevent
2573 * simultaneous access from other threads that use the same mutex, and
2574 * makes a copy of the result before releasing that mutex. All of the Perl
2575 * core uses that mutex, but, like all mutexes, everything has to cooperate
2576 * for it to completely work. It is possible for code from, say XS, to not
2577 * use this mutex, defeating the safety.
2578 *
2579 * On some platforms, getenv() is not sequential-call-safe, because
2580 * subsequent calls destroy the static storage inside the C library
2581 * returned by an earlier call. The result must be copied or completely
2582 * acted upon before a subsequent getenv call. Those calls could come from
2583 * another thread. Again, making a copy while controlling the mutex
2584 * prevents these problems..
2585 *
2586 * To prevent leaks, the copy is made by creating a new SV containing it,
2587 * mortalizing the SV, and returning the SV's string (the copy). Thus this
2588 * is a drop-in replacement for getenv().
2589 *
2590 * A complication is that this can be called during phases where the
2591 * mortalization process isn't available. These are in interpreter
2592 * destruction or early in construction. khw believes that at these times
2593 * there shouldn't be anything else going on, so plain getenv is safe AS
2594 * LONG AS the caller acts on the return before calling it again. */
2595
2596 char * ret;
2597 dTHX;
2598
2599 PERL_ARGS_ASSERT_MORTAL_GETENV;
2600
2601 /* Can't mortalize without stacks. khw believes that no other threads
2602 * should be running, so no need to lock things, and this may be during a
2603 * phase when locking isn't even available */
2604 if (UNLIKELY(PL_scopestack_ix == 0)) {
2605 return getenv(str);
2606 }
2607
2608 ENV_LOCK;
2609
2610 ret = getenv(str);
2611
2612 if (ret != NULL) {
2613 ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
2614 }
2615
2616 ENV_UNLOCK;
2617 return ret;
2618}
2619
ed382232 2620/*
c8028aa6
TC
2621 * ex: set ts=8 sts=4 sw=4 et:
2622 */