This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
An initial implementation of builtin::indexed
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
87306e06 42/*
3f620621 43=for apidoc_section $AV
87306e06
KW
44=for apidoc av_count
45Returns the number of elements in the array C<av>. This is the true length of
46the array, including any undefined elements. It is always the same as
47S<C<av_top_index(av) + 1>>.
48
49=cut
50*/
51PERL_STATIC_INLINE Size_t
52Perl_av_count(pTHX_ AV *av)
be3a7a5d 53{
87306e06 54 PERL_ARGS_ASSERT_AV_COUNT;
be3a7a5d
KW
55 assert(SvTYPE(av) == SVt_PVAV);
56
87306e06 57 return AvFILL(av) + 1;
be3a7a5d
KW
58}
59
84c75204
RL
60/* ------------------------------- av.c ------------------------------- */
61
62/*
63=for apidoc av_store_simple
64
65This is a cut-down version of av_store that assumes that the array is
66very straightforward - no magic, not readonly, and AvREAL - and that
67C<key> is not negative. This function MUST NOT be used in situations
68where any of those assumptions may not hold.
69
70Stores an SV in an array. The array index is specified as C<key>. The
71return value can be dereferenced to get the C<SV*> that was stored there (= C<val>).
72
73Note that the caller is responsible for suitably incrementing the reference
74count of C<val> before the call.
75
76Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
77
78=cut
79*/
80
81PERL_STATIC_INLINE SV**
82Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
83{
56077d95
LT
84 SV** ary;
85
84c75204
RL
86 PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
87 assert(SvTYPE(av) == SVt_PVAV);
88 assert(!SvMAGICAL(av));
89 assert(!SvREADONLY(av));
90 assert(AvREAL(av));
91 assert(key > -1);
92
56077d95 93 ary = AvARRAY(av);
84c75204
RL
94
95 if (AvFILLp(av) < key) {
96 if (key > AvMAX(av)) {
97 av_extend(av,key);
98 ary = AvARRAY(av);
99 }
100 AvFILLp(av) = key;
101 } else
102 SvREFCNT_dec(ary[key]);
103
104 ary[key] = val;
105 return &ary[key];
106}
107
108/*
109=for apidoc av_fetch_simple
110
111This is a cut-down version of av_fetch that assumes that the array is
112very straightforward - no magic, not readonly, and AvREAL - and that
113C<key> is not negative. This function MUST NOT be used in situations
114where any of those assumptions may not hold.
115
116Returns the SV at the specified index in the array. The C<key> is the
117index. If lval is true, you are guaranteed to get a real SV back (in case
118it wasn't real before), which you can then modify. Check that the return
119value is non-null before dereferencing it to a C<SV*>.
120
121The rough perl equivalent is C<$myarray[$key]>.
122
123=cut
124*/
125
126PERL_STATIC_INLINE SV**
127Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
128{
129 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
130 assert(SvTYPE(av) == SVt_PVAV);
131 assert(!SvMAGICAL(av));
132 assert(!SvREADONLY(av));
133 assert(AvREAL(av));
134 assert(key > -1);
135
136 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
8fcb2425 137 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
84c75204
RL
138 } else {
139 return &AvARRAY(av)[key];
140 }
141}
142
1afe1db1
FC
143/* ------------------------------- cv.h ------------------------------- */
144
fa3e44c0 145/*
3f620621 146=for apidoc_section $CV
fa3e44c0
KW
147=for apidoc CvGV
148Returns the GV associated with the CV C<sv>, reifying it if necessary.
149
150=cut
151*/
ae77754a 152PERL_STATIC_INLINE GV *
c9182d9c 153Perl_CvGV(pTHX_ CV *sv)
ae77754a 154{
74804ad1
KW
155 PERL_ARGS_ASSERT_CVGV;
156
ae77754a 157 return CvNAMED(sv)
1604cfb0
MS
158 ? Perl_cvgv_from_hek(aTHX_ sv)
159 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
ae77754a
FC
160}
161
1afe1db1 162PERL_STATIC_INLINE I32 *
74804ad1 163Perl_CvDEPTH(const CV * const sv)
1afe1db1 164{
74804ad1 165 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 166 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 167
8de47657 168 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
169}
170
d16269d8
PM
171/*
172 CvPROTO returns the prototype as stored, which is not necessarily what
173 the interpreter should be using. Specifically, the interpreter assumes
174 that spaces have been stripped, which has been the case if the prototype
175 was added by toke.c, but is generally not the case if it was added elsewhere.
176 Since we can't enforce the spacelessness at assignment time, this routine
177 provides a temporary copy at parse time with spaces removed.
178 I<orig> is the start of the original buffer, I<len> is the length of the
179 prototype and will be updated when this returns.
180 */
181
#ifdef PERL_CORE
PERL_STATIC_INLINE char *
S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
{
    /* Copies the '*len' bytes at 'orig' into the buffer of a new SVs_TEMP
     * SV, omitting every isSPACE() character.  Returns that buffer,
     * NUL-terminated, and sets '*len' to the length of the stripped copy. */

    SV * const copy = newSVpvn_flags(orig, *len, SVs_TEMP);
    char * const start = SvPVX(copy);
    char * out = start;
    STRLEN remaining;

    for (remaining = *len; remaining != 0; remaining--, orig++) {
        if (! isSPACE(*orig)) {
            *out++ = *orig;
        }
    }

    *out = '\0';
    *len = out - start;
    return start;
}
#endif
d16269d8 200
25fdce4a
FC
201/* ------------------------------- mg.h ------------------------------- */
202
#if defined(PERL_CORE) || defined(PERL_EXT)
/* Returns the byte offset corresponding to mg->mg_len.  Assumes get-magic
 * and stringification have already occurred */
PERL_STATIC_INLINE STRLEN
S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
{
    STRLEN pos;

    assert(mg->mg_type == PERL_MAGIC_regex_global);
    assert(mg->mg_len != -1);

    pos = (STRLEN)mg->mg_len;

    /* With MGf_BYTES set, or when the SV is not UTF-8, the stored value is
     * returned unchanged */
    if ((mg->mg_flags & MGf_BYTES) || ! DO_UTF8(sv))
        return pos;

    /* Without this check, we may read past the end of the buffer: */
    if (pos > sv_or_pv_len_utf8(sv, s, len))
        return len+1;

    return sv_or_pv_pos_u2b(sv, s, pos, NULL);
}
#endif
220
03414f05
FC
221/* ------------------------------- pad.h ------------------------------ */
222
#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
PERL_STATIC_INLINE bool
S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
{
    PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;

    /* Is seq within the range _LOW to _HIGH ?  This is complicated by the
     * fact that PL_cop_seqmax may have wrapped around at some point */

    if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO) {
        return FALSE;   /* not yet introduced */
    }

    if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {

        /* In compiling scope: there is no upper bound yet, so 'seq' is
         * compared against the lower bound alone, using half of the U32
         * range as the cutoff */
        if (seq > COP_SEQ_RANGE_LOW(pn)) {
            return (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
                   ? TRUE : FALSE;
        }

        return (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
               ? TRUE : FALSE;
    }

    if (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn)) {

        /* The low bound is numerically above the high bound, so a match is
         * anything above _LOW or at-or-below _HIGH */
        if (   seq >  COP_SEQ_RANGE_LOW(pn)
            || seq <= COP_SEQ_RANGE_HIGH(pn))
        {
            return TRUE;
        }

        return FALSE;
    }

    /* Ordinary range: _LOW < seq <= _HIGH */
    if (   seq >  COP_SEQ_RANGE_LOW(pn)
        && seq <= COP_SEQ_RANGE_HIGH(pn))
    {
        return TRUE;
    }

    return FALSE;
}
#endif
257
33a4312b
FC
258/* ------------------------------- pp.h ------------------------------- */
259
260PERL_STATIC_INLINE I32
c9182d9c 261Perl_TOPMARK(pTHX)
33a4312b
FC
262{
263 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
1604cfb0
MS
264 "MARK top %p %" IVdf "\n",
265 PL_markstack_ptr,
266 (IV)*PL_markstack_ptr)));
33a4312b
FC
267 return *PL_markstack_ptr;
268}
269
270PERL_STATIC_INLINE I32
c9182d9c 271Perl_POPMARK(pTHX)
33a4312b
FC
272{
273 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
1604cfb0
MS
274 "MARK pop %p %" IVdf "\n",
275 (PL_markstack_ptr-1),
276 (IV)*(PL_markstack_ptr-1))));
33a4312b
FC
277 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
278 return *PL_markstack_ptr--;
279}
280
8d919b0a
FC
281/* ----------------------------- regexp.h ----------------------------- */
282
9d0d3060
NC
283/* PVLVs need to act as a superset of all scalar types - they are basically
284 * PVMGs with a few extra fields.
285 * REGEXPs are first class scalars, but have many fields that can't be copied
286 * into a PVLV body.
287 *
288 * Hence we take a different approach - instead of a copy, PVLVs store a pointer
289 * back to the original body. To avoid increasing the size of PVLVs just for the
290 * rare case of REGEXP assignment, this pointer is stored in the memory usually
291 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
292 * read the pointer from the two possible locations. The macro SvLEN() wraps the
293 * access to the union's member xpvlenu_len, but there is no equivalent macro
294 * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
295 *
296 * See commit df6b4bd56551f2d3 for more details. */
297
8d919b0a 298PERL_STATIC_INLINE struct regexp *
c9182d9c 299Perl_ReANY(const REGEXP * const re)
8d919b0a 300{
df6b4bd5 301 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
302
303 PERL_ARGS_ASSERT_REANY;
8d919b0a 304 assert(isREGEXP(re));
bdef45de 305
df6b4bd5
DM
306 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
307 : (struct regexp *)p;
8d919b0a
FC
308}
309
27669aa4
FC
310/* ------------------------------- sv.h ------------------------------- */
311
a887b094 312PERL_STATIC_INLINE bool
4eff5eb8
KW
313Perl_SvTRUE(pTHX_ SV *sv)
314{
315 PERL_ARGS_ASSERT_SVTRUE;
316
6aa30f5e 317 if (UNLIKELY(sv == NULL))
9d0469db 318 return FALSE;
7b9b9e32
PE
319 SvGETMAGIC(sv);
320 return SvTRUE_nomg_NN(sv);
a887b094
PE
321}
322
4eff5eb8
KW
323PERL_STATIC_INLINE bool
324Perl_SvTRUE_nomg(pTHX_ SV *sv)
325{
326 PERL_ARGS_ASSERT_SVTRUE_NOMG;
327
328 if (UNLIKELY(sv == NULL))
329 return FALSE;
330 return SvTRUE_nomg_NN(sv);
331}
332
333PERL_STATIC_INLINE bool
334Perl_SvTRUE_NN(pTHX_ SV *sv)
335{
336 PERL_ARGS_ASSERT_SVTRUE_NN;
337
338 SvGETMAGIC(sv);
339 return SvTRUE_nomg_NN(sv);
340}
341
342PERL_STATIC_INLINE bool
343Perl_SvTRUE_common(pTHX_ SV * sv, const bool sv_2bool_is_fallback)
344{
345 PERL_ARGS_ASSERT_SVTRUE_COMMON;
346
347 if (UNLIKELY(SvIMMORTAL_INTERP(sv)))
348 return SvIMMORTAL_TRUE(sv);
349
350 if (! SvOK(sv))
351 return FALSE;
352
353 if (SvPOK(sv))
354 return SvPVXtrue(sv);
355
356 if (SvIOK(sv))
357 return SvIVX(sv) != 0; /* casts to bool */
358
359 if (SvROK(sv) && !(SvOBJECT(SvRV(sv)) && HvAMAGIC(SvSTASH(SvRV(sv)))))
360 return TRUE;
361
362 if (sv_2bool_is_fallback)
363 return sv_2bool_nomg(sv);
364
365 return isGV_with_GP(sv);
366}
367
368
27669aa4 369PERL_STATIC_INLINE SV *
c9182d9c 370Perl_SvREFCNT_inc(SV *sv)
27669aa4 371{
2439e033 372 if (LIKELY(sv != NULL))
1604cfb0 373 SvREFCNT(sv)++;
27669aa4
FC
374 return sv;
375}
376PERL_STATIC_INLINE SV *
c9182d9c 377Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 378{
3f2f854a
KW
379 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
380
27669aa4
FC
381 SvREFCNT(sv)++;
382 return sv;
383}
384PERL_STATIC_INLINE void
c9182d9c 385Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 386{
2439e033 387 if (LIKELY(sv != NULL))
1604cfb0 388 SvREFCNT(sv)++;
27669aa4 389}
75e16a44 390PERL_STATIC_INLINE void
c9182d9c 391Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 392{
2439e033 393 if (LIKELY(sv != NULL)) {
1604cfb0
MS
394 U32 rc = SvREFCNT(sv);
395 if (LIKELY(rc > 1))
396 SvREFCNT(sv) = rc - 1;
397 else
398 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
399 }
400}
541377b1
FC
401
402PERL_STATIC_INLINE void
c9182d9c 403Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
404{
405 U32 rc = SvREFCNT(sv);
3f2f854a
KW
406
407 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
408
79e2a32a 409 if (LIKELY(rc > 1))
1604cfb0 410 SvREFCNT(sv) = rc - 1;
4a9a56a7 411 else
1604cfb0 412 Perl_sv_free2(aTHX_ sv, rc);
4a9a56a7
DM
413}
414
415PERL_STATIC_INLINE void
1bd041dc 416Perl_SvAMAGIC_on(SV *sv)
541377b1 417{
1bd041dc 418 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 419 assert(SvROK(sv));
1bd041dc 420
541377b1
FC
421 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
422}
423PERL_STATIC_INLINE void
1bd041dc 424Perl_SvAMAGIC_off(SV *sv)
541377b1 425{
1bd041dc
KW
426 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
427
541377b1 428 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
1604cfb0 429 HvAMAGIC_off(SvSTASH(SvRV(sv)));
541377b1
FC
430}
431
432PERL_STATIC_INLINE U32
c9182d9c 433Perl_SvPADSTALE_on(SV *sv)
541377b1 434{
c0683843 435 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
436 return SvFLAGS(sv) |= SVs_PADSTALE;
437}
438PERL_STATIC_INLINE U32
c9182d9c 439Perl_SvPADSTALE_off(SV *sv)
541377b1 440{
c0683843 441 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
442 return SvFLAGS(sv) &= ~SVs_PADSTALE;
443}
#if defined(PERL_CORE) || defined (PERL_EXT)
PERL_STATIC_INLINE STRLEN
S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
{
    /* Converts the character offset 'pos' into a byte offset.  If 'lenp'
     * is non-NULL, '*lenp' is converted from a character count to a byte
     * count as well.  When 'sv' has SvGAMAGIC set, the conversion is done
     * by hopping through 'pv'; otherwise it is delegated to
     * sv_pos_u2b_flags(). */

    PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;

    if (! SvGAMAGIC(sv)) {
        return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
    }

    {
        U8 * const start  = (U8 *)pv;
        U8 * const hopped = utf8_hop(start, pos);

        if (lenp) {
            *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
        }

        return (STRLEN)(hopped - start);
    }
}
#endif
f019c49e 457
a8a2ceaa
KW
458/* ------------------------------- utf8.h ------------------------------- */
459
2fe720e2 460/*
3f620621 461=for apidoc_section $unicode
2fe720e2
KW
462*/
463
55d09dc8 464PERL_STATIC_INLINE void
c9182d9c 465Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
466{
467 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
468 * encoded string at '*dest', updating '*dest' to include it */
469
55d09dc8
KW
470 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
471
6f2d5cbc 472 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 473 *((*dest)++) = byte;
55d09dc8 474 else {
a09ec51a
KW
475 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
476 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
477 }
478}
479
e123187a 480/*
2fe720e2 481=for apidoc valid_utf8_to_uvchr
09232555
KW
482Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
483known that the next character in the input UTF-8 string C<s> is well-formed
484(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>). Surrogates, non-character code
485points, and non-Unicode code points are allowed.
2fe720e2
KW
486
487=cut
488
489 */
490
491PERL_STATIC_INLINE UV
492Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
493{
c41b2540 494 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
495 const U8* send = s + expectlen;
496 UV uv = *s;
497
498 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
499
500 if (retlen) {
501 *retlen = expectlen;
502 }
503
504 /* An invariant is trivially returned */
505 if (expectlen == 1) {
1604cfb0 506 return uv;
2fe720e2
KW
507 }
508
509 /* Remove the leading bits that indicate the number of bytes, leaving just
510 * the bits that are part of the value */
511 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
512
513 /* Now, loop through the remaining bytes, accumulating each into the
514 * working total as we go. (I khw tried unrolling the loop for up to 4
515 * bytes, but there was no performance improvement) */
516 for (++s; s < send; s++) {
517 uv = UTF8_ACCUMULATE(uv, *s);
518 }
519
520 return UNI_TO_NATIVE(uv);
521
522}
523
1e599354
KW
524/*
525=for apidoc is_utf8_invariant_string
526
82c5d941 527Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 528regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
529EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
530are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
531the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
532characters are invariant, but so also are the C1 controls.
1e599354
KW
533
534If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
535use this option, that C<s> can't have embedded C<NUL> characters and has to
536have a terminating C<NUL> byte).
537
9f2abfde
KW
538See also
539C<L</is_utf8_string>>,
540C<L</is_utf8_string_flags>>,
541C<L</is_utf8_string_loc>>,
542C<L</is_utf8_string_loc_flags>>,
543C<L</is_utf8_string_loclen>>,
544C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
545C<L</is_utf8_fixed_width_buf_flags>>,
546C<L</is_utf8_fixed_width_buf_loc_flags>>,
547C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
548C<L</is_strict_utf8_string>>,
549C<L</is_strict_utf8_string_loc>>,
550C<L</is_strict_utf8_string_loclen>>,
551C<L</is_c9strict_utf8_string>>,
552C<L</is_c9strict_utf8_string_loc>>,
553and
554C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
555
556=cut
0cbf5865
KW
557
558*/
559
/* Same as is_utf8_invariant_string_loc(), passing NULL for the location
 * pointer */
#define is_utf8_invariant_string(str, nbytes)                               \
                        is_utf8_invariant_string_loc(str, nbytes, NULL)
562
563/*
564=for apidoc is_utf8_invariant_string_loc
565
566Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
567the first UTF-8 variant character in the C<ep> pointer; if all characters are
568UTF-8 invariant, this function does not change the contents of C<*ep>.
569
570=cut
571
1e599354
KW
572*/
573
574PERL_STATIC_INLINE bool
c9182d9c 575Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 576{
e17544a6 577 const U8* send;
1e599354
KW
578 const U8* x = s;
579
0cbf5865
KW
580 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
581
e17544a6
KW
582 if (len == 0) {
583 len = strlen((const char *)s);
584 }
585
586 send = s + len;
587
4ab2fd9b 588/* This looks like 0x010101... */
2c5c8af5 589# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
590
591/* This looks like 0x808080... */
2c5c8af5 592# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 593# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 594# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 595
099e59a4
KW
596/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
597 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
598 * optimized out completely on a 32-bit system, and its mask gets optimized out
599 * on a 64-bit system */
2c5c8af5 600# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
601 | ( PTR2nat(x) >> 1) \
602 | ( ( (PTR2nat(x) \
603 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 604
3f515a2e
KW
605#ifndef EBCDIC
606
099e59a4
KW
607 /* Do the word-at-a-time iff there is at least one usable full word. That
608 * means that after advancing to a word boundary, there still is at least a
609 * full word left. The number of bytes needed to advance is 'wordsize -
610 * offset' unless offset is 0. */
611 if ((STRLEN) (send - x) >= PERL_WORDSIZE
612
613 /* This term is wordsize if subword; 0 if not */
614 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
615
616 /* 'offset' */
617 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
618 {
b40579ff 619
46bb68f6
KW
620 /* Process per-byte until reach word boundary. XXX This loop could be
621 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 622 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
623 if (! UTF8_IS_INVARIANT(*x)) {
624 if (ep) {
625 *ep = x;
626 }
e17544a6 627
46bb68f6
KW
628 return FALSE;
629 }
630 x++;
e17544a6 631 }
e17544a6 632
099e59a4
KW
633 /* Here, we know we have at least one full word to process. Process
634 * per-word as long as we have at least a full word left */
635 do {
4ab2fd9b 636 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 637
46bb68f6
KW
638 /* Found a variant. Just return if caller doesn't want its
639 * exact position */
640 if (! ep) {
641 return FALSE;
642 }
e17544a6 643
2c5c8af5
KW
644# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
645 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 646
73f0a2eb 647 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
648 assert(*ep >= s && *ep < send);
649
650 return FALSE;
651
2c5c8af5 652# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
653 checks. */
654
46bb68f6 655 break;
2c5c8af5 656# endif
46bb68f6 657 }
1d2af574 658
46bb68f6 659 x += PERL_WORDSIZE;
1d2af574 660
099e59a4 661 } while (x + PERL_WORDSIZE <= send);
b40579ff 662 }
e17544a6 663
0b08cab0 664#endif /* End of ! EBCDIC */
e17544a6
KW
665
666 /* Process per-byte */
667 while (x < send) {
1604cfb0 668 if (! UTF8_IS_INVARIANT(*x)) {
e17544a6
KW
669 if (ep) {
670 *ep = x;
671 }
0cbf5865 672
e17544a6 673 return FALSE;
0cbf5865 674 }
1e599354 675
e17544a6 676 x++;
1e599354
KW
677 }
678
679 return TRUE;
680}
bf874180 681
fc1bb663
KW
682/* See if the platform has builtins for finding the most/least significant bit,
683 * and which one is right for using on 32 and 64 bit operands */
684#if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
685# if U32SIZE == INTSIZE
686# define PERL_CLZ_32 __builtin_clz
687# endif
688# if defined(U64TYPE) && U64SIZE == INTSIZE
689# define PERL_CLZ_64 __builtin_clz
690# endif
691#endif
692#if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
693# if U32SIZE == INTSIZE
694# define PERL_CTZ_32 __builtin_ctz
695# endif
696# if defined(U64TYPE) && U64SIZE == INTSIZE
697# define PERL_CTZ_64 __builtin_ctz
698# endif
699#endif
700
701#if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
702# if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
703# define PERL_CLZ_32 __builtin_clzl
704# endif
705# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
706# define PERL_CLZ_64 __builtin_clzl
707# endif
708#endif
709#if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
710# if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
711# define PERL_CTZ_32 __builtin_ctzl
712# endif
713# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
714# define PERL_CTZ_64 __builtin_ctzl
715# endif
716#endif
717
718#if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
719# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
720# define PERL_CLZ_32 __builtin_clzll
721# endif
722# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
723# define PERL_CLZ_64 __builtin_clzll
724# endif
725#endif
726#if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
727# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
728# define PERL_CTZ_32 __builtin_ctzll
729# endif
730# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
731# define PERL_CTZ_64 __builtin_ctzll
732# endif
733#endif
734
/* On MSVC, ask for the bit-scan functions to be generated as intrinsics; the
 * 64-bit forms are requested only when _WIN64 is defined */
#ifdef _MSC_VER
#  include <intrin.h>
#  pragma intrinsic(_BitScanForward, _BitScanReverse)
#  if defined(_WIN64)
#    pragma intrinsic(_BitScanForward64, _BitScanReverse64)
#  endif
#endif
744
250e5324
KW
745/* The reason there are no checks to see if ffs() and ffsl() are available for
746 * determining the lsb is that these don't improve on the deBruijn method
747 * fallback, which is just a branchless integer multiply, array element
748 * retrieval, and shift. The others, even if the function call overhead is
749 * optimized out, have to cope with the possibility of the input being all
750 * zeroes, and almost certainly will have conditionals for this eventuality.
751 * khw, at the time of this commit, looked at the source for both gcc and clang
752 * to verify this. (gcc used a method inferior to deBruijn.) */
753
330cd0ce 754/* Below are functions to find the first, last, or only set bit in a word. On
19d2c525
KW
755 * platforms with 64-bit capability, there is a pair for each operation; the
756 * first taking a 64 bit operand, and the second a 32 bit one. The logic is
757 * the same in each pair, so the second is stripped of most comments. */
758
#if defined(U64TYPE)    /* HAS_QUAD not usable outside the core */

PERL_STATIC_INLINE unsigned
Perl_lsbit_pos64(U64 word)
{
    /* Returns the position (0..63) of the least significant set bit in
     * 'word', which must not be 0 */

    ASSUME(word != 0);

    /* Prefer a fast platform method when one has been identified */

#  if defined(PERL_CTZ_64)
#    define PERL_HAS_FAST_GET_LSB_POS64

    return (unsigned) PERL_CTZ_64(word);

#  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
#    define PERL_HAS_FAST_GET_LSB_POS64

    {
        unsigned long bit;

        _BitScanForward64(&bit, word);
        return (unsigned)bit;
    }

#  else

    /* No fast method available.  Reduce the word to just its lowest set
     * bit, and hand that to the function that works on single-bit words.
     *
     * Isolating the lsb:
     * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
     *
     * With the rightmost set bit at position 's' ('x's are don't cares,
     * and 'y's are their complements):
     *      s
     *  x..x100..00
     *  y..y011..11      Complement
     *  y..y100..00      Add 1
     *  0..0100..00      And with the original
     *
     * (Yes, complementing and adding 1 is just taking the negative on 2's
     * complement machines, but not on 1's complement ones, and some
     * compilers complain about negating an unsigned.)
     */
    {
        const U64 lowest_only = word & (~word + 1);

        return single_1bit_pos64(lowest_only);
    }

#  endif

}

#  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
#else   /* ! QUAD */
#  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
#endif
817
818PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */
819Perl_lsbit_pos32(U32 word)
820{
821 /* Find the position (0..31) of the least significant set bit in the input
822 * word */
823
824 ASSUME(word != 0);
825
fc1bb663 826#if defined(PERL_CTZ_32)
2e0bc9ce 827# define PERL_HAS_FAST_GET_LSB_POS32
fc1bb663
KW
828
829 return (unsigned) PERL_CTZ_32(word);
830
e1f6bdff 831#elif U32SIZE == 4 && defined(_MSC_VER)
2e0bc9ce 832# define PERL_HAS_FAST_GET_LSB_POS32
e88dde50
KW
833
834 {
835 unsigned long index;
836 _BitScanForward(&index, word);
837 return (unsigned)index;
838 }
839
fc1bb663
KW
840#else
841
19d2c525 842 return single_1bit_pos32(word & (~word + 1));
fc1bb663
KW
843
844#endif
845
19d2c525
KW
846}
847
4a1b7bb2 848
fc1bb663 849/* Convert the leading zeros count to the bit position of the first set bit.
4a1b7bb2
KW
850 * This just subtracts from the highest position, 31 or 63. But some compilers
851 * don't optimize this optimally, and so a bit of bit twiddling encourages them
852 * to do the right thing. It turns out that subtracting a smaller non-negative
853 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
854 * the two numbers. To see why, first note that the sum of any number, x, and
855 * its complement, x', is all ones. So all ones minus x is x'. Then note that
856 * the xor of x and all ones is x'. */
#define LZC_TO_MSBIT_POS_(type, n_zeros)                                    \
                            ((type##SIZE * CHARBITS - 1) ^ (n_zeros))
fc1bb663 858
995a4954
KW
#if defined(U64TYPE)    /* HAS_QUAD not usable outside the core */

PERL_STATIC_INLINE unsigned
Perl_msbit_pos64(U64 word)
{
    /* Returns the position (0..63) of the most significant set bit in
     * 'word', which must not be 0 */

    ASSUME(word != 0);

    /* Prefer a fast platform method when one has been identified */

#  if defined(PERL_CLZ_64)
#    define PERL_HAS_FAST_GET_MSB_POS64

    return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));

#  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
#    define PERL_HAS_FAST_GET_MSB_POS64

    {
        unsigned long bit;

        _BitScanReverse64(&bit, word);
        return (unsigned)bit;
    }

#  else

    /* No fast method available.  Reduce the word to just its highest set
     * bit, and hand that to the function that works on single-bit words.
     *
     * Isolating the msb; http://codeforces.com/blog/entry/10330
     *
     * Only the most significant set bit matters.  Or'ing the word with
     * itself shifted right by 1 makes that bit and its right-hand neighbour
     * both 1.  Doing the same with a shift of 2 gives four 1-bits in a row,
     * and so on, doubling the shift each time, until the msb and everything
     * to its right are 1. */
    {
        unsigned shift;

        for (shift = 1; shift < 64; shift <<= 1) {
            word |= (word >> shift);
        }
    }

    /* Subtracting the word shifted right by 1 clears every 1 bit except the
     * left-most, which is the one we want */
    word -= (word >> 1);

    /* Only a single bit is set now */
    return single_1bit_pos64(word);

#  endif

}

#  define msbit_pos_uintmax_(word) msbit_pos64(word)
#else   /* ! QUAD */
#  define msbit_pos_uintmax_(word) msbit_pos32(word)
#endif
920
921PERL_STATIC_INLINE unsigned
922Perl_msbit_pos32(U32 word)
923{
924 /* Find the position (0..31) of the most significant set bit in the input
925 * word */
926
927 ASSUME(word != 0);
928
fc1bb663 929#if defined(PERL_CLZ_32)
2e0bc9ce 930# define PERL_HAS_FAST_GET_MSB_POS32
fc1bb663
KW
931
932 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
933
e1f6bdff 934#elif U32SIZE == 4 && defined(_MSC_VER)
2e0bc9ce 935# define PERL_HAS_FAST_GET_MSB_POS32
e88dde50
KW
936
937 {
938 unsigned long index;
939 _BitScanReverse(&index, word);
940 return (unsigned)index;
941 }
942
fc1bb663
KW
943#else
944
330cd0ce
KW
945 word |= (word >> 1);
946 word |= (word >> 2);
947 word |= (word >> 4);
948 word |= (word >> 8);
949 word |= (word >> 16);
950 word -= (word >> 1);
951 return single_1bit_pos32(word);
fc1bb663
KW
952
953#endif
954
330cd0ce
KW
955}
956
787e8384
KW
/* lsbit_pos() and msbit_pos() map to whichever width matches UVSIZE */
#if UVSIZE == U64SIZE
#  define lsbit_pos(word)  lsbit_pos64(word)
#  define msbit_pos(word)  msbit_pos64(word)
#elif UVSIZE == U32SIZE
#  define lsbit_pos(word)  lsbit_pos32(word)
#  define msbit_pos(word)  msbit_pos32(word)
#endif
964
330cd0ce
KW
#if defined(U64TYPE)    /* HAS_QUAD not usable outside the core */

PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos64(U64 word)
{
    /* 'word' must have exactly one bit set.  Returns that bit's position:
     * 0..63 */

#  ifndef PERL_CORE     /* isPOWER_OF_2 is not exported */
    ASSUME(word && (word & (word-1)) == 0);
#  else
    ASSUME(isPOWER_OF_2(word));
#  endif

    /* The lone set bit is at once the most and the least significant one,
     * so a fast way of finding either will do.
     *
     * Those functions can appear to call back into this one, but they don't
     * when the corresponding #define is set */

#  if defined(PERL_HAS_FAST_GET_MSB_POS64)

    return msbit_pos64(word);

#  elif defined(PERL_HAS_FAST_GET_LSB_POS64)

    return lsbit_pos64(word);

#  else

    /* A deBruijn sequence gives the position of the only set bit with a
     * multiply, a shift and a table lookup.  See for example
     * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
    return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
                                                    >> PERL_deBruijnShift64_];
#  endif

}

#endif
1005
bf874180
KW
1006PERL_STATIC_INLINE unsigned
1007Perl_single_1bit_pos32(U32 word)
1008{
1009 /* Given a 32-bit word known to contain all zero bits except one 1 bit,
1010 * find and return the 1's position: 0..31 */
1011
1012#ifdef PERL_CORE /* macro not exported */
1013 ASSUME(isPOWER_OF_2(word));
1014#else
1015 ASSUME(word && (word & (word-1)) == 0);
1016#endif
2e0bc9ce
KW
1017#ifdef PERL_HAS_FAST_GET_MSB_POS32
1018
1019 return msbit_pos32(word);
1020
1021#elif defined(PERL_HAS_FAST_GET_LSB_POS32)
1022
1023 return lsbit_pos32(word);
1024
1025/* Unlikely, but possible for the platform to have a wider fast operation but
1026 * not a narrower one. But easy enough to handle the case by widening the
1027 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops
1028 * would be slower than the deBruijn method.) */
1029#elif defined(PERL_HAS_FAST_GET_MSB_POS64)
1030
1031 return msbit_pos64(word);
1032
1033#elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1034
1035 return lsbit_pos64(word);
1036
1037#else
bf874180 1038
bf874180
KW
1039 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
1040 >> PERL_deBruijnShift32_];
2e0bc9ce
KW
1041#endif
1042
bf874180 1043}
1e599354 1044
23a7ee81
KW
1045#ifndef EBCDIC
1046
1d2af574 1047PERL_STATIC_INLINE unsigned int
73f0a2eb 1048Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574 1049{
1d2af574
KW
1050 /* This returns the position in a word (0..7) of the first variant byte in
1051 * it. This is a helper function. Note that there are no branches */
1052
1d2af574
KW
1053 /* Get just the msb bits of each byte */
1054 word &= PERL_VARIANTS_WORD_MASK;
1055
58ddb8c5
KW
1056 /* This should only be called if we know there is a variant byte in the
1057 * word */
1058 assert(word);
1059
7adf2470 1060# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
1061
1062 /* Bytes are stored like
1063 * Byte8 ... Byte2 Byte1
1064 * 63..56...15...8 7...0
19d2c525
KW
1065 * so getting the lsb of the whole modified word is getting the msb of the
1066 * first byte that has its msb set */
1067 word = lsbit_pos_uintmax_(word);
1068
1069 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert
1070 * to 0..7 */
1071 return (unsigned int) ((word + 1) >> 3) - 1;
1d2af574
KW
1072
1073# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1074
1075 /* Bytes are stored like
1076 * Byte1 Byte2 ... Byte8
1077 * 63..56 55..47 ... 7...0
330cd0ce
KW
1078 * so getting the msb of the whole modified word is getting the msb of the
1079 * first byte that has its msb set */
1080 word = msbit_pos_uintmax_(word);
1d2af574 1081
330cd0ce
KW
1082 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert
1083 * to 0..7 */
1d2af574
KW
1084 word = ((word + 1) >> 3) - 1;
1085
330cd0ce
KW
1086 /* And invert the result because of the reversed byte order on this
1087 * platform */
1d2af574
KW
1088 word = CHARBITS - word - 1;
1089
330cd0ce
KW
1090 return (unsigned int) word;
1091
1092# else
1093# error Unexpected byte order
1d2af574
KW
1094# endif
1095
1d2af574
KW
1096}
1097
23a7ee81 1098#endif
03c1e4ab
KW
1099#if defined(PERL_CORE) || defined(PERL_EXT)
1100
1101/*
1102=for apidoc variant_under_utf8_count
1103
1104This function looks at the sequence of bytes between C<s> and C<e>, which are
1105assumed to be encoded in ASCII/Latin1, and returns how many of them would
1106change should the string be translated into UTF-8. Due to the nature of UTF-8,
1107each of these would occupy two bytes instead of the single one in the input
1108string. Thus, this function returns the precise number of bytes the string
1109would expand by when translated to UTF-8.
1110
1111Unlike most of the other functions that have C<utf8> in their name, the input
1112to this function is NOT a UTF-8-encoded string. The function name is slightly
1113I<odd> to emphasize this.
1114
1115This function is internal to Perl because khw thinks that any XS code that
1116would want this is probably operating too close to the internals. Presenting a
1117valid use case could change that.
1118
1119See also
1120C<L<perlapi/is_utf8_invariant_string>>
1121and
1122C<L<perlapi/is_utf8_invariant_string_loc>>,
1123
1124=cut
1125
1126*/
1127
1128PERL_STATIC_INLINE Size_t
1129S_variant_under_utf8_count(const U8* const s, const U8* const e)
1130{
1131 const U8* x = s;
1132 Size_t count = 0;
1133
1134 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1135
1136# ifndef EBCDIC
1137
5d0379de
KW
1138 /* Test if the string is long enough to use word-at-a-time. (Logic is the
1139 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
1140 if ((STRLEN) (e - x) >= PERL_WORDSIZE
1141 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1142 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1143 {
1144
1145 /* Process per-byte until reach word boundary. XXX This loop could be
1146 * eliminated if we knew that this platform had fast unaligned reads */
1147 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1148 count += ! UTF8_IS_INVARIANT(*x++);
1149 }
1150
1151 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
1152 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1153 explanation of how this works */
e5863284
KW
1154 PERL_UINTMAX_T increment
1155 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
1156 * PERL_COUNT_MULTIPLIER)
1157 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 1158 count += (Size_t) increment;
03c1e4ab
KW
1159 x += PERL_WORDSIZE;
1160 } while (x + PERL_WORDSIZE <= e);
1161 }
1162
1163# endif
1164
1165 /* Process per-byte */
1166 while (x < e) {
1604cfb0 1167 if (! UTF8_IS_INVARIANT(*x)) {
03c1e4ab
KW
1168 count++;
1169 }
1170
1171 x++;
1172 }
1173
1174 return count;
1175}
1176
1177#endif
1178
aff4cafe
KW
1179#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
1180# undef PERL_WORDSIZE
1181# undef PERL_COUNT_MULTIPLIER
1182# undef PERL_WORD_BOUNDARY_MASK
1183# undef PERL_VARIANTS_WORD_MASK
1184#endif
03c1e4ab 1185
7c93d8f0 1186/*
5ff889fb
KW
1187=for apidoc is_utf8_string
1188
82c5d941
KW
1189Returns TRUE if the first C<len> bytes of string C<s> form a valid
1190Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
1191be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1192can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1193byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1194
2717076a
KW
1195This function considers Perl's extended UTF-8 to be valid. That means that
1196code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
1197considered valid by this function. Use C<L</is_strict_utf8_string>>,
1198C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1199code points are considered valid.
5ff889fb 1200
9f2abfde
KW
1201See also
1202C<L</is_utf8_invariant_string>>,
0cbf5865 1203C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1204C<L</is_utf8_string_loc>>,
1205C<L</is_utf8_string_loclen>>,
8bc127bf
KW
1206C<L</is_utf8_fixed_width_buf_flags>>,
1207C<L</is_utf8_fixed_width_buf_loc_flags>>,
1208C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
1209
1210=cut
1211*/
1212
/* Validity only: neither the stopping point nor the character count is
 * requested */
#define is_utf8_string(s, len)                                              \
                        is_utf8_string_loclen((s), (len), NULL, NULL)
5ff889fb 1214
c9cd936b
KW
1215#if defined(PERL_CORE) || defined (PERL_EXT)
1216
1217/*
1218=for apidoc is_utf8_non_invariant_string
1219
1220Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1221C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1222UTF-8; otherwise returns FALSE.
1223
1224A TRUE return means that at least one code point represented by the sequence
1225either is a wide character not representable as a single byte, or the
1226representation differs depending on whether the sequence is encoded in UTF-8 or
1227not.
1228
1229See also
1230C<L<perlapi/is_utf8_invariant_string>>,
1231C<L<perlapi/is_utf8_string>>
1232
1233=cut
1234
1235This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
1236It generally needn't be if its string is entirely UTF-8 invariant, and it
1237shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
1238
1239It is an internal function because khw thinks that XS code shouldn't be working
1240at this low a level. A valid use case could change that.
1241
1242*/
1243
1244PERL_STATIC_INLINE bool
86a87e17 1245Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
1246{
1247 const U8 * first_variant;
1248
1249 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1250
1251 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1252 return FALSE;
1253 }
1254
1255 return is_utf8_string(first_variant, len - (first_variant - s));
1256}
1257
1258#endif
1259
5ff889fb 1260/*
9f2abfde
KW
1261=for apidoc is_strict_utf8_string
1262
1263Returns TRUE if the first C<len> bytes of string C<s> form a valid
1264UTF-8-encoded string that is fully interchangeable by any application using
1265Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
1266calculated using C<strlen(s)> (which means if you use this option, that C<s>
1267can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1268byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1269
1270This function returns FALSE for strings containing any
1271code points above the Unicode max of 0x10FFFF, surrogate code points, or
1272non-character code points.
1273
1274See also
1275C<L</is_utf8_invariant_string>>,
0cbf5865 1276C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1277C<L</is_utf8_string>>,
1278C<L</is_utf8_string_flags>>,
1279C<L</is_utf8_string_loc>>,
1280C<L</is_utf8_string_loc_flags>>,
1281C<L</is_utf8_string_loclen>>,
1282C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1283C<L</is_utf8_fixed_width_buf_flags>>,
1284C<L</is_utf8_fixed_width_buf_loc_flags>>,
1285C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1286C<L</is_strict_utf8_string_loc>>,
1287C<L</is_strict_utf8_string_loclen>>,
1288C<L</is_c9strict_utf8_string>>,
1289C<L</is_c9strict_utf8_string_loc>>,
1290and
1291C<L</is_c9strict_utf8_string_loclen>>.
1292
1293=cut
1294*/
1295
/* Validity only: neither the stopping point nor the character count is
 * requested */
#define is_strict_utf8_string(s, len)                                       \
                    is_strict_utf8_string_loclen((s), (len), NULL, NULL)
9f2abfde
KW
1297
1298/*
1299=for apidoc is_c9strict_utf8_string
1300
1301Returns TRUE if the first C<len> bytes of string C<s> form a valid
1302UTF-8-encoded string that conforms to
1303L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1304otherwise it returns FALSE. If C<len> is 0, it will be calculated using
1305C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1306C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
1307characters being ASCII constitute 'a valid UTF-8 string'.
1308
1309This function returns FALSE for strings containing any code points above the
1310Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1311code points per
1312L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1313
1314See also
1315C<L</is_utf8_invariant_string>>,
0cbf5865 1316C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1317C<L</is_utf8_string>>,
1318C<L</is_utf8_string_flags>>,
1319C<L</is_utf8_string_loc>>,
1320C<L</is_utf8_string_loc_flags>>,
1321C<L</is_utf8_string_loclen>>,
1322C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1323C<L</is_utf8_fixed_width_buf_flags>>,
1324C<L</is_utf8_fixed_width_buf_loc_flags>>,
1325C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1326C<L</is_strict_utf8_string>>,
1327C<L</is_strict_utf8_string_loc>>,
1328C<L</is_strict_utf8_string_loclen>>,
1329C<L</is_c9strict_utf8_string_loc>>,
1330and
1331C<L</is_c9strict_utf8_string_loclen>>.
1332
1333=cut
1334*/
1335
/* Validity only: neither the stopping point nor the character count is
 * requested, so both output pointers are NULL */
#define is_c9strict_utf8_string(s, len)                                     \
                    is_c9strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
1337
1338/*
1339=for apidoc is_utf8_string_flags
1340
1341Returns TRUE if the first C<len> bytes of string C<s> form a valid
1342UTF-8 string, subject to the restrictions imposed by C<flags>;
1343returns FALSE otherwise. If C<len> is 0, it will be calculated
1344using C<strlen(s)> (which means if you use this option, that C<s> can't have
1345embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
1346that all characters being ASCII constitute 'a valid UTF-8 string'.
1347
1348If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1349C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1350as C<L</is_strict_utf8_string>>; and if C<flags> is
1351C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1352C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
1353combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1354C<L</utf8n_to_uvchr>>, with the same meanings.
1355
1356See also
1357C<L</is_utf8_invariant_string>>,
0cbf5865 1358C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1359C<L</is_utf8_string>>,
1360C<L</is_utf8_string_loc>>,
1361C<L</is_utf8_string_loc_flags>>,
1362C<L</is_utf8_string_loclen>>,
1363C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1364C<L</is_utf8_fixed_width_buf_flags>>,
1365C<L</is_utf8_fixed_width_buf_loc_flags>>,
1366C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1367C<L</is_strict_utf8_string>>,
1368C<L</is_strict_utf8_string_loc>>,
1369C<L</is_strict_utf8_string_loclen>>,
1370C<L</is_c9strict_utf8_string>>,
1371C<L</is_c9strict_utf8_string_loc>>,
1372and
1373C<L</is_c9strict_utf8_string_loclen>>.
1374
1375=cut
1376*/
1377
1378PERL_STATIC_INLINE bool
c9182d9c 1379Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 1380{
33756530 1381 const U8 * first_variant;
9f2abfde
KW
1382
1383 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1384 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1385 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1386
f60f61fd
KW
1387 if (len == 0) {
1388 len = strlen((const char *)s);
1389 }
1390
9f2abfde
KW
1391 if (flags == 0) {
1392 return is_utf8_string(s, len);
1393 }
1394
d044b7a7 1395 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1396 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1397 {
1398 return is_strict_utf8_string(s, len);
1399 }
1400
d044b7a7 1401 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1402 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1403 {
1404 return is_c9strict_utf8_string(s, len);
1405 }
1406
33756530
KW
1407 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1408 const U8* const send = s + len;
1409 const U8* x = first_variant;
1410
a0d7f935
KW
1411 while (x < send) {
1412 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1413 if (UNLIKELY(! cur_len)) {
1414 return FALSE;
1415 }
1416 x += cur_len;
9f2abfde 1417 }
33756530 1418 }
9f2abfde
KW
1419
1420 return TRUE;
1421}
1422
1423/*
5ff889fb
KW
1424
1425=for apidoc is_utf8_string_loc
1426
2717076a 1427Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 1428case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 1429"utf8ness success") in the C<ep> pointer.
5ff889fb 1430
2717076a 1431See also C<L</is_utf8_string_loclen>>.
5ff889fb 1432
3964c812
KW
1433=cut
1434*/
1435
/* Same as is_utf8_string_loclen() with no character count requested; the
 * unused output pointer is passed as NULL */
#define is_utf8_string_loc(s, len, ep)                                      \
                            is_utf8_string_loclen(s, len, ep, NULL)
1437
1438/*
1439
5ff889fb
KW
1440=for apidoc is_utf8_string_loclen
1441
2717076a 1442Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 1443case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 1444"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 1445encoded characters in the C<el> pointer.
5ff889fb 1446
2717076a 1447See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
1448
1449=cut
1450*/
1451
56e4cf64 1452PERL_STATIC_INLINE bool
33756530 1453Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 1454{
33756530 1455 const U8 * first_variant;
5ff889fb
KW
1456
1457 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1458
33756530
KW
1459 if (len == 0) {
1460 len = strlen((const char *) s);
1461 }
1462
1463 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1464 if (el)
1465 *el = len;
1466
1467 if (ep) {
1468 *ep = s + len;
1469 }
1470
1471 return TRUE;
1472 }
1473
1474 {
1475 const U8* const send = s + len;
1476 const U8* x = first_variant;
1477 STRLEN outlen = first_variant - s;
1478
a0d7f935
KW
1479 while (x < send) {
1480 const STRLEN cur_len = isUTF8_CHAR(x, send);
1481 if (UNLIKELY(! cur_len)) {
1482 break;
1483 }
1484 x += cur_len;
1485 outlen++;
5ff889fb 1486 }
5ff889fb 1487
a0d7f935
KW
1488 if (el)
1489 *el = outlen;
5ff889fb 1490
a0d7f935
KW
1491 if (ep) {
1492 *ep = x;
1493 }
5ff889fb 1494
a0d7f935 1495 return (x == send);
33756530 1496 }
5ff889fb
KW
1497}
1498
213dc9d1
KW
1499/* The perl core arranges to never call the DFA below without there being at
1500 * least one byte available to look at. This allows the DFA to use a do {}
1501 * while loop which means that calling it with a UTF-8 invariant has a single
1502 * conditional, same as the calling code checking for invariance ahead of time.
1503 * And having the calling code remove that conditional speeds up by that
1504 * conditional, the case where it wasn't invariant. So there's no reason to
1505 * check before calling this.
1506 *
1507 * But we don't know this for non-core calls, so have to retain the check for
1508 * them. */
#if defined(PERL_CORE)
#  define PERL_NON_CORE_CHECK_EMPTY(start, end)  assert((end) > (start))
#else
#  define PERL_NON_CORE_CHECK_EMPTY(start, end)                             \
                                    if ((end) <= (start)) return FALSE
#endif
1514
5ff889fb 1515/*
50f7a4ce
KW
1516 * DFA for checking input is valid UTF-8 syntax.
1517 *
1518 * This uses adaptations of the table and algorithm given in
1519 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1520 * documentation of the original version. A copyright notice for the original
1521 * version is given at the beginning of this file. The Perl adaptations are
1522 * documented at the definition of PL_extended_utf8_dfa_tab[].
1523 *
1524 * This dfa is fast. There are three exit conditions:
1525 * 1) a well-formed code point, acceptable to the table
1526 * 2) the beginning bytes of an incomplete character, whose completion might
1527 * or might not be acceptable
1528 * 3) unacceptable to the table. Some of the adaptations have certain,
1529 * hopefully less likely to occur, legal inputs be unacceptable to the
1530 * table, so these must be sorted out afterwards.
1531 *
1532 * This macro is a complete implementation of the code executing the DFA. It
1533 * is passed the input sequence bounds and the table to use, and what to do
1534 * for each of the exit conditions. There are three canned actions, likely to
1535 * be the ones you want:
1536 * DFA_RETURN_SUCCESS_
1537 * DFA_RETURN_FAILURE_
1538 * DFA_TEASE_APART_FF_
1539 *
1540 * You pass a parameter giving the action to take for each of the three
1541 * possible exit conditions:
1542 *
1543 * 'accept_action' This is executed when the DFA accepts the input.
1544 * DFA_RETURN_SUCCESS_ is the most likely candidate.
1545 * 'reject_action' This is executed when the DFA rejects the input.
1546 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1547 * you have written code to distinguish the rejecting state
1548 * results. Because it happens in several places, and
1549 * involves #ifdefs, the special action
1550 * DFA_TEASE_APART_FF_ is what you want with
1551 * PL_extended_utf8_dfa_tab. On platforms without
1552 * EXTRA_LONG_UTF8, there is no need to tease anything apart,
1553 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1554 * need to have a label 'tease_apart_FF' that it will transfer
1555 * to.
1556 * 'incomplete_char_action' This is executed when the DFA ran off the end
1557 * before accepting or rejecting the input.
1558 * DFA_RETURN_FAILURE_ is the likely action, but you could
1559 * have a 'goto', or NOOP. In the latter case the DFA drops
1560 * off the end, and you place your code to handle this case
1561 * immediately after it.
1562 */
1563
/* Canned exit actions to pass to PERL_IS_UTF8_CHAR_DFA.  The success action
 * refers to the 's' that macro declares and to a variable named 's0' in the
 * function using it. */
#define DFA_RETURN_SUCCESS_      return s - s0
#define DFA_RETURN_FAILURE_      return 0
#if defined(HAS_EXTRA_LONG_UTF8)
#  define DFA_TEASE_APART_FF_    goto tease_apart_FF
#else
#  define DFA_TEASE_APART_FF_    DFA_RETURN_FAILURE_
#endif
1571
/* Run the DFA in 'dfa_tab' over the bytes from 's0' up to, but not
 * including, 'e', executing the supplied action for whichever exit condition
 * is reached */
#define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
                              accept_action,                                \
                              reject_action,                                \
                              incomplete_char_action)                       \
    STMT_START {                                                            \
        const U8 * s = s0;                                                  \
        UV state = 0;                                                       \
                                                                            \
        PERL_NON_CORE_CHECK_EMPTY(s,e);                                     \
                                                                            \
        do {                                                                \
            /* Consume one byte and move to the next state */               \
            state = dfa_tab[256 + state + dfa_tab[*s++]];                   \
                                                                            \
            if (state == 0) {   /* Accepting state */                       \
                accept_action;                                              \
            }                                                               \
            else if (UNLIKELY(state == 1)) { /* Rejecting state */          \
                reject_action;                                              \
            }                                                               \
        } while (s < e);                                                    \
                                                                            \
        /* Here, ran out of input before end-of-char */                     \
        incomplete_char_action;                                             \
    } STMT_END
1598
1599
1600/*
9f2abfde 1601
44170c9a 1602=for apidoc isUTF8_CHAR
8ed185f9
KW
1603
1604Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1605looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1606that represents some code point; otherwise it evaluates to 0. If non-zero, the
1607value gives how many bytes starting at C<s> comprise the code point's
1608representation. Any bytes remaining before C<e>, but beyond the ones needed to
1609form the first code point in C<s>, are not examined.
1610
13aab5dd 1611The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1612extension to official UTF-8 to represent those higher than the Unicode maximum
1613of 0x10FFFF. That means that this macro is used to efficiently decide if the
1614next few bytes in C<s> is legal UTF-8 for a single character.
1615
1616Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1617defined by Unicode to be fully interchangeable across applications;
1618C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1619#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1620code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1621
1622Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1623C<L</is_utf8_string_loclen>> to check entire strings.
1624
13aab5dd
KW
1625Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1626machines) is a valid UTF-8 character.
8ed185f9
KW
1627
1628=cut
1629
1630This uses an adaptation of the table and algorithm given in
f6521f7c 1631https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1632documentation of the original version. A copyright notice for the original
1633version is given at the beginning of this file. The Perl adaptation is
71525f77 1634documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1635*/
1636
1637PERL_STATIC_INLINE Size_t
c9182d9c 1638Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9 1639{
8ed185f9
KW
1640 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1641
50f7a4ce
KW
1642 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1643 DFA_RETURN_SUCCESS_,
1644 DFA_TEASE_APART_FF_,
1645 DFA_RETURN_FAILURE_);
8ed185f9 1646
50f7a4ce
KW
1647 /* Here, we didn't return success, but dropped out of the loop. In the
1648 * case of PL_extended_utf8_dfa_tab, this means the input is either
1649 * malformed, or the start byte was FF on a platform that the dfa doesn't
1650 * handle FF's. Call a helper function. */
ffea7477 1651
50f7a4ce 1652#ifdef HAS_EXTRA_LONG_UTF8
8ed185f9 1653
50f7a4ce 1654 tease_apart_FF:
8ed185f9 1655
50f7a4ce
KW
1656 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1657 * either malformed, or was for the largest possible start byte, which we
1658 * now check, not inline */
1659 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1660 return 0;
8ed185f9
KW
1661 }
1662
50f7a4ce
KW
1663 return is_utf8_FF_helper_(s0, e,
1664 FALSE /* require full, not partial char */
1665 );
8ed185f9
KW
1666#endif
1667
8ed185f9
KW
1668}
1669
1670/*
1671
67049a5f
KW
1672=for apidoc isSTRICT_UTF8_CHAR
1673
1674Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1675looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1676Unicode code point completely acceptable for open interchange between all
1677applications; otherwise it evaluates to 0. If non-zero, the value gives how
1678many bytes starting at C<s> comprise the code point's representation. Any
1679bytes remaining before C<e>, but beyond the ones needed to form the first code
1680point in C<s>, are not examined.
1681
1682The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1683be a surrogate nor a non-character code point. Thus this excludes any code
1684point from Perl's extended UTF-8.
1685
1686This is used to efficiently decide if the next few bytes in C<s> is
1687legal Unicode-acceptable UTF-8 for a single character.
1688
1689Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1690#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1691code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1692and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1693
1694Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1695C<L</is_strict_utf8_string_loclen>> to check entire strings.
1696
1697=cut
1698
1699This uses an adaptation of the tables and algorithm given in
f6521f7c 1700https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1701documentation of the original version. A copyright notice for the original
1702version is given at the beginning of this file. The Perl adaptation is
1703documented at the definition of PL_strict_utf8_dfa_tab[].
1704
1705*/
1706
1707PERL_STATIC_INLINE Size_t
c9182d9c 1708Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f 1709{
67049a5f
KW
1710 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1711
50f7a4ce
KW
1712 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1713 DFA_RETURN_SUCCESS_,
1714 goto check_hanguls,
1715 DFA_RETURN_FAILURE_);
1716 check_hanguls:
67049a5f 1717
50f7a4ce
KW
1718 /* Here, we didn't return success, but dropped out of the loop. In the
1719 * case of PL_strict_utf8_dfa_tab, this means the input is either
1720 * malformed, or was for certain Hanguls; handle them specially */
67049a5f 1721
67260a96
KW
1722 /* The dfa above drops out for incomplete or illegal inputs, and certain
1723 * legal Hanguls; check and return accordingly */
1724 return is_HANGUL_ED_utf8_safe(s0, e);
67049a5f
KW
1725}
1726
1727/*
1728
44170c9a 1729=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1730
1731Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1732looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1733Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1734the value gives how many bytes starting at C<s> comprise the code point's
1735representation. Any bytes remaining before C<e>, but beyond the ones needed to
1736form the first code point in C<s>, are not examined.
1737
1738The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1739differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1740code points. This corresponds to
1741L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>,
1742which said that non-character code points are merely discouraged rather than
1743completely forbidden in open interchange. See
1744L<perlunicode/Noncharacter code points>.
1745
1746Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1747C<L</isUTF8_CHAR_flags>> for a more customized definition.
1748
1749Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1750C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1751
1752=cut
1753
1754This uses an adaptation of the tables and algorithm given in
f6521f7c 1755https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1756documentation of the original version. A copyright notice for the original
1757version is given at the beginning of this file. The Perl adaptation is
71525f77 1758documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1759
1760*/
1761
1762PERL_STATIC_INLINE Size_t
c9182d9c 1763Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64 1764{
c5bfbb64
KW
1765 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1766
50f7a4ce
KW
1767 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1768 DFA_RETURN_SUCCESS_,
1769 DFA_RETURN_FAILURE_,
1770 DFA_RETURN_FAILURE_);
c5bfbb64
KW
1771}
1772
1773/*
1774
9f2abfde
KW
1775=for apidoc is_strict_utf8_string_loc
1776
1777Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1778case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1779"utf8ness success") in the C<ep> pointer.
1780
1781See also C<L</is_strict_utf8_string_loclen>>.
1782
1783=cut
1784*/
1785
/* Same as is_strict_utf8_string_loclen() with no character count requested;
 * the unused output pointer is passed as NULL */
#define is_strict_utf8_string_loc(s, len, ep)                               \
                        is_strict_utf8_string_loclen(s, len, ep, NULL)
1788
1789/*
1790
1791=for apidoc is_strict_utf8_string_loclen
1792
1793Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1794case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1795"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1796encoded characters in the C<el> pointer.
1797
1798See also C<L</is_strict_utf8_string_loc>>.
1799
1800=cut
1801*/
1802
1803PERL_STATIC_INLINE bool
c9182d9c 1804Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1805{
33756530 1806 const U8 * first_variant;
9f2abfde
KW
1807
1808 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1809
33756530
KW
1810 if (len == 0) {
1811 len = strlen((const char *) s);
1812 }
1813
1814 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1815 if (el)
1816 *el = len;
1817
1818 if (ep) {
1819 *ep = s + len;
1820 }
1821
1822 return TRUE;
1823 }
1824
1825 {
1826 const U8* const send = s + len;
1827 const U8* x = first_variant;
1828 STRLEN outlen = first_variant - s;
1829
a0d7f935
KW
1830 while (x < send) {
1831 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1832 if (UNLIKELY(! cur_len)) {
1833 break;
1834 }
1835 x += cur_len;
1836 outlen++;
9f2abfde 1837 }
9f2abfde 1838
a0d7f935
KW
1839 if (el)
1840 *el = outlen;
9f2abfde 1841
a0d7f935
KW
1842 if (ep) {
1843 *ep = x;
1844 }
9f2abfde 1845
a0d7f935 1846 return (x == send);
33756530 1847 }
9f2abfde
KW
1848}
1849
1850/*
1851
1852=for apidoc is_c9strict_utf8_string_loc
1853
1854Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1855the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1856"utf8ness success") in the C<ep> pointer.
1857
1858See also C<L</is_c9strict_utf8_string_loclen>>.
1859
1860=cut
1861*/
1862
/* Same check as is_c9strict_utf8_string_loclen(), for callers that do not
 * want the character count: the 'el' argument is passed as 0. */
#define is_c9strict_utf8_string_loc(s, len, ep)                             \
                            is_c9strict_utf8_string_loclen(s, len, ep, 0)
1865
1866/*
1867
1868=for apidoc is_c9strict_utf8_string_loclen
1869
1870Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1871the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1872"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1873characters in the C<el> pointer.
1874
1875See also C<L</is_c9strict_utf8_string_loc>>.
1876
1877=cut
1878*/
1879
1880PERL_STATIC_INLINE bool
c9182d9c 1881Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1882{
33756530 1883 const U8 * first_variant;
9f2abfde
KW
1884
1885 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1886
33756530
KW
1887 if (len == 0) {
1888 len = strlen((const char *) s);
1889 }
1890
1891 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1892 if (el)
1893 *el = len;
1894
1895 if (ep) {
1896 *ep = s + len;
1897 }
1898
1899 return TRUE;
1900 }
1901
1902 {
1903 const U8* const send = s + len;
1904 const U8* x = first_variant;
1905 STRLEN outlen = first_variant - s;
1906
a0d7f935
KW
1907 while (x < send) {
1908 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1909 if (UNLIKELY(! cur_len)) {
1910 break;
1911 }
1912 x += cur_len;
1913 outlen++;
9f2abfde 1914 }
9f2abfde 1915
a0d7f935
KW
1916 if (el)
1917 *el = outlen;
9f2abfde 1918
a0d7f935
KW
1919 if (ep) {
1920 *ep = x;
1921 }
9f2abfde 1922
a0d7f935 1923 return (x == send);
33756530 1924 }
9f2abfde
KW
1925}
1926
1927/*
1928
1929=for apidoc is_utf8_string_loc_flags
1930
1931Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1932case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1933"utf8ness success") in the C<ep> pointer.
1934
1935See also C<L</is_utf8_string_loclen_flags>>.
1936
1937=cut
1938*/
1939
/* Same check as is_utf8_string_loclen_flags(), for callers that do not want
 * the character count: the 'el' argument is passed as 0. */
#define is_utf8_string_loc_flags(s, len, ep, flags)                         \
                            is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1942
1943
1944/* The above 3 actual functions could have been moved into the more general one
1945 * just below, and made #defines that call it with the right 'flags'. They are
1946 * currently kept separate to increase their chances of getting inlined */
1947
1948/*
1949
1950=for apidoc is_utf8_string_loclen_flags
1951
1952Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1953case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1954"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1955encoded characters in the C<el> pointer.
1956
1957See also C<L</is_utf8_string_loc_flags>>.
1958
1959=cut
1960*/
1961
1962PERL_STATIC_INLINE bool
c9182d9c 1963Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1964{
33756530 1965 const U8 * first_variant;
9f2abfde
KW
1966
1967 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1968 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1969 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1970
f60f61fd 1971 if (len == 0) {
a0d7f935 1972 len = strlen((const char *) s);
f60f61fd
KW
1973 }
1974
9f2abfde
KW
1975 if (flags == 0) {
1976 return is_utf8_string_loclen(s, len, ep, el);
1977 }
1978
d044b7a7 1979 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1980 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1981 {
1982 return is_strict_utf8_string_loclen(s, len, ep, el);
1983 }
1984
d044b7a7 1985 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1986 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1987 {
1988 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1989 }
1990
33756530
KW
1991 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1992 if (el)
1993 *el = len;
1994
1995 if (ep) {
1996 *ep = s + len;
1997 }
1998
1999 return TRUE;
2000 }
2001
2002 {
2003 const U8* send = s + len;
2004 const U8* x = first_variant;
2005 STRLEN outlen = first_variant - s;
2006
a0d7f935
KW
2007 while (x < send) {
2008 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2009 if (UNLIKELY(! cur_len)) {
2010 break;
2011 }
2012 x += cur_len;
2013 outlen++;
9f2abfde 2014 }
9f2abfde 2015
a0d7f935
KW
2016 if (el)
2017 *el = outlen;
9f2abfde 2018
a0d7f935
KW
2019 if (ep) {
2020 *ep = x;
2021 }
9f2abfde 2022
a0d7f935 2023 return (x == send);
33756530 2024 }
9f2abfde
KW
2025}
2026
2027/*
7c93d8f0
KW
2028=for apidoc utf8_distance
2029
2030Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
2031and C<b>.
2032
2033WARNING: use only if you *know* that the pointers point inside the
2034same UTF-8 buffer.
2035
2036=cut
2037*/
2038
2039PERL_STATIC_INLINE IV
2040Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
2041{
2042 PERL_ARGS_ASSERT_UTF8_DISTANCE;
2043
2044 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2045}
2046
2047/*
2048=for apidoc utf8_hop
2049
2050Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2051forward or backward.
2052
2053WARNING: do not use the following unless you *know* C<off> is within
2054the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
2055on the first byte of a character or just after the last byte of a character.
2056
2057=cut
2058*/
2059
2060PERL_STATIC_INLINE U8 *
2061Perl_utf8_hop(const U8 *s, SSize_t off)
2062{
2063 PERL_ARGS_ASSERT_UTF8_HOP;
2064
2065 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2066 * the bitops (especially ~) can create illegal UTF-8.
2067 * In other words: in Perl UTF-8 is not just for Unicode. */
2068
2069 if (off >= 0) {
1604cfb0
MS
2070 while (off--)
2071 s += UTF8SKIP(s);
7c93d8f0
KW
2072 }
2073 else {
1604cfb0
MS
2074 while (off++) {
2075 s--;
2076 while (UTF8_IS_CONTINUATION(*s))
2077 s--;
2078 }
7c93d8f0 2079 }
e099ea69 2080 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 2081 return (U8 *)s;
e099ea69 2082 GCC_DIAG_RESTORE
7c93d8f0
KW
2083}
2084
4dab108f 2085/*
65df57a8
TC
2086=for apidoc utf8_hop_forward
2087
2088Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2089forward.
2090
2091C<off> must be non-negative.
2092
2093C<s> must be before or equal to C<end>.
2094
2095When moving forward it will not move beyond C<end>.
2096
2097Will not exceed this limit even if the string is not valid "UTF-8".
2098
2099=cut
2100*/
2101
2102PERL_STATIC_INLINE U8 *
2103Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2104{
2105 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2106
2107 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2108 * the bitops (especially ~) can create illegal UTF-8.
2109 * In other words: in Perl UTF-8 is not just for Unicode. */
2110
2111 assert(s <= end);
2112 assert(off >= 0);
2113
2114 while (off--) {
2115 STRLEN skip = UTF8SKIP(s);
de979548 2116 if ((STRLEN)(end - s) <= skip) {
e099ea69 2117 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2118 return (U8 *)end;
e099ea69 2119 GCC_DIAG_RESTORE
de979548 2120 }
65df57a8
TC
2121 s += skip;
2122 }
2123
e099ea69 2124 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2125 return (U8 *)s;
e099ea69 2126 GCC_DIAG_RESTORE
65df57a8
TC
2127}
2128
2129/*
2130=for apidoc utf8_hop_back
2131
2132Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2133backward.
2134
2135C<off> must be non-positive.
2136
2137C<s> must be after or equal to C<start>.
2138
2139When moving backward it will not move before C<start>.
2140
2141Will not exceed this limit even if the string is not valid "UTF-8".
2142
2143=cut
2144*/
2145
2146PERL_STATIC_INLINE U8 *
2147Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2148{
2149 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2150
2151 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2152 * the bitops (especially ~) can create illegal UTF-8.
2153 * In other words: in Perl UTF-8 is not just for Unicode. */
2154
2155 assert(start <= s);
2156 assert(off <= 0);
2157
2158 while (off++ && s > start) {
e7185695 2159 do {
65df57a8 2160 s--;
e7185695 2161 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 2162 }
f6521f7c 2163
e099ea69 2164 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2165 return (U8 *)s;
e099ea69 2166 GCC_DIAG_RESTORE
65df57a8
TC
2167}
2168
2169/*
2170=for apidoc utf8_hop_safe
2171
2172Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2173either forward or backward.
2174
2175When moving backward it will not move before C<start>.
2176
2177When moving forward it will not move beyond C<end>.
2178
2179Will not exceed those limits even if the string is not valid "UTF-8".
2180
2181=cut
2182*/
2183
2184PERL_STATIC_INLINE U8 *
2185Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2186{
2187 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2188
2189 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2190 * the bitops (especially ~) can create illegal UTF-8.
2191 * In other words: in Perl UTF-8 is not just for Unicode. */
2192
2193 assert(start <= s && s <= end);
2194
2195 if (off >= 0) {
2196 return utf8_hop_forward(s, off, end);
2197 }
2198 else {
2199 return utf8_hop_back(s, off, start);
2200 }
2201}
2202
2203/*
4dab108f 2204
247cc51e 2205=for apidoc isUTF8_CHAR_flags
22f363ff
KW
2206
2207Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2208looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2209that represents some code point, subject to the restrictions given by C<flags>;
2210otherwise it evaluates to 0. If non-zero, the value gives how many bytes
2211starting at C<s> comprise the code point's representation. Any bytes remaining
2212before C<e>, but beyond the ones needed to form the first code point in C<s>,
2213are not examined.
2214
2215If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2216if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2217as C<L</isSTRICT_UTF8_CHAR>>;
2218and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2219the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2220Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2221understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2222
2223The three alternative macros are for the most commonly needed validations; they
2224are likely to run somewhat faster than this more general one, as they can be
2225inlined into your code.
2226
2227Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2228L</is_utf8_string_loclen_flags> to check entire strings.
2229
2230=cut
2231*/
2232
/* Inline implementation of isUTF8_CHAR_flags: examines the bytes from 's0' up
 * to but not including 'e', and returns either 0 or whatever the helper
 * function called below returns for that sequence under 'flags'. */
2233PERL_STATIC_INLINE STRLEN
2234Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2235{
2236 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
/* 'flags' may contain no bits outside these two masks */
2237 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2238 |UTF8_DISALLOW_PERL_EXTENDED)));
2239
/* NOTE(review): the three trailing macro arguments look like the actions
 * taken when the DFA accepts, rejects, or runs out of input, in that order;
 * confirm against the definition of PERL_IS_UTF8_CHAR_DFA */
2240 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2241 goto check_success,
2242 DFA_TEASE_APART_FF_,
2243 DFA_RETURN_FAILURE_);
2244
2245 check_success:
2246
/* Let the helper decide, applying the restrictions in 'flags' */
1aa501c2 2247 return is_utf8_char_helper_(s0, e, flags);
22f363ff
KW
2248
2249#ifdef HAS_EXTRA_LONG_UTF8
2250
/* NOTE(review): presumably DFA_TEASE_APART_FF_ jumps to this label; it is
 * defined elsewhere, so confirm there */
2251 tease_apart_FF:
2252
2253 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2254 * either malformed, or was for the largest possible start byte, which
2255 * indicates perl extended UTF-8, well above the Unicode maximum */
2256 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2257 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2258 {
2259 return 0;
2260 }
2261
2262 /* Otherwise examine the sequence not inline */
2263 return is_utf8_FF_helper_(s0, e,
2264 FALSE /* require full, not partial char */
2265 );
2266#endif
2267
2268}
2269
2270/*
2271
4dab108f
KW
2272=for apidoc is_utf8_valid_partial_char
2273
6cbb9248
KW
2274Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2275S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2276points. Otherwise, it returns 1 if there exists at least one non-empty
2277sequence of bytes that when appended to sequence C<s>, starting at position
2278C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2279otherwise returns 0.
2280
2281In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2282point.
2283
2284This is useful when a fixed-length buffer is being tested for being well-formed
2285UTF-8, but the final few bytes in it don't comprise a full character; that is,
2286it is split somewhere in the middle of the final code point's UTF-8
2287representation. (Presumably when the buffer is refreshed with the next chunk
2288of data, the new first bytes will complete the partial code point.) This
2289function is used to verify that the final bytes in the current buffer are in
2290fact the legal beginning of some code point, so that if they aren't, the
2291failure can be signalled without having to wait for the next read.
4dab108f
KW
2292
2293=cut
2294*/
2717076a
KW
/* Unrestricted form of is_utf8_valid_partial_char_flags(): 'flags' is passed
 * as 0. */
#define is_utf8_valid_partial_char(s, e)                                    \
                            is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
2297
2298/*
2299
2300=for apidoc is_utf8_valid_partial_char_flags
2301
2302Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2303or not the input is a valid UTF-8 encoded partial character, but it takes an
2304extra parameter, C<flags>, which can further restrict which code points are
2305considered valid.
2306
2307If C<flags> is 0, this behaves identically to
2308C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
2309of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
2310there is any sequence of bytes that can complete the input partial character in
2311such a way that a non-prohibited character is formed, the function returns
2717076a
KW
2312TRUE; otherwise FALSE. Non character code points cannot be determined based on
2313partial character input. But many of the other possible excluded types can be
f1c999a7
KW
2314determined from just the first one or two bytes.
2315
2316=cut
2317 */
2318
/* Inline implementation of is_utf8_valid_partial_char_flags: examines the
 * bytes from 's0' up to but not including 'e' and reports whether they are an
 * acceptable partial character under 'flags'. */
56e4cf64 2319PERL_STATIC_INLINE bool
22afef87 2320Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
4dab108f 2321{
f1c999a7 2322 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
/* 'flags' may contain no bits outside these two masks */
f1c999a7 2323 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 2324 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 2325
/* NOTE(review): the three trailing macro arguments look like the actions
 * taken when the DFA accepts, rejects, or runs out of input, in that order;
 * confirm against the definition of PERL_IS_UTF8_CHAR_DFA.  If so, a complete
 * character is reported as failure here, since it is not a partial one. */
22afef87
KW
2326 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2327 DFA_RETURN_FAILURE_,
2328 DFA_TEASE_APART_FF_,
2329 NOOP);
2330
2331 /* The NOOP above causes the DFA to drop down here iff the input was a
2332 * partial character. flags=0 => can return TRUE immediately; otherwise we
2333 * need to check (not inline) if the partial character is the beginning of
2334 * a disallowed one */
2335 if (flags == 0) {
2336 return TRUE;
2337 }
2338
/* cBOOL() turns the helper's return value into a boolean */
1aa501c2 2339 return cBOOL(is_utf8_char_helper_(s0, e, flags));
22afef87
KW
2340
2341#ifdef HAS_EXTRA_LONG_UTF8
2342
/* NOTE(review): presumably DFA_TEASE_APART_FF_ jumps to this label; it is
 * defined elsewhere, so confirm there */
2343 tease_apart_FF:
2344
2345 /* Getting here means the input is either malformed, or, in the case of
2346 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
2347 * latter case has to be extended UTF-8, so can fail immediately if that is
2348 * forbidden */
2349
2350 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2351 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2352 {
2353 return 0;
4dab108f
KW
2354 }
2355
22afef87
KW
2356 return is_utf8_FF_helper_(s0, e,
2357 TRUE /* Require to be a partial character */
2358 );
2359#endif
2360
4dab108f
KW
2361}
2362
8bc127bf
KW
2363/*
2364
2365=for apidoc is_utf8_fixed_width_buf_flags
2366
2367Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2368is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2369otherwise it returns FALSE.
2370
2371If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2372without restriction. If the final few bytes of the buffer do not form a
2373complete code point, this will return TRUE anyway, provided that
2374C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2375
2376If C<flags> is non-zero, it can be any combination of the
2377C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2378same meanings.
2379
2380This function differs from C<L</is_utf8_string_flags>> only in that the latter
2381returns FALSE if the final few bytes of the string don't form a complete code
2382point.
2383
2384=cut
2385 */
/* Form of is_utf8_fixed_width_buf_loclen_flags() for callers that want
 * neither the location nor the character count: 'ep' and 'el' are both
 * passed as 0. */
#define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
                is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2388
2389/*
2390
2391=for apidoc is_utf8_fixed_width_buf_loc_flags
2392
2393Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2394failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
2395to the beginning of any partial character at the end of the buffer; if there is
2396no partial character C<*ep> will contain C<s>+C<len>.
2397
2398See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2399
2400=cut
2401*/
2402
/* Form of is_utf8_fixed_width_buf_loclen_flags() for callers that do not
 * want the character count: the 'el' argument is passed as 0. */
#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
                is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2405
2406/*
2407
2408=for apidoc is_utf8_fixed_width_buf_loclen_flags
2409
2410Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2411complete, valid characters found in the C<el> pointer.
2412
2413=cut
2414*/
2415
2416PERL_STATIC_INLINE bool
c9182d9c 2417Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 2418 STRLEN len,
8bc127bf
KW
2419 const U8 **ep,
2420 STRLEN *el,
2421 const U32 flags)
2422{
2423 const U8 * maybe_partial;
2424
2425 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2426
2427 if (! ep) {
2428 ep = &maybe_partial;
2429 }
2430
2431 /* If it's entirely valid, return that; otherwise see if the only error is
2432 * that the final few bytes are for a partial character */
2433 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
2434 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2435}
2436
e6a4ffc3 2437PERL_STATIC_INLINE UV
c9182d9c 2438Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
2439 STRLEN curlen,
2440 STRLEN *retlen,
2441 const U32 flags,
2442 U32 * errors,
2443 AV ** msgs)
2444{
2445 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
2446 * simple cases, and, if necessary calls a helper function to deal with the
2447 * more complex ones. Almost all well-formed non-problematic code points
2448 * are considered simple, so that it's unlikely that the helper function
2449 * will need to be called.
2450 *
2451 * This is an adaptation of the tables and algorithm given in
f6521f7c 2452 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
2453 * comprehensive documentation of the original version. A copyright notice
2454 * for the original version is given at the beginning of this file. The
71525f77 2455 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
2456 */
2457
2458 const U8 * const s0 = s;
2459 const U8 * send = s0 + curlen;
a4609251
KW
2460 UV type;
2461 UV uv;
e6a4ffc3
KW
2462
2463 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2464
2465 /* This dfa is fast. If it accepts the input, it was for a well-formed,
2466 * non-problematic code point, which can be returned immediately.
2467 * Otherwise we call a helper function to figure out the more complicated
2468 * cases. */
2469
a4609251 2470 /* No calls from core pass in an empty string; non-core need a check */
d1e771d8
KW
2471#ifdef PERL_CORE
2472 assert(curlen > 0);
2473#else
2474 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2475 flags, errors, msgs);
2476#endif
e6a4ffc3 2477
a4609251 2478 type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3 2479
a4609251
KW
2480 /* The table is structured so that 'type' is 0 iff the input byte is
2481 * represented identically regardless of the UTF-8ness of the string */
2482 if (type == 0) { /* UTF-8 invariants are returned unchanged */
2483 uv = *s;
2484 }
2485 else {
2486 UV state = PL_strict_utf8_dfa_tab[256 + type];
2487 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
e6a4ffc3 2488
a4609251
KW
2489 while (++s < send) {
2490 type = PL_strict_utf8_dfa_tab[*s];
2491 state = PL_strict_utf8_dfa_tab[256 + state + type];
2492
2493 uv = UTF8_ACCUMULATE(uv, *s);
2494
2495 if (state == 0) {
2496 goto success;
2497 }
2498
2499 if (UNLIKELY(state == 1)) {
2500 break;
2501 }
e6a4ffc3
KW
2502 }
2503
a4609251
KW
2504 /* Here is potentially problematic. Use the full mechanism */
2505 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2506 errors, msgs);
2507 }
2508
2509 success:
2510 if (retlen) {
2511 *retlen = s - s0 + 1;
2512 }
2513 if (errors) {
2514 *errors = 0;
2515 }
2516 if (msgs) {
2517 *msgs = NULL;
e6a4ffc3
KW
2518 }
2519
a4609251 2520 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
2521}
2522
82651abe 2523PERL_STATIC_INLINE UV
9a9a6c98 2524Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 2525{
9a9a6c98 2526 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
2527
2528 assert(s < send);
2529
2530 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
2531
2532 /* EMPTY is not really allowed, and asserts on debugging builds. But
2533 * on non-debugging we have to deal with it, and this causes it to
2534 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 2535 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 2536 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
2537 }
2538 else {
2539 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
286a1bfd 2540 if (retlen && ret == 0 && (send <= s || *s != '\0')) {
82651abe
KW
2541 *retlen = (STRLEN) -1;
2542 }
2543
2544 return ret;
2545 }
2546}
2547
c8028aa6
TC
2548/* ------------------------------- perl.h ----------------------------- */
2549
2550/*
3f620621 2551=for apidoc_section $utility
dcccc8ff 2552
44170c9a 2553=for apidoc is_safe_syscall
c8028aa6 2554
1a0efc9a
KW
2555Test that the given C<pv> (with length C<len>) doesn't contain any internal
2556C<NUL> characters.
2557If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2558category, and return FALSE.
c8028aa6
TC
2559
2560Return TRUE if the name is safe.
2561
1a0efc9a
KW
2562C<what> and C<op_name> are used in any warning.
2563
796b6530 2564Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
2565
2566=cut
2567*/
2568
2569PERL_STATIC_INLINE bool
ffd62fc2
KW
2570Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2571{
c8028aa6
TC
2572 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2573 * perl itself uses xce*() functions which accept 8-bit strings.
2574 */
2575
2576 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2577
6c4650b3 2578 if (len > 1) {
c8028aa6 2579 char *null_at;
41188aa0 2580 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 2581 SETERRNO(ENOENT, LIB_INVARG);
1d505182 2582 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 2583 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 2584 what, op_name, pv, null_at+1);
c8028aa6
TC
2585 return FALSE;
2586 }
2587 }
2588
2589 return TRUE;
2590}
2591
2592/*
7cb3f959
TC
2593
2594Return true if the supplied filename has a newline character
fa6c7d00 2595immediately before the first (hopefully only) NUL.
7cb3f959
TC
2596
2597My original look at this incorrectly used the len from SvPV(), but
2598that's incorrect, since we allow for a NUL in pv[len-1].
2599
2600So instead, strlen() and work from there.
2601
2602This allows for the user reading a filename, forgetting to chomp it,
2603then calling:
2604
2605 open my $foo, "$file\0";
2606
2607*/
2608
2609#ifdef PERL_CORE
2610
2611PERL_STATIC_INLINE bool
ffd62fc2
KW
2612S_should_warn_nl(const char *pv)
2613{
7cb3f959
TC
2614 STRLEN len;
2615
2616 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2617
2618 len = strlen(pv);
2619
2620 return len > 0 && pv[len-1] == '\n';
2621}
2622
2623#endif
2624
3a019afd
KW
2625#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2626
2627PERL_STATIC_INLINE bool
2628S_lossless_NV_to_IV(const NV nv, IV *ivp)
2629{
2630 /* This function determines if the input NV 'nv' may be converted without
2631 * loss of data to an IV. If not, it returns FALSE taking no other action.
2632 * But if it is possible, it does the conversion, returning TRUE, and
2633 * storing the converted result in '*ivp' */
2634
2635 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2636
cd304e76
DM
2637# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2638 /* Normally any comparison with a NaN returns false; if we can't rely
2639 * on that behaviour, check explicitly */
3a019afd
KW
2640 if (UNLIKELY(Perl_isnan(nv))) {
2641 return FALSE;
2642 }
3a019afd
KW
2643# endif
2644
cd304e76
DM
2645 /* Written this way so that with an always-false NaN comparison we
2646 * return false */
ef0a8475 2647 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
3a019afd
KW
2648 return FALSE;
2649 }
2650
2651 if ((IV) nv != nv) {
2652 return FALSE;
2653 }
2654
2655 *ivp = (IV) nv;
2656 return TRUE;
2657}
2658
2659#endif
2660
81d52ecd
JH
2661/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2662
94b0cb42
KW
2663#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2664
81d52ecd
JH
2665#define MAX_CHARSET_NAME_LENGTH 2
2666
2667PERL_STATIC_INLINE const char *
94b0cb42 2668S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2669{
94b0cb42
KW
2670 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2671
81d52ecd
JH
2672 /* Returns a string that corresponds to the name of the regex character set
2673 * given by 'flags', and *lenp is set the length of that string, which
2674 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2675
2676 *lenp = 1;
2677 switch (get_regex_charset(flags)) {
2678 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2679 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2680 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
1604cfb0
MS
2681 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2682 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2683 *lenp = 2;
2684 return ASCII_MORE_RESTRICT_PAT_MODS;
81d52ecd
JH
2685 }
2686 /* The NOT_REACHED; hides an assert() which has a rather complex
2687 * definition in perl.h. */
2688 NOT_REACHED; /* NOTREACHED */
2689 return "?"; /* Unknown */
2690}
2691
94b0cb42
KW
2692#endif
2693
7cb3f959 2694/*
ed382232
TC
2695
2696Return false if any get magic is on the SV other than taint magic.
2697
2698*/
2699
2700PERL_STATIC_INLINE bool
ffd62fc2
KW
2701Perl_sv_only_taint_gmagic(SV *sv)
2702{
ed382232
TC
2703 MAGIC *mg = SvMAGIC(sv);
2704
2705 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2706
2707 while (mg) {
2708 if (mg->mg_type != PERL_MAGIC_taint
2709 && !(mg->mg_flags & MGf_GSKIP)
2710 && mg->mg_virtual->svt_get) {
2711 return FALSE;
2712 }
2713 mg = mg->mg_moremagic;
2714 }
2715
2716 return TRUE;
2717}
2718
ed8ff0f3
DM
2719/* ------------------ cop.h ------------------------------------------- */
2720
5b6f7443
DM
2721/* implement GIMME_V() macro */
2722
2723PERL_STATIC_INLINE U8
2724Perl_gimme_V(pTHX)
2725{
2726 I32 cxix;
2727 U8 gimme = (PL_op->op_flags & OPf_WANT);
2728
2729 if (gimme)
2730 return gimme;
2731 cxix = PL_curstackinfo->si_cxsubix;
2732 if (cxix < 0)
390fe0c0 2733 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
5b6f7443
DM
2734 assert(cxstack[cxix].blk_gimme & G_WANT);
2735 return (cxstack[cxix].blk_gimme & G_WANT);
2736}
2737
ed8ff0f3
DM
2738
2739/* Enter a block. Push a new base context and return its address. */
2740
2741PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2742Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2743{
2744 PERL_CONTEXT * cx;
2745
2746 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2747
2748 CXINC;
2749 cx = CX_CUR();
2750 cx->cx_type = type;
2751 cx->blk_gimme = gimme;
2752 cx->blk_oldsaveix = saveix;
4caf7d8c 2753 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2754 cx->blk_oldcop = PL_curcop;
4caf7d8c 2755 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2756 cx->blk_oldscopesp = PL_scopestack_ix;
2757 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2758 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2759
2760 PL_tmps_floor = PL_tmps_ix;
2761 CX_DEBUG(cx, "PUSH");
2762 return cx;
2763}
2764
2765
2766/* Exit a block (RETURN and LAST). */
2767
2768PERL_STATIC_INLINE void
c9182d9c 2769Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2770{
2771 PERL_ARGS_ASSERT_CX_POPBLOCK;
2772
2773 CX_DEBUG(cx, "POP");
2774 /* these 3 are common to cx_popblock and cx_topblock */
2775 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2776 PL_scopestack_ix = cx->blk_oldscopesp;
2777 PL_curpm = cx->blk_oldpm;
2778
2779 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2780 * and leaves a CX entry lying around for repeated use, so
2781 * skip for multicall */ \
2782 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2783 || PL_savestack_ix == cx->blk_oldsaveix);
2784 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2785 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2786}
2787
2788/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2789 * Whereas cx_popblock() restores the state to the point just before
2790 * cx_pushblock() was called, cx_topblock() restores it to the point just
2791 * *after* cx_pushblock() was called. */
2792
2793PERL_STATIC_INLINE void
c9182d9c 2794Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2795{
2796 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2797
2798 CX_DEBUG(cx, "TOP");
2799 /* these 3 are common to cx_popblock and cx_topblock */
2800 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2801 PL_scopestack_ix = cx->blk_oldscopesp;
2802 PL_curpm = cx->blk_oldpm;
2803
2804 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2805}
2806
2807
a73d8813 2808PERL_STATIC_INLINE void
c9182d9c 2809Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2810{
2811 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2812
2813 PERL_ARGS_ASSERT_CX_PUSHSUB;
2814
3f6bd23a 2815 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2816 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2817 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2818 cx->blk_sub.cv = cv;
2819 cx->blk_sub.olddepth = CvDEPTH(cv);
2820 cx->blk_sub.prevcomppad = PL_comppad;
2821 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2822 cx->blk_sub.retop = retop;
2823 SvREFCNT_inc_simple_void_NN(cv);
2824 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2825}
2826
2827
2828/* subsets of cx_popsub() */
2829
2830PERL_STATIC_INLINE void
c9182d9c 2831Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2832{
2833 CV *cv;
2834
2835 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2836 assert(CxTYPE(cx) == CXt_SUB);
2837
2838 PL_comppad = cx->blk_sub.prevcomppad;
2839 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2840 cv = cx->blk_sub.cv;
2841 CvDEPTH(cv) = cx->blk_sub.olddepth;
2842 cx->blk_sub.cv = NULL;
2843 SvREFCNT_dec(cv);
5b6f7443 2844 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2845}
2846
2847
2848/* handle the @_ part of leaving a sub */
2849
2850PERL_STATIC_INLINE void
c9182d9c 2851Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2852{
2853 AV *av;
2854
2855 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2856 assert(CxTYPE(cx) == CXt_SUB);
2857 assert(AvARRAY(MUTABLE_AV(
2858 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2859 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2860
2861 CX_POP_SAVEARRAY(cx);
2862 av = MUTABLE_AV(PAD_SVl(0));
2863 if (UNLIKELY(AvREAL(av)))
2864 /* abandon @_ if it got reified */
2865 clear_defarray(av, 0);
2866 else {
2867 CLEAR_ARGARRAY(av);
2868 }
2869}
2870
2871
2872PERL_STATIC_INLINE void
c9182d9c 2873Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2874{
2875 PERL_ARGS_ASSERT_CX_POPSUB;
2876 assert(CxTYPE(cx) == CXt_SUB);
2877
3f6bd23a 2878 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2879
2880 if (CxHASARGS(cx))
2881 cx_popsub_args(cx);
2882 cx_popsub_common(cx);
2883}
2884
2885
6a7d52cc 2886PERL_STATIC_INLINE void
c9182d9c 2887Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2888{
2889 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2890
5b6f7443
DM
2891 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2892 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2893 cx->blk_format.cv = cv;
2894 cx->blk_format.retop = retop;
2895 cx->blk_format.gv = gv;
2896 cx->blk_format.dfoutgv = PL_defoutgv;
2897 cx->blk_format.prevcomppad = PL_comppad;
2898 cx->blk_u16 = 0;
2899
2900 SvREFCNT_inc_simple_void_NN(cv);
2901 CvDEPTH(cv)++;
2902 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2903}
2904
2905
2906PERL_STATIC_INLINE void
c9182d9c 2907Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2908{
2909 CV *cv;
2910 GV *dfout;
2911
2912 PERL_ARGS_ASSERT_CX_POPFORMAT;
2913 assert(CxTYPE(cx) == CXt_FORMAT);
2914
2915 dfout = cx->blk_format.dfoutgv;
2916 setdefout(dfout);
2917 cx->blk_format.dfoutgv = NULL;
2918 SvREFCNT_dec_NN(dfout);
2919
2920 PL_comppad = cx->blk_format.prevcomppad;
2921 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2922 cv = cx->blk_format.cv;
2923 cx->blk_format.cv = NULL;
2924 --CvDEPTH(cv);
2925 SvREFCNT_dec_NN(cv);
5b6f7443 2926 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2927}
2928
2929
13febba5 2930PERL_STATIC_INLINE void
6b729d24 2931Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5 2932{
13febba5
DM
2933 cx->blk_eval.retop = retop;
2934 cx->blk_eval.old_namesv = namesv;
2935 cx->blk_eval.old_eval_root = PL_eval_root;
2936 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2937 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2938 cx->blk_eval.cur_top_env = PL_top_env;
2939
4c57ced5 2940 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2941 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2942 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2943}
2944
6b729d24
TC
2945PERL_STATIC_INLINE void
2946Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2947{
2948 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2949
2950 Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2951
2952 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2953 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2954}
2955
2956PERL_STATIC_INLINE void
2957Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2958{
2959 PERL_ARGS_ASSERT_CX_PUSHTRY;
2960
2961 Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2962
2963 /* Don't actually change it, just store the current value so it's restored
2964 * by the common popeval */
2965 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2966}
2967
13febba5
DM
2968
2969PERL_STATIC_INLINE void
c9182d9c 2970Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2971{
2972 SV *sv;
2973
2974 PERL_ARGS_ASSERT_CX_POPEVAL;
2975 assert(CxTYPE(cx) == CXt_EVAL);
2976
2977 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2978 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2979 PL_eval_root = cx->blk_eval.old_eval_root;
2980 sv = cx->blk_eval.cur_text;
4c57ced5 2981 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2982 cx->blk_eval.cur_text = NULL;
2983 SvREFCNT_dec_NN(sv);
2984 }
2985
2986 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2987 if (sv) {
2988 cx->blk_eval.old_namesv = NULL;
2989 SvREFCNT_dec_NN(sv);
2990 }
5b6f7443 2991 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2992}
6a7d52cc 2993
a73d8813 2994
d1b6bf72
DM
2995/* push a plain loop, i.e.
2996 * { block }
2997 * while (cond) { block }
2998 * for (init;cond;continue) { block }
2999 * This loop can be last/redo'ed etc.
3000 */
3001
3002PERL_STATIC_INLINE void
c9182d9c 3003Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
3004{
3005 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3006 cx->blk_loop.my_op = cLOOP;
3007}
3008
3009
3010/* push a true for loop, i.e.
3011 * for var (list) { block }
3012 */
3013
3014PERL_STATIC_INLINE void
c9182d9c 3015Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
3016{
3017 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3018
3019 /* this one line is common with cx_pushloop_plain */
3020 cx->blk_loop.my_op = cLOOP;
3021
3022 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3023 cx->blk_loop.itersave = itersave;
3024#ifdef USE_ITHREADS
3025 cx->blk_loop.oldcomppad = PL_comppad;
3026#endif
3027}
3028
3029
3030/* pop all loop types, including plain */
3031
3032PERL_STATIC_INLINE void
c9182d9c 3033Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
3034{
3035 PERL_ARGS_ASSERT_CX_POPLOOP;
3036
3037 assert(CxTYPE_is_LOOP(cx));
3038 if ( CxTYPE(cx) == CXt_LOOP_ARY
3039 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3040 {
3041 /* Free ary or cur. This assumes that state_u.ary.ary
3042 * aligns with state_u.lazysv.cur. See cx_dup() */
3043 SV *sv = cx->blk_loop.state_u.lazysv.cur;
3044 cx->blk_loop.state_u.lazysv.cur = NULL;
3045 SvREFCNT_dec_NN(sv);
3046 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3047 sv = cx->blk_loop.state_u.lazysv.end;
3048 cx->blk_loop.state_u.lazysv.end = NULL;
3049 SvREFCNT_dec_NN(sv);
3050 }
3051 }
3052 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3053 SV *cursv;
3054 SV **svp = (cx)->blk_loop.itervar_u.svp;
3055 if ((cx->cx_type & CXp_FOR_GV))
3056 svp = &GvSV((GV*)svp);
3057 cursv = *svp;
3058 *svp = cx->blk_loop.itersave;
3059 cx->blk_loop.itersave = NULL;
3060 SvREFCNT_dec(cursv);
3061 }
3062}
3063
2a7b7c61
DM
3064
3065PERL_STATIC_INLINE void
c9182d9c 3066Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 3067{
7896dde7 3068 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 3069
7896dde7 3070 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
3071}
3072
3073
3074PERL_STATIC_INLINE void
c9182d9c 3075Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 3076{
7896dde7
Z
3077 PERL_ARGS_ASSERT_CX_POPWHEN;
3078 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
3079
3080 PERL_UNUSED_ARG(cx);
59a14f30 3081 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
3082 /* currently NOOP */
3083}
3084
3085
7896dde7 3086PERL_STATIC_INLINE void
c9182d9c 3087Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
3088{
3089 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3090
3091 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3092 cx->blk_givwhen.defsv_save = orig_defsv;
3093}
3094
3095
3096PERL_STATIC_INLINE void
c9182d9c 3097Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
3098{
3099 SV *sv;
3100
3101 PERL_ARGS_ASSERT_CX_POPGIVEN;
3102 assert(CxTYPE(cx) == CXt_GIVEN);
3103
3104 sv = GvSV(PL_defgv);
3105 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3106 cx->blk_givwhen.defsv_save = NULL;
3107 SvREFCNT_dec(sv);
3108}
3109
ec2c235b
KW
3110/* ------------------ util.h ------------------------------------------- */
3111
3112/*
3f620621 3113=for apidoc_section $string
ec2c235b
KW
3114
3115=for apidoc foldEQ
3116
3117Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3118same
3119case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
3120match themselves and their opposite case counterparts. Non-cased and non-ASCII
3121range bytes match only themselves.
3122
3123=cut
3124*/
3125
3126PERL_STATIC_INLINE I32
3127Perl_foldEQ(const char *s1, const char *s2, I32 len)
3128{
3129 const U8 *a = (const U8 *)s1;
3130 const U8 *b = (const U8 *)s2;
3131
3132 PERL_ARGS_ASSERT_FOLDEQ;
3133
3134 assert(len >= 0);
3135
3136 while (len--) {
1604cfb0
MS
3137 if (*a != *b && *a != PL_fold[*b])
3138 return 0;
3139 a++,b++;
ec2c235b
KW
3140 }
3141 return 1;
3142}
3143
0f9cb40c 3144PERL_STATIC_INLINE I32
ec2c235b
KW
3145Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
3146{
79a1fabd
KW
3147 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
3148 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3149 * does not check for this. Nor does it check that the strings each have
3150 * at least 'len' characters. */
ec2c235b
KW
3151
3152 const U8 *a = (const U8 *)s1;
3153 const U8 *b = (const U8 *)s2;
3154
3155 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3156
3157 assert(len >= 0);
3158
3159 while (len--) {
1604cfb0
MS
3160 if (*a != *b && *a != PL_fold_latin1[*b]) {
3161 return 0;
3162 }
3163 a++, b++;
ec2c235b
KW
3164 }
3165 return 1;
3166}
3167
3168/*
3f620621 3169=for apidoc_section $locale
ec2c235b
KW
3170=for apidoc foldEQ_locale
3171
3172Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3173same case-insensitively in the current locale; false otherwise.
3174
3175=cut
3176*/
3177
0f9cb40c 3178PERL_STATIC_INLINE I32
ec2c235b
KW
3179Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
3180{
ec2c235b
KW
3181 const U8 *a = (const U8 *)s1;
3182 const U8 *b = (const U8 *)s2;
3183
3184 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3185
3186 assert(len >= 0);
3187
3188 while (len--) {
1604cfb0
MS
3189 if (*a != *b && *a != PL_fold_locale[*b])
3190 return 0;
3191 a++,b++;
ec2c235b
KW
3192 }
3193 return 1;
3194}
3195
1ab100a8 3196/*
3f620621 3197=for apidoc_section $string
1ab100a8
KW
3198=for apidoc my_strnlen
3199
3200The C library C<strnlen> if available, or a Perl implementation of it.
3201
3202C<my_strnlen()> computes the length of the string, up to C<maxlen>
a3815e44 3203characters. It will never attempt to address more than C<maxlen>
1ab100a8
KW
3204characters, making it suitable for use with strings that are not
3205guaranteed to be NUL-terminated.
3206
3207=cut
3208
3209Description stolen from http://man.openbsd.org/strnlen.3,
3210implementation stolen from PostgreSQL.
3211*/
3212#ifndef HAS_STRNLEN
3213
3214PERL_STATIC_INLINE Size_t
3215Perl_my_strnlen(const char *str, Size_t maxlen)
3216{
3217 const char *end = (char *) memchr(str, '\0', maxlen);
3218
3219 PERL_ARGS_ASSERT_MY_STRNLEN;
3220
3221 if (end == NULL) return maxlen;
3222 return end - str;
3223}
3224
3225#endif
3226
6dba01e2
KW
3227#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3228
3229PERL_STATIC_INLINE void *
3230S_my_memrchr(const char * s, const char c, const STRLEN len)
3231{
3232 /* memrchr(), since many platforms lack it */
3233
3234 const char * t = s + len - 1;
3235
3236 PERL_ARGS_ASSERT_MY_MEMRCHR;
3237
3238 while (t >= s) {
3239 if (*t == c) {
3240 return (void *) t;
3241 }
3242 t--;
3243 }
3244
3245 return NULL;
3246}
3247
3248#endif
3249
24f3e849
KW
3250PERL_STATIC_INLINE char *
3251Perl_mortal_getenv(const char * str)
3252{
3253 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3254 *
03694582
KW
3255 * It's (mostly) thread-safe because it uses a mutex to prevent other
3256 * threads (that look at this mutex) from destroying the result before this
3257 * routine has a chance to copy the result to a place that won't be
3258 * destroyed before the caller gets a chance to handle it. That place is a
3259 * mortal SV. khw chose this over SAVEFREEPV because he is under the
3260 * impression that the SV will hang around longer under more circumstances
24f3e849 3261 *
03694582
KW
3262 * The reason it isn't completely thread-safe is that other code could
3263 * simply not pay attention to the mutex. All of the Perl core uses the
3264 * mutex, but it is possible for code from, say XS, to not use this mutex,
3265 * defeating the safety.
24f3e849 3266 *
03694582
KW
3267 * getenv() returns, in some implementations, a pointer to a spot in the
3268 * **environ array, which could be invalidated at any time by this or
3269 * another thread changing the environment. Other implementations copy the
3270 * **environ value to a static buffer, returning a pointer to that. That
3271 * buffer might or might not be invalidated by a getenv() call in another
3272 * thread. If it does get zapped, we need an exclusive lock. Otherwise,
3273 * many getenv() calls can safely be running simultaneously, so a
3274 * many-reader (but no simultaneous writers) lock is ok. There is a
3275 * Configure probe to see if another thread destroys the buffer, and the
3276 * mutex is defined accordingly.
3277 *
3278 * But in all cases, using the mutex prevents these problems, as long as
3279 * all code uses the same mutex..
24f3e849
KW
3280 *
3281 * A complication is that this can be called during phases where the
3282 * mortalization process isn't available. These are in interpreter
3283 * destruction or early in construction. khw believes that at these times
3284 * there shouldn't be anything else going on, so plain getenv is safe AS
3285 * LONG AS the caller acts on the return before calling it again. */
3286
3287 char * ret;
3288 dTHX;
3289
3290 PERL_ARGS_ASSERT_MORTAL_GETENV;
3291
3292 /* Can't mortalize without stacks. khw believes that no other threads
3293 * should be running, so no need to lock things, and this may be during a
3294 * phase when locking isn't even available */
3295 if (UNLIKELY(PL_scopestack_ix == 0)) {
3296 return getenv(str);
3297 }
3298
03694582
KW
3299#ifdef PERL_MEM_LOG
3300
3301 /* A major complication arises under PERL_MEM_LOG. When that is active,
3302 * every memory allocation may result in logging, depending on the value of
3303 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
3304 * saving ENV{foo}'s value (but before saving it), the logging code will
3305 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
3306 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3307 * lock a boolean mutex recursively); 3) destroying the getenv() static
3308 * buffer; or 4) destroying the temporary created by this for the copy
3309 * causes a log entry to be made which could cause a new temporary to be
3310 * created, which will need to be destroyed at some point, leading to an
3311 * infinite loop.
3312 *
3313 * The solution adopted here (after some gnashing of teeth) is to detect
3314 * the recursive calls and calls from the logger, and treat them specially.
3315 * Let's say we want to do getenv("foo"). We first find
3316 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3317 * variable, so no temporary is required. Then we do getenv(foo}, and in
3318 * the process of creating a temporary to save it, this function will be
3319 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
3320 * we detect that it is such a call and return our saved value instead of
3321 * locking and doing a new getenv(). This solves all of problems 1), 2),
3322 * and 3). Because all the getenv()s are done while the mutex is locked,
3323 * the state cannot have changed. To solve 4), we don't create a temporary
3324 * when this is called from the logging code. That code disposes of the
3325 * return value while the mutex is still locked.
3326 *
3327 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3328 * digits and 3 particular letters are significant; the rest are ignored by
3329 * the memory logging code. Thus the per-interpreter variable only needs
3330 * to be large enough to save the significant information, the size of
3331 * which is known at compile time. The first byte is extra, reserved for
3332 * flags for our use. To protect against overflowing, only the reserved
3333 * byte, as many digits as don't overflow, and the three letters are
3334 * stored.
3335 *
3336 * The reserved byte has two bits:
3337 * 0x1 if set indicates that if we get here, it is a recursive call of
3338 * getenv()
3339 * 0x2 if set indicates that the call is from the logging code.
3340 *
3341 * If the flag indicates this is a recursive call, just return the stored
3342 * value of PL_mem_log; An empty value gets turned into NULL. */
3343 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3344 if (PL_mem_log[1] == '\0') {
3345 return NULL;
3346 } else {
3347 return PL_mem_log + 1;
3348 }
3349 }
3350
3351#endif
3352
35bcf7ff 3353 GETENV_LOCK;
24f3e849 3354
03694582
KW
3355#ifdef PERL_MEM_LOG
3356
3357 /* Here we are in a critical section. As explained above, we do our own
3358 * getenv(PERL_MEM_LOG), saving the result safely. */
3359 ret = getenv("PERL_MEM_LOG");
3360 if (ret == NULL) { /* No logging active */
3361
3362 /* Return that immediately if called from the logging code */
3363 if (PL_mem_log[0] & 0x2) {
3364 GETENV_UNLOCK;
3365 return NULL;
3366 }
3367
3368 PL_mem_log[1] = '\0';
3369 }
3370 else {
3371 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
3372
3373 /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3374 * extremely long string. But we want only a few characters from it.
3375 * PL_mem_log has been made large enough to hold just the ones we need.
3376 * First the file descriptor. */
3377 if (isDIGIT(*ret)) {
3378 const char * s = ret;
3379 if (UNLIKELY(*s == '0')) {
3380
3381 /* Reduce multiple leading zeros to a single one. This is to
3382 * allow the caller to change what to do with leading zeros. */
3383 *mem_log_meat++ = '0';
3384 s++;
3385 while (*s == '0') {
3386 s++;
3387 }
3388 }
3389
3390 /* If the input overflows, copy just enough for the result to also
3391 * overflow, plus 1 to make sure */
3392 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3393 *mem_log_meat++ = *s++;
3394 }
3395 }
3396
3397 /* Then each of the three significant characters */
3398 if (strchr(ret, 'm')) {
3399 *mem_log_meat++ = 'm';
3400 }
3401 if (strchr(ret, 's')) {
3402 *mem_log_meat++ = 's';
3403 }
3404 if (strchr(ret, 't')) {
3405 *mem_log_meat++ = 't';
3406 }
3407 *mem_log_meat = '\0';
3408
3409 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3410 }
3411
3412 /* If we are being called from the logger, it only needs the significant
3413 * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3414 if (PL_mem_log[0] & 0x2) {
3415 assert(strEQ(str, "PERL_MEM_LOG"));
3416 GETENV_UNLOCK;
3417 return PL_mem_log + 1;
3418 }
3419
3420 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
3421 * is coming from other than the logging code, so it should be treated the
3422 * same as any other getenv(), returning the full value, not just the
3423 * significant part, and having its value saved. Set the flag that
3424 * indicates any call to this routine will be a recursion from here */
3425 PL_mem_log[0] = 0x1;
3426
3427#endif
3428
3429 /* Now get the value of the real desired variable, and save a copy */
24f3e849
KW
3430 ret = getenv(str);
3431
3432 if (ret != NULL) {
c80a8618 3433 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
24f3e849
KW
3434 }
3435
35bcf7ff
KW
3436 GETENV_UNLOCK;
3437
03694582
KW
3438#ifdef PERL_MEM_LOG
3439
3440 /* Clear the buffer */
3441 Zero(PL_mem_log, sizeof(PL_mem_log), char);
3442
3443#endif
3444
24f3e849
KW
3445 return ret;
3446}
3447
1d0d673f
PE
3448PERL_STATIC_INLINE bool
3449Perl_sv_isbool(pTHX_ const SV *sv)
3450{
3451 return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) &&
3452 (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No);
3453}
3454
9c913148
TC
3455#ifdef USE_ITHREADS
3456
3457PERL_STATIC_INLINE AV *
3458Perl_cop_file_avn(pTHX_ const COP *cop) {
3459
3460 PERL_ARGS_ASSERT_COP_FILE_AVN;
3461
3462 const char *file = CopFILE(cop);
3463 if (file) {
3464 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3465 if (gv) {
3466 return GvAVn(gv);
3467 }
3468 else
3469 return NULL;
3470 }
3471 else
3472 return NULL;
3473}
3474
3475#endif
3476
ed382232 3477/*
c8028aa6
TC
3478 * ex: set ts=8 sts=4 sw=4 et:
3479 */