This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Promote v5.36 usage and feature bundles doc
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
88dfbb19
KW
37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38 * whose details should be exposed to the compiler, for such things as tail
39 * call optimization.
40 *
25468daa
FC
41 * Each section names the header file that the functions "belong" to.
42 */
27669aa4 43
be3a7a5d
KW
44/* ------------------------------- av.h ------------------------------- */
45
87306e06 46/*
3f620621 47=for apidoc_section $AV
87306e06
KW
48=for apidoc av_count
49Returns the number of elements in the array C<av>. This is the true length of
50the array, including any undefined elements. It is always the same as
51S<C<av_top_index(av) + 1>>.
52
53=cut
54*/
55PERL_STATIC_INLINE Size_t
56Perl_av_count(pTHX_ AV *av)
be3a7a5d 57{
87306e06 58 PERL_ARGS_ASSERT_AV_COUNT;
be3a7a5d
KW
59 assert(SvTYPE(av) == SVt_PVAV);
60
87306e06 61 return AvFILL(av) + 1;
be3a7a5d
KW
62}
63
84c75204
RL
64/* ------------------------------- av.c ------------------------------- */
65
66/*
67=for apidoc av_store_simple
68
69This is a cut-down version of av_store that assumes that the array is
70very straightforward - no magic, not readonly, and AvREAL - and that
71C<key> is not negative. This function MUST NOT be used in situations
72where any of those assumptions may not hold.
73
74Stores an SV in an array. The array index is specified as C<key>. It
75can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
76
77Note that the caller is responsible for suitably incrementing the reference
78count of C<val> before the call.
79
80Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81
82=cut
83*/
84
85PERL_STATIC_INLINE SV**
86Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
87{
56077d95
LT
88 SV** ary;
89
84c75204
RL
90 PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91 assert(SvTYPE(av) == SVt_PVAV);
92 assert(!SvMAGICAL(av));
93 assert(!SvREADONLY(av));
94 assert(AvREAL(av));
95 assert(key > -1);
96
56077d95 97 ary = AvARRAY(av);
84c75204
RL
98
99 if (AvFILLp(av) < key) {
100 if (key > AvMAX(av)) {
101 av_extend(av,key);
102 ary = AvARRAY(av);
103 }
104 AvFILLp(av) = key;
105 } else
106 SvREFCNT_dec(ary[key]);
107
108 ary[key] = val;
109 return &ary[key];
110}
111
112/*
113=for apidoc av_fetch_simple
114
115This is a cut-down version of av_fetch that assumes that the array is
116very straightforward - no magic, not readonly, and AvREAL - and that
117C<key> is not negative. This function MUST NOT be used in situations
118where any of those assumptions may not hold.
119
120Returns the SV at the specified index in the array. The C<key> is the
121index. If lval is true, you are guaranteed to get a real SV back (in case
122it wasn't real before), which you can then modify. Check that the return
123value is non-null before dereferencing it to a C<SV*>.
124
125The rough perl equivalent is C<$myarray[$key]>.
126
127=cut
128*/
129
130PERL_STATIC_INLINE SV**
131Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
132{
133 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134 assert(SvTYPE(av) == SVt_PVAV);
135 assert(!SvMAGICAL(av));
136 assert(!SvREADONLY(av));
137 assert(AvREAL(av));
138 assert(key > -1);
139
140 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
8fcb2425 141 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
84c75204
RL
142 } else {
143 return &AvARRAY(av)[key];
144 }
145}
146
a9b64e60
RL
147/*
148=for apidoc av_push_simple
149
150This is a cut-down version of av_push that assumes that the array is very
151straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152not less than -1. This function MUST NOT be used in situations where any
153of those assumptions may not hold.
154
155Pushes an SV (transferring control of one reference count) onto the end of the
156array. The array will grow automatically to accommodate the addition.
157
158Perl equivalent: C<push @myarray, $val;>.
159
160=cut
161*/
162
163PERL_STATIC_INLINE void
164Perl_av_push_simple(pTHX_ AV *av, SV *val)
165{
166 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167 assert(SvTYPE(av) == SVt_PVAV);
168 assert(!SvMAGICAL(av));
169 assert(!SvREADONLY(av));
170 assert(AvREAL(av));
171 assert(AvFILLp(av) > -2);
172
173 (void)av_store_simple(av,AvFILLp(av)+1,val);
174}
175
eae3cc96
RL
176/*
177=for apidoc av_new_alloc
178
179This implements L<perlapi/C<newAV_alloc_x>>
180and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
181functionality.
182
183Creates a new AV and allocates its SV* array.
184
185This is similar to, but more efficient than doing:
186
187 AV *av = newAV();
188 av_extend(av, key);
189
190The size parameter is used to pre-allocate a SV* array large enough to
191hold at least elements C<0..(size-1)>. C<size> must be at least 1.
192
193The C<zeroflag> parameter controls whether or not the array is NULL
194initialized.
195
196=cut
197*/
198
199PERL_STATIC_INLINE AV *
200Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
201{
202 AV * const av = newAV();
203 SV** ary;
204 PERL_ARGS_ASSERT_AV_NEW_ALLOC;
205 assert(size > 0);
206
207 Newx(ary, size, SV*); /* Newx performs the memwrap check */
208 AvALLOC(av) = ary;
209 AvARRAY(av) = ary;
210 AvMAX(av) = size - 1;
211
212 if (zeroflag)
213 Zero(ary, size, SV*);
214
215 return av;
216}
217
218
1afe1db1
FC
219/* ------------------------------- cv.h ------------------------------- */
220
fa3e44c0 221/*
3f620621 222=for apidoc_section $CV
fa3e44c0
KW
223=for apidoc CvGV
224Returns the GV associated with the CV C<sv>, reifying it if necessary.
225
226=cut
227*/
ae77754a 228PERL_STATIC_INLINE GV *
c9182d9c 229Perl_CvGV(pTHX_ CV *sv)
ae77754a 230{
74804ad1
KW
231 PERL_ARGS_ASSERT_CVGV;
232
ae77754a 233 return CvNAMED(sv)
1604cfb0
MS
234 ? Perl_cvgv_from_hek(aTHX_ sv)
235 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
ae77754a
FC
236}
237
c8b3b0ee
KW
238/*
239=for apidoc CvDEPTH
240Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a
241recursive call.
242
243=cut
244*/
1afe1db1 245PERL_STATIC_INLINE I32 *
74804ad1 246Perl_CvDEPTH(const CV * const sv)
1afe1db1 247{
74804ad1 248 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 249 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 250
8de47657 251 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
252}
253
d16269d8
PM
254/*
255 CvPROTO returns the prototype as stored, which is not necessarily what
256 the interpreter should be using. Specifically, the interpreter assumes
257 that spaces have been stripped, which has been the case if the prototype
258 was added by toke.c, but is generally not the case if it was added elsewhere.
259 Since we can't enforce the spacelessness at assignment time, this routine
260 provides a temporary copy at parse time with spaces removed.
261 I<orig> is the start of the original buffer, I<len> is the length of the
262 prototype and will be updated when this returns.
263 */
264
5b67adb8 265#ifdef PERL_CORE
d16269d8
PM
266PERL_STATIC_INLINE char *
267S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
268{
269 SV * tmpsv;
270 char * tmps;
271 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
272 tmps = SvPVX(tmpsv);
273 while ((*len)--) {
1604cfb0
MS
274 if (!isSPACE(*orig))
275 *tmps++ = *orig;
276 orig++;
d16269d8
PM
277 }
278 *tmps = '\0';
279 *len = tmps - SvPVX(tmpsv);
1604cfb0 280 return SvPVX(tmpsv);
d16269d8 281}
5b67adb8 282#endif
d16269d8 283
25fdce4a
FC
284/* ------------------------------- mg.h ------------------------------- */
285
286#if defined(PERL_CORE) || defined(PERL_EXT)
287/* assumes get-magic and stringification have already occurred */
288PERL_STATIC_INLINE STRLEN
289S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
290{
291 assert(mg->mg_type == PERL_MAGIC_regex_global);
292 assert(mg->mg_len != -1);
293 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
1604cfb0 294 return (STRLEN)mg->mg_len;
25fdce4a 295 else {
1604cfb0
MS
296 const STRLEN pos = (STRLEN)mg->mg_len;
297 /* Without this check, we may read past the end of the buffer: */
298 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
299 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
25fdce4a
FC
300 }
301}
302#endif
303
03414f05
FC
304/* ------------------------------- pad.h ------------------------------ */
305
306#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
307PERL_STATIC_INLINE bool
b9d5702c 308S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 309{
b9d5702c
KW
310 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
311
03414f05
FC
312 /* is seq within the range _LOW to _HIGH ?
313 * This is complicated by the fact that PL_cop_seqmax
314 * may have wrapped around at some point */
315 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
1604cfb0 316 return FALSE; /* not yet introduced */
03414f05
FC
317
318 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
319 /* in compiling scope */
1604cfb0
MS
320 if (
321 (seq > COP_SEQ_RANGE_LOW(pn))
322 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
323 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
324 )
325 return TRUE;
03414f05
FC
326 }
327 else if (
1604cfb0
MS
328 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
329 ?
330 ( seq > COP_SEQ_RANGE_LOW(pn)
331 || seq <= COP_SEQ_RANGE_HIGH(pn))
03414f05 332
1604cfb0
MS
333 : ( seq > COP_SEQ_RANGE_LOW(pn)
334 && seq <= COP_SEQ_RANGE_HIGH(pn))
03414f05 335 )
1604cfb0 336 return TRUE;
03414f05
FC
337 return FALSE;
338}
339#endif
340
33a4312b
FC
341/* ------------------------------- pp.h ------------------------------- */
342
343PERL_STATIC_INLINE I32
c9182d9c 344Perl_TOPMARK(pTHX)
33a4312b
FC
345{
346 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
1604cfb0
MS
347 "MARK top %p %" IVdf "\n",
348 PL_markstack_ptr,
349 (IV)*PL_markstack_ptr)));
33a4312b
FC
350 return *PL_markstack_ptr;
351}
352
353PERL_STATIC_INLINE I32
c9182d9c 354Perl_POPMARK(pTHX)
33a4312b
FC
355{
356 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
1604cfb0
MS
357 "MARK pop %p %" IVdf "\n",
358 (PL_markstack_ptr-1),
359 (IV)*(PL_markstack_ptr-1))));
33a4312b
FC
360 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
361 return *PL_markstack_ptr--;
362}
363
8d919b0a
FC
364/* ----------------------------- regexp.h ----------------------------- */
365
9d0d3060
NC
366/* PVLVs need to act as a superset of all scalar types - they are basically
367 * PVMGs with a few extra fields.
368 * REGEXPs are first class scalars, but have many fields that can't be copied
369 * into a PVLV body.
370 *
371 * Hence we take a different approach - instead of a copy, PVLVs store a pointer
372 * back to the original body. To avoid increasing the size of PVLVs just for the
373 * rare case of REGEXP assignment, this pointer is stored in the memory usually
374 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
375 * read the pointer from the two possible locations. The macro SvLEN() wraps the
376 * access to the union's member xpvlenu_len, but there is no equivalent macro
377 * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
378 *
379 * See commit df6b4bd56551f2d3 for more details. */
380
8d919b0a 381PERL_STATIC_INLINE struct regexp *
c9182d9c 382Perl_ReANY(const REGEXP * const re)
8d919b0a 383{
df6b4bd5 384 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
385
386 PERL_ARGS_ASSERT_REANY;
8d919b0a 387 assert(isREGEXP(re));
bdef45de 388
df6b4bd5
DM
389 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
390 : (struct regexp *)p;
8d919b0a
FC
391}
392
a8a2ceaa
KW
393/* ------------------------------- utf8.h ------------------------------- */
394
2fe720e2 395/*
3f620621 396=for apidoc_section $unicode
2fe720e2
KW
397*/
398
55d09dc8 399PERL_STATIC_INLINE void
c9182d9c 400Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
401{
402 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
403 * encoded string at '*dest', updating '*dest' to include it */
404
55d09dc8
KW
405 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
406
6f2d5cbc 407 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 408 *((*dest)++) = byte;
55d09dc8 409 else {
a09ec51a
KW
410 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
411 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
412 }
413}
414
e123187a 415/*
2fe720e2 416=for apidoc valid_utf8_to_uvchr
09232555
KW
417Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
418known that the next character in the input UTF-8 string C<s> is well-formed
419(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
420points, and non-Unicode code points are allowed.
2fe720e2
KW
421
422=cut
423
424 */
425
426PERL_STATIC_INLINE UV
427Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
428{
c41b2540 429 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
430 const U8* send = s + expectlen;
431 UV uv = *s;
432
433 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
434
435 if (retlen) {
436 *retlen = expectlen;
437 }
438
439 /* An invariant is trivially returned */
440 if (expectlen == 1) {
1604cfb0 441 return uv;
2fe720e2
KW
442 }
443
444 /* Remove the leading bits that indicate the number of bytes, leaving just
445 * the bits that are part of the value */
446 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
447
448 /* Now, loop through the remaining bytes, accumulating each into the
449 * working total as we go. (I khw tried unrolling the loop for up to 4
450 * bytes, but there was no performance improvement) */
451 for (++s; s < send; s++) {
452 uv = UTF8_ACCUMULATE(uv, *s);
453 }
454
455 return UNI_TO_NATIVE(uv);
456
457}
458
1e599354
KW
459/*
460=for apidoc is_utf8_invariant_string
461
82c5d941 462Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 463regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
464EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
465are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
466the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
467characters are invariant, but so also are the C1 controls.
1e599354
KW
468
469If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
470use this option, that C<s> can't have embedded C<NUL> characters and has to
471have a terminating C<NUL> byte).
472
9f2abfde
KW
473See also
474C<L</is_utf8_string>>,
475C<L</is_utf8_string_flags>>,
476C<L</is_utf8_string_loc>>,
477C<L</is_utf8_string_loc_flags>>,
478C<L</is_utf8_string_loclen>>,
479C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
480C<L</is_utf8_fixed_width_buf_flags>>,
481C<L</is_utf8_fixed_width_buf_loc_flags>>,
482C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
483C<L</is_strict_utf8_string>>,
484C<L</is_strict_utf8_string_loc>>,
485C<L</is_strict_utf8_string_loclen>>,
486C<L</is_c9strict_utf8_string>>,
487C<L</is_c9strict_utf8_string_loc>>,
488and
489C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
490
491=cut
0cbf5865
KW
492
493*/
494
495#define is_utf8_invariant_string(s, len) \
496 is_utf8_invariant_string_loc(s, len, NULL)
497
498/*
499=for apidoc is_utf8_invariant_string_loc
500
501Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
502the first UTF-8 variant character in the C<ep> pointer; if all characters are
503UTF-8 invariant, this function does not change the contents of C<*ep>.
504
505=cut
506
1e599354
KW
507*/
508
509PERL_STATIC_INLINE bool
c9182d9c 510Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 511{
e17544a6 512 const U8* send;
1e599354
KW
513 const U8* x = s;
514
0cbf5865
KW
515 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
516
e17544a6
KW
517 if (len == 0) {
518 len = strlen((const char *)s);
519 }
520
521 send = s + len;
522
4ab2fd9b 523/* This looks like 0x010101... */
2c5c8af5 524# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
525
526/* This looks like 0x808080... */
2c5c8af5 527# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
9f4248c9 528# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 529# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 530
099e59a4
KW
531/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
532 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
533 * optimized out completely on a 32-bit system, and its mask gets optimized out
534 * on a 64-bit system */
2c5c8af5 535# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
536 | ( PTR2nat(x) >> 1) \
537 | ( ( (PTR2nat(x) \
538 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 539
3f515a2e
KW
540#ifndef EBCDIC
541
099e59a4
KW
542 /* Do the word-at-a-time iff there is at least one usable full word. That
543 * means that after advancing to a word boundary, there still is at least a
544 * full word left. The number of bytes needed to advance is 'wordsize -
545 * offset' unless offset is 0. */
546 if ((STRLEN) (send - x) >= PERL_WORDSIZE
547
548 /* This term is wordsize if subword; 0 if not */
549 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
550
551 /* 'offset' */
552 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
553 {
b40579ff 554
46bb68f6
KW
555 /* Process per-byte until reach word boundary. XXX This loop could be
556 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 557 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
558 if (! UTF8_IS_INVARIANT(*x)) {
559 if (ep) {
560 *ep = x;
561 }
e17544a6 562
46bb68f6
KW
563 return FALSE;
564 }
565 x++;
e17544a6 566 }
e17544a6 567
099e59a4
KW
568 /* Here, we know we have at least one full word to process. Process
569 * per-word as long as we have at least a full word left */
570 do {
4ab2fd9b 571 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 572
46bb68f6
KW
573 /* Found a variant. Just return if caller doesn't want its
574 * exact position */
575 if (! ep) {
576 return FALSE;
577 }
e17544a6 578
2c5c8af5
KW
579# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
580 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 581
73f0a2eb 582 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
583 assert(*ep >= s && *ep < send);
584
585 return FALSE;
586
2c5c8af5 587# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
588 checks. */
589
46bb68f6 590 break;
2c5c8af5 591# endif
46bb68f6 592 }
1d2af574 593
46bb68f6 594 x += PERL_WORDSIZE;
1d2af574 595
099e59a4 596 } while (x + PERL_WORDSIZE <= send);
b40579ff 597 }
e17544a6 598
0b08cab0 599#endif /* End of ! EBCDIC */
e17544a6
KW
600
601 /* Process per-byte */
602 while (x < send) {
1604cfb0 603 if (! UTF8_IS_INVARIANT(*x)) {
e17544a6
KW
604 if (ep) {
605 *ep = x;
606 }
0cbf5865 607
e17544a6 608 return FALSE;
0cbf5865 609 }
1e599354 610
e17544a6 611 x++;
1e599354
KW
612 }
613
614 return TRUE;
615}
bf874180 616
fc1bb663
KW
617/* See if the platform has builtins for finding the most/least significant bit,
618 * and which one is right for using on 32 and 64 bit operands */
619#if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
620# if U32SIZE == INTSIZE
621# define PERL_CLZ_32 __builtin_clz
622# endif
623# if defined(U64TYPE) && U64SIZE == INTSIZE
624# define PERL_CLZ_64 __builtin_clz
625# endif
626#endif
627#if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
628# if U32SIZE == INTSIZE
629# define PERL_CTZ_32 __builtin_ctz
630# endif
631# if defined(U64TYPE) && U64SIZE == INTSIZE
632# define PERL_CTZ_64 __builtin_ctz
633# endif
634#endif
635
636#if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
637# if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
638# define PERL_CLZ_32 __builtin_clzl
639# endif
640# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
641# define PERL_CLZ_64 __builtin_clzl
642# endif
643#endif
644#if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
645# if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
646# define PERL_CTZ_32 __builtin_ctzl
647# endif
648# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
649# define PERL_CTZ_64 __builtin_ctzl
650# endif
651#endif
652
653#if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
654# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
655# define PERL_CLZ_32 __builtin_clzll
656# endif
657# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
658# define PERL_CLZ_64 __builtin_clzll
659# endif
660#endif
661#if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
662# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
663# define PERL_CTZ_32 __builtin_ctzll
664# endif
665# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
666# define PERL_CTZ_64 __builtin_ctzll
667# endif
668#endif
669
e1f6bdff 670#if defined(_MSC_VER)
e88dde50
KW
671# include <intrin.h>
672# pragma intrinsic(_BitScanForward)
673# pragma intrinsic(_BitScanReverse)
674# ifdef _WIN64
675# pragma intrinsic(_BitScanForward64)
676# pragma intrinsic(_BitScanReverse64)
677# endif
678#endif
679
250e5324
KW
680/* The reason there are not checks to see if ffs() and ffsl() are available for
681 * determining the lsb, is because these don't improve on the deBruijn method
682 * fallback, which is just a branchless integer multiply, array element
683 * retrieval, and shift. The others, even if the function call overhead is
684 * optimized out, have to cope with the possibility of the input being all
685 * zeroes, and almost certainly will have conditionals for this eventuality.
686 * khw, at the time of this commit, looked at the source for both gcc and clang
687 * to verify this. (gcc used a method inferior to deBruijn.) */
688
330cd0ce 689/* Below are functions to find the first, last, or only set bit in a word. On
19d2c525
KW
690 * platforms with 64-bit capability, there is a pair for each operation; the
691 * first taking a 64 bit operand, and the second a 32 bit one. The logic is
692 * the same in each pair, so the second is stripped of most comments. */
693
694#ifdef U64TYPE /* HAS_QUAD not usable outside the core */
695
696PERL_STATIC_INLINE unsigned
697Perl_lsbit_pos64(U64 word)
698{
699 /* Find the position (0..63) of the least significant set bit in the input
700 * word */
701
702 ASSUME(word != 0);
703
fc1bb663
KW
704 /* If we can determine that the platform has a usable fast method to get
705 * this info, use that */
706
707# if defined(PERL_CTZ_64)
2e0bc9ce 708# define PERL_HAS_FAST_GET_LSB_POS64
fc1bb663
KW
709
710 return (unsigned) PERL_CTZ_64(word);
711
a333292f 712# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
2e0bc9ce 713# define PERL_HAS_FAST_GET_LSB_POS64
e88dde50
KW
714
715 {
716 unsigned long index;
717 _BitScanForward64(&index, word);
718 return (unsigned)index;
719 }
720
fc1bb663
KW
721# else
722
723 /* Here, we didn't find a fast method for finding the lsb. Fall back to
724 * making the lsb the only set bit in the word, and use our function that
725 * works on words with a single bit set.
726 *
727 * Isolate the lsb;
19d2c525
KW
728 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
729 *
730 * The word will look like this, with a rightmost set bit in position 's':
731 * ('x's are don't cares, and 'y's are their complements)
732 * s
733 * x..x100..00
734 * y..y011..11 Complement
735 * y..y100..00 Add 1
736 * 0..0100..00 And with the original
737 *
738 * (Yes, complementing and adding 1 is just taking the negative on 2's
739 * complement machines, but not on 1's complement ones, and some compilers
740 * complain about negating an unsigned.)
741 */
742 return single_1bit_pos64(word & (~word + 1));
fc1bb663
KW
743
744# endif
745
19d2c525
KW
746}
747
748# define lsbit_pos_uintmax_(word) lsbit_pos64(word)
749#else /* ! QUAD */
750# define lsbit_pos_uintmax_(word) lsbit_pos32(word)
751#endif
752
753PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */
754Perl_lsbit_pos32(U32 word)
755{
756 /* Find the position (0..31) of the least significant set bit in the input
757 * word */
758
759 ASSUME(word != 0);
760
fc1bb663 761#if defined(PERL_CTZ_32)
2e0bc9ce 762# define PERL_HAS_FAST_GET_LSB_POS32
fc1bb663
KW
763
764 return (unsigned) PERL_CTZ_32(word);
765
e1f6bdff 766#elif U32SIZE == 4 && defined(_MSC_VER)
2e0bc9ce 767# define PERL_HAS_FAST_GET_LSB_POS32
e88dde50
KW
768
769 {
770 unsigned long index;
771 _BitScanForward(&index, word);
772 return (unsigned)index;
773 }
774
fc1bb663
KW
775#else
776
19d2c525 777 return single_1bit_pos32(word & (~word + 1));
fc1bb663
KW
778
779#endif
780
19d2c525
KW
781}
782
4a1b7bb2 783
fc1bb663 784/* Convert the leading zeros count to the bit position of the first set bit.
4a1b7bb2
KW
785 * This just subtracts from the highest position, 31 or 63. But some compilers
786 * don't optimize this optimally, and so a bit of bit twiddling encourages them
787 * to do the right thing. It turns out that subtracting a smaller non-negative
788 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
789 * the two numbers. To see why, first note that the sum of any number, x, and
790 * its complement, x', is all ones. So all ones minus x is x'. Then note that
791 * the xor of x and all ones is x'. */
792#define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc))
fc1bb663 793
995a4954
KW
794#ifdef U64TYPE /* HAS_QUAD not usable outside the core */
795
796PERL_STATIC_INLINE unsigned
330cd0ce
KW
797Perl_msbit_pos64(U64 word)
798{
799 /* Find the position (0..63) of the most significant set bit in the input
800 * word */
801
802 ASSUME(word != 0);
803
fc1bb663
KW
804 /* If we can determine that the platform has a usable fast method to get
805 * this, use that */
806
807# if defined(PERL_CLZ_64)
2e0bc9ce 808# define PERL_HAS_FAST_GET_MSB_POS64
fc1bb663
KW
809
810 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
811
e1f6bdff 812# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
2e0bc9ce 813# define PERL_HAS_FAST_GET_MSB_POS64
e88dde50
KW
814
815 {
816 unsigned long index;
817 _BitScanReverse64(&index, word);
818 return (unsigned)index;
819 }
820
fc1bb663
KW
821# else
822
823 /* Here, we didn't find a fast method for finding the msb. Fall back to
824 * making the msb the only set bit in the word, and use our function that
825 * works on words with a single bit set.
826 *
827 * Isolate the msb; http://codeforces.com/blog/entry/10330
330cd0ce
KW
828 *
829 * Only the most significant set bit matters. Or'ing word with its right
830 * shift of 1 makes that bit and the next one to its right both 1.
831 * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
832 * ... We end with the msb and all to the right being 1. */
833 word |= (word >> 1);
834 word |= (word >> 2);
835 word |= (word >> 4);
836 word |= (word >> 8);
837 word |= (word >> 16);
838 word |= (word >> 32);
839
840 /* Then subtracting the right shift by 1 clears all but the left-most of
841 * the 1 bits, which is our desired result */
842 word -= (word >> 1);
843
844 /* Now we have a single bit set */
845 return single_1bit_pos64(word);
fc1bb663
KW
846
847# endif
848
330cd0ce
KW
849}
850
851# define msbit_pos_uintmax_(word) msbit_pos64(word)
852#else /* ! QUAD */
853# define msbit_pos_uintmax_(word) msbit_pos32(word)
854#endif
855
856PERL_STATIC_INLINE unsigned
857Perl_msbit_pos32(U32 word)
858{
859 /* Find the position (0..31) of the most significant set bit in the input
860 * word */
861
862 ASSUME(word != 0);
863
fc1bb663 864#if defined(PERL_CLZ_32)
2e0bc9ce 865# define PERL_HAS_FAST_GET_MSB_POS32
fc1bb663
KW
866
867 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
868
e1f6bdff 869#elif U32SIZE == 4 && defined(_MSC_VER)
2e0bc9ce 870# define PERL_HAS_FAST_GET_MSB_POS32
e88dde50
KW
871
872 {
873 unsigned long index;
874 _BitScanReverse(&index, word);
875 return (unsigned)index;
876 }
877
fc1bb663
KW
878#else
879
330cd0ce
KW
880 word |= (word >> 1);
881 word |= (word >> 2);
882 word |= (word >> 4);
883 word |= (word >> 8);
884 word |= (word >> 16);
885 word -= (word >> 1);
886 return single_1bit_pos32(word);
fc1bb663
KW
887
888#endif
889
330cd0ce
KW
890}
891
787e8384
KW
892#if UVSIZE == U64SIZE
893# define msbit_pos(word) msbit_pos64(word)
894# define lsbit_pos(word) lsbit_pos64(word)
895#elif UVSIZE == U32SIZE
896# define msbit_pos(word) msbit_pos32(word)
897# define lsbit_pos(word) lsbit_pos32(word)
898#endif
899
330cd0ce
KW
900#ifdef U64TYPE /* HAS_QUAD not usable outside the core */
901
902PERL_STATIC_INLINE unsigned
995a4954
KW
903Perl_single_1bit_pos64(U64 word)
904{
905 /* Given a 64-bit word known to contain all zero bits except one 1 bit,
906 * find and return the 1's position: 0..63 */
907
908# ifdef PERL_CORE /* macro not exported */
909 ASSUME(isPOWER_OF_2(word));
910# else
911 ASSUME(word && (word & (word-1)) == 0);
912# endif
913
2e0bc9ce
KW
914 /* The only set bit is both the most and least significant bit. If we have
915 * a fast way of finding either one, use that.
916 *
917 * It may appear at first glance that those functions call this one, but
918 * they don't if the corresponding #define is set */
919
920# ifdef PERL_HAS_FAST_GET_MSB_POS64
921
922 return msbit_pos64(word);
923
924# elif defined(PERL_HAS_FAST_GET_LSB_POS64)
925
926 return lsbit_pos64(word);
927
928# else
929
995a4954
KW
930 /* The position of the only set bit in a word can be quickly calculated
931 * using deBruijn sequences. See for example
932 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
933 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
934 >> PERL_deBruijnShift64_];
2e0bc9ce
KW
935# endif
936
995a4954
KW
937}
938
939#endif
940
bf874180
KW
941PERL_STATIC_INLINE unsigned
942Perl_single_1bit_pos32(U32 word)
943{
944 /* Given a 32-bit word known to contain all zero bits except one 1 bit,
945 * find and return the 1's position: 0..31 */
946
947#ifdef PERL_CORE /* macro not exported */
948 ASSUME(isPOWER_OF_2(word));
949#else
950 ASSUME(word && (word & (word-1)) == 0);
951#endif
2e0bc9ce
KW
952#ifdef PERL_HAS_FAST_GET_MSB_POS32
953
954 return msbit_pos32(word);
955
956#elif defined(PERL_HAS_FAST_GET_LSB_POS32)
957
958 return lsbit_pos32(word);
959
960/* Unlikely, but possible for the platform to have a wider fast operation but
961 * not a narrower one. But easy enough to handle the case by widening the
962 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops
963 * would be slower than the deBruijn method.) */
964#elif defined(PERL_HAS_FAST_GET_MSB_POS64)
965
966 return msbit_pos64(word);
967
968#elif defined(PERL_HAS_FAST_GET_LSB_POS64)
969
970 return lsbit_pos64(word);
971
972#else
bf874180 973
bf874180
KW
974 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
975 >> PERL_deBruijnShift32_];
2e0bc9ce
KW
976#endif
977
bf874180 978}
1e599354 979
23a7ee81
KW
980#ifndef EBCDIC
981
1d2af574 982PERL_STATIC_INLINE unsigned int
73f0a2eb 983Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574 984{
1d2af574
KW
985 /* This returns the position in a word (0..7) of the first variant byte in
986 * it. This is a helper function. Note that there are no branches */
987
1d2af574
KW
988 /* Get just the msb bits of each byte */
989 word &= PERL_VARIANTS_WORD_MASK;
990
58ddb8c5
KW
991 /* This should only be called if we know there is a variant byte in the
992 * word */
993 assert(word);
994
7adf2470 995# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
996
997 /* Bytes are stored like
998 * Byte8 ... Byte2 Byte1
999 * 63..56...15...8 7...0
19d2c525
KW
1000 * so getting the lsb of the whole modified word is getting the msb of the
1001 * first byte that has its msb set */
1002 word = lsbit_pos_uintmax_(word);
1003
1004 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert
1005 * to 0..7 */
1006 return (unsigned int) ((word + 1) >> 3) - 1;
1d2af574
KW
1007
1008# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1009
1010 /* Bytes are stored like
1011 * Byte1 Byte2 ... Byte8
1012 * 63..56 55..47 ... 7...0
330cd0ce
KW
1013 * so getting the msb of the whole modified word is getting the msb of the
1014 * first byte that has its msb set */
1015 word = msbit_pos_uintmax_(word);
1d2af574 1016
330cd0ce
KW
1017 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert
1018 * to 0..7 */
1d2af574
KW
1019 word = ((word + 1) >> 3) - 1;
1020
330cd0ce
KW
1021 /* And invert the result because of the reversed byte order on this
1022 * platform */
1d2af574
KW
1023 word = CHARBITS - word - 1;
1024
330cd0ce
KW
1025 return (unsigned int) word;
1026
1027# else
1028# error Unexpected byte order
1d2af574
KW
1029# endif
1030
1d2af574
KW
1031}
1032
23a7ee81 1033#endif
03c1e4ab
KW
1034#if defined(PERL_CORE) || defined(PERL_EXT)
1035
1036/*
1037=for apidoc variant_under_utf8_count
1038
1039This function looks at the sequence of bytes between C<s> and C<e>, which are
1040assumed to be encoded in ASCII/Latin1, and returns how many of them would
1041change should the string be translated into UTF-8. Due to the nature of UTF-8,
1042each of these would occupy two bytes instead of the single one in the input
1043string. Thus, this function returns the precise number of bytes the string
1044would expand by when translated to UTF-8.
1045
1046Unlike most of the other functions that have C<utf8> in their name, the input
1047to this function is NOT a UTF-8-encoded string. The function name is slightly
1048I<odd> to emphasize this.
1049
1050This function is internal to Perl because khw thinks that any XS code that
1051would want this is probably operating too close to the internals. Presenting a
1052valid use case could change that.
1053
1054See also
1055C<L<perlapi/is_utf8_invariant_string>>
1056and
1057C<L<perlapi/is_utf8_invariant_string_loc>>,
1058
1059=cut
1060
1061*/
1062
1063PERL_STATIC_INLINE Size_t
1064S_variant_under_utf8_count(const U8* const s, const U8* const e)
1065{
1066 const U8* x = s;
1067 Size_t count = 0;
1068
1069 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1070
1071# ifndef EBCDIC
1072
5d0379de
KW
1073 /* Test if the string is long enough to use word-at-a-time. (Logic is the
1074 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
1075 if ((STRLEN) (e - x) >= PERL_WORDSIZE
1076 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1077 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1078 {
1079
1080 /* Process per-byte until reach word boundary. XXX This loop could be
1081 * eliminated if we knew that this platform had fast unaligned reads */
1082 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1083 count += ! UTF8_IS_INVARIANT(*x++);
1084 }
1085
1086 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
1087 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1088 explanation of how this works */
e5863284
KW
1089 PERL_UINTMAX_T increment
1090 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
1091 * PERL_COUNT_MULTIPLIER)
1092 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 1093 count += (Size_t) increment;
03c1e4ab
KW
1094 x += PERL_WORDSIZE;
1095 } while (x + PERL_WORDSIZE <= e);
1096 }
1097
1098# endif
1099
1100 /* Process per-byte */
1101 while (x < e) {
1604cfb0 1102 if (! UTF8_IS_INVARIANT(*x)) {
03c1e4ab
KW
1103 count++;
1104 }
1105
1106 x++;
1107 }
1108
1109 return count;
1110}
1111
1112#endif
1113
9b6b0f24
KW
1114 /* Keep these around for these files */
1115#if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
9f4248c9
KW
1116# undef PERL_WORDSIZE
1117# undef PERL_COUNT_MULTIPLIER
1118# undef PERL_WORD_BOUNDARY_MASK
1119# undef PERL_VARIANTS_WORD_MASK
1120#endif
1121
7c93d8f0 1122/*
5ff889fb
KW
1123=for apidoc is_utf8_string
1124
82c5d941
KW
1125Returns TRUE if the first C<len> bytes of string C<s> form a valid
1126Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
1127be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1128can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1129byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1130
2717076a
KW
1131This function considers Perl's extended UTF-8 to be valid. That means that
1132code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
1133considered valid by this function. Use C<L</is_strict_utf8_string>>,
1134C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1135code points are considered valid.
5ff889fb 1136
9f2abfde
KW
1137See also
1138C<L</is_utf8_invariant_string>>,
0cbf5865 1139C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1140C<L</is_utf8_string_loc>>,
1141C<L</is_utf8_string_loclen>>,
8bc127bf
KW
1142C<L</is_utf8_fixed_width_buf_flags>>,
1143C<L</is_utf8_fixed_width_buf_loc_flags>>,
1144C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
1145
1146=cut
1147*/
1148
dd237e82 1149#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 1150
c9cd936b
KW
1151#if defined(PERL_CORE) || defined (PERL_EXT)
1152
1153/*
1154=for apidoc is_utf8_non_invariant_string
1155
1156Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1157C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1158UTF-8; otherwise returns FALSE.
1159
1160A TRUE return means that at least one code point represented by the sequence
1161either is a wide character not representable as a single byte, or the
1162representation differs depending on whether the sequence is encoded in UTF-8 or
1163not.
1164
1165See also
1166C<L<perlapi/is_utf8_invariant_string>>,
1167C<L<perlapi/is_utf8_string>>
1168
1169=cut
1170
1171This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
1172It generally needn't be if its string is entirely UTF-8 invariant, and it
1173shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
1174
1175It is an internal function because khw thinks that XS code shouldn't be working
1176at this low a level. A valid use case could change that.
1177
1178*/
1179
1180PERL_STATIC_INLINE bool
86a87e17 1181Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
1182{
1183 const U8 * first_variant;
1184
1185 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1186
1187 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1188 return FALSE;
1189 }
1190
1191 return is_utf8_string(first_variant, len - (first_variant - s));
1192}
1193
1194#endif
1195
5ff889fb 1196/*
9f2abfde
KW
1197=for apidoc is_strict_utf8_string
1198
1199Returns TRUE if the first C<len> bytes of string C<s> form a valid
1200UTF-8-encoded string that is fully interchangeable by any application using
1201Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
1202calculated using C<strlen(s)> (which means if you use this option, that C<s>
1203can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1204byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1205
1206This function returns FALSE for strings containing any
1207code points above the Unicode max of 0x10FFFF, surrogate code points, or
1208non-character code points.
1209
1210See also
1211C<L</is_utf8_invariant_string>>,
0cbf5865 1212C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1213C<L</is_utf8_string>>,
1214C<L</is_utf8_string_flags>>,
1215C<L</is_utf8_string_loc>>,
1216C<L</is_utf8_string_loc_flags>>,
1217C<L</is_utf8_string_loclen>>,
1218C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1219C<L</is_utf8_fixed_width_buf_flags>>,
1220C<L</is_utf8_fixed_width_buf_loc_flags>>,
1221C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1222C<L</is_strict_utf8_string_loc>>,
1223C<L</is_strict_utf8_string_loclen>>,
1224C<L</is_c9strict_utf8_string>>,
1225C<L</is_c9strict_utf8_string_loc>>,
1226and
1227C<L</is_c9strict_utf8_string_loclen>>.
1228
1229=cut
1230*/
1231
dd237e82 1232#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
1233
1234/*
1235=for apidoc is_c9strict_utf8_string
1236
1237Returns TRUE if the first C<len> bytes of string C<s> form a valid
1238UTF-8-encoded string that conforms to
1239L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1240otherwise it returns FALSE. If C<len> is 0, it will be calculated using
1241C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1242C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
1243characters being ASCII constitute 'a valid UTF-8 string'.
1244
1245This function returns FALSE for strings containing any code points above the
1246Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1247code points per
1248L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1249
1250See also
1251C<L</is_utf8_invariant_string>>,
0cbf5865 1252C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1253C<L</is_utf8_string>>,
1254C<L</is_utf8_string_flags>>,
1255C<L</is_utf8_string_loc>>,
1256C<L</is_utf8_string_loc_flags>>,
1257C<L</is_utf8_string_loclen>>,
1258C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1259C<L</is_utf8_fixed_width_buf_flags>>,
1260C<L</is_utf8_fixed_width_buf_loc_flags>>,
1261C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1262C<L</is_strict_utf8_string>>,
1263C<L</is_strict_utf8_string_loc>>,
1264C<L</is_strict_utf8_string_loclen>>,
1265C<L</is_c9strict_utf8_string_loc>>,
1266and
1267C<L</is_c9strict_utf8_string_loclen>>.
1268
1269=cut
1270*/
1271
dd237e82 1272#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
1273
1274/*
1275=for apidoc is_utf8_string_flags
1276
1277Returns TRUE if the first C<len> bytes of string C<s> form a valid
1278UTF-8 string, subject to the restrictions imposed by C<flags>;
1279returns FALSE otherwise. If C<len> is 0, it will be calculated
1280using C<strlen(s)> (which means if you use this option, that C<s> can't have
1281embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
1282that all characters being ASCII constitute 'a valid UTF-8 string'.
1283
1284If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1285C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1286as C<L</is_strict_utf8_string>>; and if C<flags> is
1287C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1288C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
1289combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1290C<L</utf8n_to_uvchr>>, with the same meanings.
1291
1292See also
1293C<L</is_utf8_invariant_string>>,
0cbf5865 1294C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1295C<L</is_utf8_string>>,
1296C<L</is_utf8_string_loc>>,
1297C<L</is_utf8_string_loc_flags>>,
1298C<L</is_utf8_string_loclen>>,
1299C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1300C<L</is_utf8_fixed_width_buf_flags>>,
1301C<L</is_utf8_fixed_width_buf_loc_flags>>,
1302C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1303C<L</is_strict_utf8_string>>,
1304C<L</is_strict_utf8_string_loc>>,
1305C<L</is_strict_utf8_string_loclen>>,
1306C<L</is_c9strict_utf8_string>>,
1307C<L</is_c9strict_utf8_string_loc>>,
1308and
1309C<L</is_c9strict_utf8_string_loclen>>.
1310
1311=cut
1312*/
1313
1314PERL_STATIC_INLINE bool
c9182d9c 1315Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 1316{
33756530 1317 const U8 * first_variant;
9f2abfde
KW
1318
1319 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1320 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1321 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1322
f60f61fd
KW
1323 if (len == 0) {
1324 len = strlen((const char *)s);
1325 }
1326
9f2abfde
KW
1327 if (flags == 0) {
1328 return is_utf8_string(s, len);
1329 }
1330
d044b7a7 1331 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1332 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1333 {
1334 return is_strict_utf8_string(s, len);
1335 }
1336
d044b7a7 1337 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1338 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1339 {
1340 return is_c9strict_utf8_string(s, len);
1341 }
1342
33756530
KW
1343 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1344 const U8* const send = s + len;
1345 const U8* x = first_variant;
1346
a0d7f935
KW
1347 while (x < send) {
1348 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1349 if (UNLIKELY(! cur_len)) {
1350 return FALSE;
1351 }
1352 x += cur_len;
9f2abfde 1353 }
33756530 1354 }
9f2abfde
KW
1355
1356 return TRUE;
1357}
1358
1359/*
5ff889fb
KW
1360
1361=for apidoc is_utf8_string_loc
1362
2717076a 1363Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 1364case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 1365"utf8ness success") in the C<ep> pointer.
5ff889fb 1366
2717076a 1367See also C<L</is_utf8_string_loclen>>.
5ff889fb 1368
3964c812
KW
1369=cut
1370*/
1371
1372#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
1373
1374/*
1375
5ff889fb
KW
1376=for apidoc is_utf8_string_loclen
1377
2717076a 1378Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 1379case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 1380"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 1381encoded characters in the C<el> pointer.
5ff889fb 1382
2717076a 1383See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
1384
1385=cut
1386*/
1387
56e4cf64 1388PERL_STATIC_INLINE bool
33756530 1389Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 1390{
33756530 1391 const U8 * first_variant;
5ff889fb
KW
1392
1393 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1394
33756530
KW
1395 if (len == 0) {
1396 len = strlen((const char *) s);
1397 }
1398
1399 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1400 if (el)
1401 *el = len;
1402
1403 if (ep) {
1404 *ep = s + len;
1405 }
1406
1407 return TRUE;
1408 }
1409
1410 {
1411 const U8* const send = s + len;
1412 const U8* x = first_variant;
1413 STRLEN outlen = first_variant - s;
1414
a0d7f935
KW
1415 while (x < send) {
1416 const STRLEN cur_len = isUTF8_CHAR(x, send);
1417 if (UNLIKELY(! cur_len)) {
1418 break;
1419 }
1420 x += cur_len;
1421 outlen++;
5ff889fb 1422 }
5ff889fb 1423
a0d7f935
KW
1424 if (el)
1425 *el = outlen;
5ff889fb 1426
a0d7f935
KW
1427 if (ep) {
1428 *ep = x;
1429 }
5ff889fb 1430
a0d7f935 1431 return (x == send);
33756530 1432 }
5ff889fb
KW
1433}
1434
213dc9d1
KW
1435/* The perl core arranges to never call the DFA below without there being at
1436 * least one byte available to look at. This allows the DFA to use a do {}
1437 * while loop which means that calling it with a UTF-8 invariant has a single
1438 * conditional, same as the calling code checking for invariance ahead of time.
1439 * And having the calling code remove that conditional speeds up by that
1440 * conditional, the case where it wasn't invariant. So there's no reason to
1441 * check before caling this.
1442 *
1443 * But we don't know this for non-core calls, so have to retain the check for
1444 * them. */
1445#ifdef PERL_CORE
1446# define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s))
1447#else
1448# define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE
1449#endif
1450
5ff889fb 1451/*
50f7a4ce
KW
1452 * DFA for checking input is valid UTF-8 syntax.
1453 *
1454 * This uses adaptations of the table and algorithm given in
1455 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1456 * documentation of the original version. A copyright notice for the original
1457 * version is given at the beginning of this file. The Perl adapations are
1458 * documented at the definition of PL_extended_utf8_dfa_tab[].
1459 *
1460 * This dfa is fast. There are three exit conditions:
1461 * 1) a well-formed code point, acceptable to the table
1462 * 2) the beginning bytes of an incomplete character, whose completion might
1463 * or might not be acceptable
1464 * 3) unacceptable to the table. Some of the adaptations have certain,
1465 * hopefully less likely to occur, legal inputs be unacceptable to the
1466 * table, so these must be sorted out afterwards.
1467 *
1468 * This macro is a complete implementation of the code executing the DFA. It
1469 * is passed the input sequence bounds and the table to use, and what to do
1470 * for each of the exit conditions. There are three canned actions, likely to
1471 * be the ones you want:
1472 * DFA_RETURN_SUCCESS_
1473 * DFA_RETURN_FAILURE_
1474 * DFA_GOTO_TEASE_APART_FF_
1475 *
1476 * You pass a parameter giving the action to take for each of the three
1477 * possible exit conditions:
1478 *
1479 * 'accept_action' This is executed when the DFA accepts the input.
1480 * DFA_RETURN_SUCCESS_ is the most likely candidate.
1481 * 'reject_action' This is executed when the DFA rejects the input.
1482 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1483 * you have written code to distinguish the rejecting state
1484 * results. Because it happens in several places, and
1485 * involves #ifdefs, the special action
1486 * DFA_GOTO_TEASE_APART_FF_ is what you want with
1487 * PL_extended_utf8_dfa_tab. On platforms without
1488 * EXTRA_LONG_UTF8, there is no need to tease anything apart,
1489 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1490 * need to have a label 'tease_apart_FF' that it will transfer
1491 * to.
1492 * 'incomplete_char_action' This is executed when the DFA ran off the end
1493 * before accepting or rejecting the input.
1494 * DFA_RETURN_FAILURE_ is the likely action, but you could
1495 * have a 'goto', or NOOP. In the latter case the DFA drops
1496 * off the end, and you place your code to handle this case
1497 * immediately after it.
1498 */
1499
1500#define DFA_RETURN_SUCCESS_ return s - s0
1501#define DFA_RETURN_FAILURE_ return 0
1502#ifdef HAS_EXTRA_LONG_UTF8
1503# define DFA_TEASE_APART_FF_ goto tease_apart_FF
1504#else
1505# define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_
1506#endif
1507
1508#define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \
1509 accept_action, \
1510 reject_action, \
1511 incomplete_char_action) \
1512 STMT_START { \
1513 const U8 * s = s0; \
5da55c47 1514 const U8 * e_ = e; \
50f7a4ce
KW
1515 UV state = 0; \
1516 \
5da55c47 1517 PERL_NON_CORE_CHECK_EMPTY(s, e_); \
213dc9d1
KW
1518 \
1519 do { \
50f7a4ce
KW
1520 state = dfa_tab[256 + state + dfa_tab[*s]]; \
1521 s++; \
1522 \
1523 if (state == 0) { /* Accepting state */ \
1524 accept_action; \
1525 } \
1526 \
1527 if (UNLIKELY(state == 1)) { /* Rejecting state */ \
1528 reject_action; \
1529 } \
5da55c47 1530 } while (s < e_); \
50f7a4ce
KW
1531 \
1532 /* Here, dropped out of loop before end-of-char */ \
1533 incomplete_char_action; \
1534 } STMT_END
1535
1536
1537/*
9f2abfde 1538
44170c9a 1539=for apidoc isUTF8_CHAR
8ed185f9
KW
1540
1541Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1542looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1543that represents some code point; otherwise it evaluates to 0. If non-zero, the
1544value gives how many bytes starting at C<s> comprise the code point's
1545representation. Any bytes remaining before C<e>, but beyond the ones needed to
1546form the first code point in C<s>, are not examined.
1547
13aab5dd 1548The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1549extension to official UTF-8 to represent those higher than the Unicode maximum
1550of 0x10FFFF. That means that this macro is used to efficiently decide if the
1551next few bytes in C<s> is legal UTF-8 for a single character.
1552
1553Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1554defined by Unicode to be fully interchangeable across applications;
1555C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1556#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1557code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1558
1559Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1560C<L</is_utf8_string_loclen>> to check entire strings.
1561
13aab5dd
KW
1562Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1563machines) is a valid UTF-8 character.
8ed185f9
KW
1564
1565=cut
1566
1567This uses an adaptation of the table and algorithm given in
f6521f7c 1568https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1569documentation of the original version. A copyright notice for the original
1570version is given at the beginning of this file. The Perl adapation is
71525f77 1571documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1572*/
1573
1574PERL_STATIC_INLINE Size_t
c9182d9c 1575Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9 1576{
8ed185f9
KW
1577 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1578
50f7a4ce
KW
1579 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1580 DFA_RETURN_SUCCESS_,
1581 DFA_TEASE_APART_FF_,
1582 DFA_RETURN_FAILURE_);
8ed185f9 1583
50f7a4ce
KW
1584 /* Here, we didn't return success, but dropped out of the loop. In the
1585 * case of PL_extended_utf8_dfa_tab, this means the input is either
1586 * malformed, or the start byte was FF on a platform that the dfa doesn't
1587 * handle FF's. Call a helper function. */
ffea7477 1588
50f7a4ce 1589#ifdef HAS_EXTRA_LONG_UTF8
8ed185f9 1590
50f7a4ce 1591 tease_apart_FF:
8ed185f9 1592
50f7a4ce
KW
1593 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1594 * either malformed, or was for the largest possible start byte, which we
1595 * now check, not inline */
1596 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1597 return 0;
8ed185f9
KW
1598 }
1599
50f7a4ce
KW
1600 return is_utf8_FF_helper_(s0, e,
1601 FALSE /* require full, not partial char */
1602 );
8ed185f9
KW
1603#endif
1604
8ed185f9
KW
1605}
1606
1607/*
1608
67049a5f
KW
1609=for apidoc isSTRICT_UTF8_CHAR
1610
1611Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1612looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1613Unicode code point completely acceptable for open interchange between all
1614applications; otherwise it evaluates to 0. If non-zero, the value gives how
1615many bytes starting at C<s> comprise the code point's representation. Any
1616bytes remaining before C<e>, but beyond the ones needed to form the first code
1617point in C<s>, are not examined.
1618
1619The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1620be a surrogate nor a non-character code point. Thus this excludes any code
1621point from Perl's extended UTF-8.
1622
1623This is used to efficiently decide if the next few bytes in C<s> is
1624legal Unicode-acceptable UTF-8 for a single character.
1625
1626Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1627#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1628code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1629and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1630
1631Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1632C<L</is_strict_utf8_string_loclen>> to check entire strings.
1633
1634=cut
1635
1636This uses an adaptation of the tables and algorithm given in
f6521f7c 1637https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1638documentation of the original version. A copyright notice for the original
1639version is given at the beginning of this file. The Perl adapation is
1640documented at the definition of strict_extended_utf8_dfa_tab[].
1641
1642*/
1643
1644PERL_STATIC_INLINE Size_t
c9182d9c 1645Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f 1646{
67049a5f
KW
1647 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1648
50f7a4ce
KW
1649 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1650 DFA_RETURN_SUCCESS_,
1651 goto check_hanguls,
1652 DFA_RETURN_FAILURE_);
1653 check_hanguls:
67049a5f 1654
50f7a4ce
KW
1655 /* Here, we didn't return success, but dropped out of the loop. In the
1656 * case of PL_strict_utf8_dfa_tab, this means the input is either
1657 * malformed, or was for certain Hanguls; handle them specially */
67049a5f 1658
67260a96
KW
1659 /* The dfa above drops out for incomplete or illegal inputs, and certain
1660 * legal Hanguls; check and return accordingly */
1661 return is_HANGUL_ED_utf8_safe(s0, e);
67049a5f
KW
1662}
1663
1664/*
1665
44170c9a 1666=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1667
1668Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1669looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1670Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1671the value gives how many bytes starting at C<s> comprise the code point's
1672representation. Any bytes remaining before C<e>, but beyond the ones needed to
1673form the first code point in C<s>, are not examined.
1674
1675The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1676differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1677code points. This corresponds to
1678L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1679which said that non-character code points are merely discouraged rather than
1680completely forbidden in open interchange. See
1681L<perlunicode/Noncharacter code points>.
1682
1683Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1684C<L</isUTF8_CHAR_flags>> for a more customized definition.
1685
1686Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1687C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1688
1689=cut
1690
1691This uses an adaptation of the tables and algorithm given in
f6521f7c 1692https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1693documentation of the original version. A copyright notice for the original
1694version is given at the beginning of this file. The Perl adapation is
71525f77 1695documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1696
1697*/
1698
1699PERL_STATIC_INLINE Size_t
c9182d9c 1700Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64 1701{
c5bfbb64
KW
1702 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1703
50f7a4ce
KW
1704 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1705 DFA_RETURN_SUCCESS_,
1706 DFA_RETURN_FAILURE_,
1707 DFA_RETURN_FAILURE_);
c5bfbb64
KW
1708}
1709
1710/*
1711
9f2abfde
KW
1712=for apidoc is_strict_utf8_string_loc
1713
1714Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1715case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1716"utf8ness success") in the C<ep> pointer.
1717
1718See also C<L</is_strict_utf8_string_loclen>>.
1719
1720=cut
1721*/
1722
1723#define is_strict_utf8_string_loc(s, len, ep) \
1724 is_strict_utf8_string_loclen(s, len, ep, 0)
1725
1726/*
1727
1728=for apidoc is_strict_utf8_string_loclen
1729
1730Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1731case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1732"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1733encoded characters in the C<el> pointer.
1734
1735See also C<L</is_strict_utf8_string_loc>>.
1736
1737=cut
1738*/
1739
1740PERL_STATIC_INLINE bool
c9182d9c 1741Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1742{
33756530 1743 const U8 * first_variant;
9f2abfde
KW
1744
1745 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1746
33756530
KW
1747 if (len == 0) {
1748 len = strlen((const char *) s);
1749 }
1750
1751 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1752 if (el)
1753 *el = len;
1754
1755 if (ep) {
1756 *ep = s + len;
1757 }
1758
1759 return TRUE;
1760 }
1761
1762 {
1763 const U8* const send = s + len;
1764 const U8* x = first_variant;
1765 STRLEN outlen = first_variant - s;
1766
a0d7f935
KW
1767 while (x < send) {
1768 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1769 if (UNLIKELY(! cur_len)) {
1770 break;
1771 }
1772 x += cur_len;
1773 outlen++;
9f2abfde 1774 }
9f2abfde 1775
a0d7f935
KW
1776 if (el)
1777 *el = outlen;
9f2abfde 1778
a0d7f935
KW
1779 if (ep) {
1780 *ep = x;
1781 }
9f2abfde 1782
a0d7f935 1783 return (x == send);
33756530 1784 }
9f2abfde
KW
1785}
1786
1787/*
1788
1789=for apidoc is_c9strict_utf8_string_loc
1790
1791Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1792the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1793"utf8ness success") in the C<ep> pointer.
1794
1795See also C<L</is_c9strict_utf8_string_loclen>>.
1796
1797=cut
1798*/
1799
1800#define is_c9strict_utf8_string_loc(s, len, ep) \
1801 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1802
1803/*
1804
1805=for apidoc is_c9strict_utf8_string_loclen
1806
1807Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1808the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1809"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1810characters in the C<el> pointer.
1811
1812See also C<L</is_c9strict_utf8_string_loc>>.
1813
1814=cut
1815*/
1816
1817PERL_STATIC_INLINE bool
c9182d9c 1818Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1819{
33756530 1820 const U8 * first_variant;
9f2abfde
KW
1821
1822 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1823
33756530
KW
1824 if (len == 0) {
1825 len = strlen((const char *) s);
1826 }
1827
1828 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1829 if (el)
1830 *el = len;
1831
1832 if (ep) {
1833 *ep = s + len;
1834 }
1835
1836 return TRUE;
1837 }
1838
1839 {
1840 const U8* const send = s + len;
1841 const U8* x = first_variant;
1842 STRLEN outlen = first_variant - s;
1843
a0d7f935
KW
1844 while (x < send) {
1845 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1846 if (UNLIKELY(! cur_len)) {
1847 break;
1848 }
1849 x += cur_len;
1850 outlen++;
9f2abfde 1851 }
9f2abfde 1852
a0d7f935
KW
1853 if (el)
1854 *el = outlen;
9f2abfde 1855
a0d7f935
KW
1856 if (ep) {
1857 *ep = x;
1858 }
9f2abfde 1859
a0d7f935 1860 return (x == send);
33756530 1861 }
9f2abfde
KW
1862}
1863
1864/*
1865
1866=for apidoc is_utf8_string_loc_flags
1867
1868Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1869case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1870"utf8ness success") in the C<ep> pointer.
1871
1872See also C<L</is_utf8_string_loclen_flags>>.
1873
1874=cut
1875*/
1876
1877#define is_utf8_string_loc_flags(s, len, ep, flags) \
1878 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1879
1880
1881/* The above 3 actual functions could have been moved into the more general one
1882 * just below, and made #defines that call it with the right 'flags'. They are
1883 * currently kept separate to increase their chances of getting inlined */
1884
1885/*
1886
1887=for apidoc is_utf8_string_loclen_flags
1888
1889Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1890case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1891"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1892encoded characters in the C<el> pointer.
1893
1894See also C<L</is_utf8_string_loc_flags>>.
1895
1896=cut
1897*/
1898
1899PERL_STATIC_INLINE bool
c9182d9c 1900Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1901{
33756530 1902 const U8 * first_variant;
9f2abfde
KW
1903
1904 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1905 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1906 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1907
f60f61fd 1908 if (len == 0) {
a0d7f935 1909 len = strlen((const char *) s);
f60f61fd
KW
1910 }
1911
9f2abfde
KW
1912 if (flags == 0) {
1913 return is_utf8_string_loclen(s, len, ep, el);
1914 }
1915
d044b7a7 1916 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1917 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1918 {
1919 return is_strict_utf8_string_loclen(s, len, ep, el);
1920 }
1921
d044b7a7 1922 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1923 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1924 {
1925 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1926 }
1927
33756530
KW
1928 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1929 if (el)
1930 *el = len;
1931
1932 if (ep) {
1933 *ep = s + len;
1934 }
1935
1936 return TRUE;
1937 }
1938
1939 {
1940 const U8* send = s + len;
1941 const U8* x = first_variant;
1942 STRLEN outlen = first_variant - s;
1943
a0d7f935
KW
1944 while (x < send) {
1945 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1946 if (UNLIKELY(! cur_len)) {
1947 break;
1948 }
1949 x += cur_len;
1950 outlen++;
9f2abfde 1951 }
9f2abfde 1952
a0d7f935
KW
1953 if (el)
1954 *el = outlen;
9f2abfde 1955
a0d7f935
KW
1956 if (ep) {
1957 *ep = x;
1958 }
9f2abfde 1959
a0d7f935 1960 return (x == send);
33756530 1961 }
9f2abfde
KW
1962}
1963
1964/*
7c93d8f0
KW
1965=for apidoc utf8_distance
1966
1967Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1968and C<b>.
1969
1970WARNING: use only if you *know* that the pointers point inside the
1971same UTF-8 buffer.
1972
1973=cut
1974*/
1975
1976PERL_STATIC_INLINE IV
1977Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1978{
1979 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1980
1981 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1982}
1983
1984/*
1985=for apidoc utf8_hop
1986
1987Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1988forward or backward.
1989
1990WARNING: do not use the following unless you *know* C<off> is within
1991the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1992on the first byte of character or just after the last byte of a character.
1993
1994=cut
1995*/
1996
1997PERL_STATIC_INLINE U8 *
1998Perl_utf8_hop(const U8 *s, SSize_t off)
1999{
2000 PERL_ARGS_ASSERT_UTF8_HOP;
2001
2002 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2003 * the bitops (especially ~) can create illegal UTF-8.
2004 * In other words: in Perl UTF-8 is not just for Unicode. */
2005
2006 if (off >= 0) {
1604cfb0
MS
2007 while (off--)
2008 s += UTF8SKIP(s);
7c93d8f0
KW
2009 }
2010 else {
1604cfb0
MS
2011 while (off++) {
2012 s--;
2013 while (UTF8_IS_CONTINUATION(*s))
2014 s--;
2015 }
7c93d8f0 2016 }
e099ea69 2017 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 2018 return (U8 *)s;
e099ea69 2019 GCC_DIAG_RESTORE
7c93d8f0
KW
2020}
2021
4dab108f 2022/*
65df57a8
TC
2023=for apidoc utf8_hop_forward
2024
2025Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2026forward.
2027
2028C<off> must be non-negative.
2029
2030C<s> must be before or equal to C<end>.
2031
2032When moving forward it will not move beyond C<end>.
2033
2034Will not exceed this limit even if the string is not valid "UTF-8".
2035
2036=cut
2037*/
2038
2039PERL_STATIC_INLINE U8 *
2040Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2041{
2042 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2043
2044 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2045 * the bitops (especially ~) can create illegal UTF-8.
2046 * In other words: in Perl UTF-8 is not just for Unicode. */
2047
2048 assert(s <= end);
2049 assert(off >= 0);
2050
2051 while (off--) {
2052 STRLEN skip = UTF8SKIP(s);
de979548 2053 if ((STRLEN)(end - s) <= skip) {
e099ea69 2054 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2055 return (U8 *)end;
e099ea69 2056 GCC_DIAG_RESTORE
de979548 2057 }
65df57a8
TC
2058 s += skip;
2059 }
2060
e099ea69 2061 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2062 return (U8 *)s;
e099ea69 2063 GCC_DIAG_RESTORE
65df57a8
TC
2064}
2065
2066/*
2067=for apidoc utf8_hop_back
2068
2069Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2070backward.
2071
2072C<off> must be non-positive.
2073
2074C<s> must be after or equal to C<start>.
2075
2076When moving backward it will not move before C<start>.
2077
2078Will not exceed this limit even if the string is not valid "UTF-8".
2079
2080=cut
2081*/
2082
2083PERL_STATIC_INLINE U8 *
2084Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2085{
2086 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2087
9f4248c9 2088 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
65df57a8
TC
2089 * the bitops (especially ~) can create illegal UTF-8.
2090 * In other words: in Perl UTF-8 is not just for Unicode. */
2091
2092 assert(start <= s);
2093 assert(off <= 0);
2094
9f4248c9
KW
2095 /* Note: if we know that the input is well-formed, we can do per-word
2096 * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2097 * that. But it was reverted because doing per-word has some
2098 * start-up/tear-down overhead, so only makes sense if the distance to be
2099 * moved is large, and core perl doesn't currently move more than a few
2100 * characters at a time. You can reinstate it if it does become
2101 * advantageous. */
2102 while (off++ && s > start) {
e7185695 2103 do {
65df57a8 2104 s--;
e7185695 2105 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 2106 }
f6521f7c 2107
e099ea69 2108 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2109 return (U8 *)s;
e099ea69 2110 GCC_DIAG_RESTORE
65df57a8
TC
2111}
2112
2113/*
2114=for apidoc utf8_hop_safe
2115
2116Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2117either forward or backward.
2118
2119When moving backward it will not move before C<start>.
2120
2121When moving forward it will not move beyond C<end>.
2122
2123Will not exceed those limits even if the string is not valid "UTF-8".
2124
2125=cut
2126*/
2127
2128PERL_STATIC_INLINE U8 *
2129Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2130{
2131 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2132
2133 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2134 * the bitops (especially ~) can create illegal UTF-8.
2135 * In other words: in Perl UTF-8 is not just for Unicode. */
2136
2137 assert(start <= s && s <= end);
2138
2139 if (off >= 0) {
2140 return utf8_hop_forward(s, off, end);
2141 }
2142 else {
2143 return utf8_hop_back(s, off, start);
2144 }
2145}
2146
2147/*
4dab108f 2148
247cc51e 2149=for apidoc isUTF8_CHAR_flags
22f363ff
KW
2150
2151Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2152looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2153that represents some code point, subject to the restrictions given by C<flags>;
2154otherwise it evaluates to 0. If non-zero, the value gives how many bytes
2155starting at C<s> comprise the code point's representation. Any bytes remaining
2156before C<e>, but beyond the ones needed to form the first code point in C<s>,
2157are not examined.
2158
2159If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2160if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2161as C<L</isSTRICT_UTF8_CHAR>>;
2162and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2163the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2164Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2165understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2166
2167The three alternative macros are for the most commonly needed validations; they
2168are likely to run somewhat faster than this more general one, as they can be
2169inlined into your code.
2170
2171Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2172L</is_utf8_string_loclen_flags> to check entire strings.
2173
2174=cut
2175*/
2176
2177PERL_STATIC_INLINE STRLEN
2178Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2179{
2180 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2181 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2182 |UTF8_DISALLOW_PERL_EXTENDED)));
2183
2184 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2185 goto check_success,
2186 DFA_TEASE_APART_FF_,
2187 DFA_RETURN_FAILURE_);
2188
2189 check_success:
2190
1aa501c2 2191 return is_utf8_char_helper_(s0, e, flags);
22f363ff
KW
2192
2193#ifdef HAS_EXTRA_LONG_UTF8
2194
2195 tease_apart_FF:
2196
2197 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2198 * either malformed, or was for the largest possible start byte, which
2199 * indicates perl extended UTF-8, well above the Unicode maximum */
2200 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2201 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2202 {
2203 return 0;
2204 }
2205
2206 /* Otherwise examine the sequence not inline */
2207 return is_utf8_FF_helper_(s0, e,
2208 FALSE /* require full, not partial char */
2209 );
2210#endif
2211
2212}
2213
2214/*
2215
4dab108f
KW
2216=for apidoc is_utf8_valid_partial_char
2217
6cbb9248
KW
2218Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2219S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2220points. Otherwise, it returns 1 if there exists at least one non-empty
2221sequence of bytes that when appended to sequence C<s>, starting at position
2222C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2223otherwise returns 0.
2224
2225In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2226point.
2227
2228This is useful when a fixed-length buffer is being tested for being well-formed
2229UTF-8, but the final few bytes in it don't comprise a full character; that is,
2230it is split somewhere in the middle of the final code point's UTF-8
2231representation. (Presumably when the buffer is refreshed with the next chunk
2232of data, the new first bytes will complete the partial code point.) This
2233function is used to verify that the final bytes in the current buffer are in
2234fact the legal beginning of some code point, so that if they aren't, the
2235failure can be signalled without having to wait for the next read.
4dab108f
KW
2236
2237=cut
2238*/
2717076a
KW
2239#define is_utf8_valid_partial_char(s, e) \
2240 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
2241
2242/*
2243
2244=for apidoc is_utf8_valid_partial_char_flags
2245
2246Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2247or not the input is a valid UTF-8 encoded partial character, but it takes an
2248extra parameter, C<flags>, which can further restrict which code points are
2249considered valid.
2250
2251If C<flags> is 0, this behaves identically to
2252C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
2253of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
2254there is any sequence of bytes that can complete the input partial character in
2255such a way that a non-prohibited character is formed, the function returns
2717076a
KW
2256TRUE; otherwise FALSE. Non character code points cannot be determined based on
2257partial character input. But many of the other possible excluded types can be
f1c999a7
KW
2258determined from just the first one or two bytes.
2259
2260=cut
2261 */
2262
56e4cf64 2263PERL_STATIC_INLINE bool
22afef87 2264Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
4dab108f 2265{
f1c999a7 2266 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
f1c999a7 2267 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 2268 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 2269
22afef87
KW
2270 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2271 DFA_RETURN_FAILURE_,
2272 DFA_TEASE_APART_FF_,
2273 NOOP);
2274
2275 /* The NOOP above causes the DFA to drop down here iff the input was a
2276 * partial character. flags=0 => can return TRUE immediately; otherwise we
2277 * need to check (not inline) if the partial character is the beginning of
2278 * a disallowed one */
2279 if (flags == 0) {
2280 return TRUE;
2281 }
2282
1aa501c2 2283 return cBOOL(is_utf8_char_helper_(s0, e, flags));
22afef87
KW
2284
2285#ifdef HAS_EXTRA_LONG_UTF8
2286
2287 tease_apart_FF:
2288
2289 /* Getting here means the input is either malformed, or, in the case of
2290 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
2291 * latter case has to be extended UTF-8, so can fail immediately if that is
2292 * forbidden */
2293
2294 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2295 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2296 {
2297 return 0;
4dab108f
KW
2298 }
2299
22afef87
KW
2300 return is_utf8_FF_helper_(s0, e,
2301 TRUE /* Require to be a partial character */
2302 );
2303#endif
2304
4dab108f
KW
2305}
2306
8bc127bf
KW
2307/*
2308
2309=for apidoc is_utf8_fixed_width_buf_flags
2310
2311Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2312is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2313otherwise it returns FALSE.
2314
2315If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2316without restriction. If the final few bytes of the buffer do not form a
2317complete code point, this will return TRUE anyway, provided that
2318C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2319
2320If C<flags> in non-zero, it can be any combination of the
2321C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2322same meanings.
2323
2324This function differs from C<L</is_utf8_string_flags>> only in that the latter
2325returns FALSE if the final few bytes of the string don't form a complete code
2326point.
2327
2328=cut
2329 */
2330#define is_utf8_fixed_width_buf_flags(s, len, flags) \
2331 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2332
2333/*
2334
2335=for apidoc is_utf8_fixed_width_buf_loc_flags
2336
2337Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2338failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
2339to the beginning of any partial character at the end of the buffer; if there is
2340no partial character C<*ep> will contain C<s>+C<len>.
2341
2342See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2343
2344=cut
2345*/
2346
2347#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
2348 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2349
2350/*
2351
2352=for apidoc is_utf8_fixed_width_buf_loclen_flags
2353
2354Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2355complete, valid characters found in the C<el> pointer.
2356
2357=cut
2358*/
2359
2360PERL_STATIC_INLINE bool
c9182d9c 2361Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 2362 STRLEN len,
8bc127bf
KW
2363 const U8 **ep,
2364 STRLEN *el,
2365 const U32 flags)
2366{
2367 const U8 * maybe_partial;
2368
2369 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2370
2371 if (! ep) {
2372 ep = &maybe_partial;
2373 }
2374
2375 /* If it's entirely valid, return that; otherwise see if the only error is
2376 * that the final few bytes are for a partial character */
2377 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
2378 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2379}
2380
e6a4ffc3 2381PERL_STATIC_INLINE UV
c9182d9c 2382Perl_utf8n_to_uvchr_msgs(const U8 *s,
59c18386
KW
2383 STRLEN curlen,
2384 STRLEN *retlen,
2385 const U32 flags,
2386 U32 * errors,
2387 AV ** msgs)
e6a4ffc3
KW
2388{
2389 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
2390 * simple cases, and, if necessary calls a helper function to deal with the
2391 * more complex ones. Almost all well-formed non-problematic code points
2392 * are considered simple, so that it's unlikely that the helper function
2393 * will need to be called.
2394 *
2395 * This is an adaptation of the tables and algorithm given in
f6521f7c 2396 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
2397 * comprehensive documentation of the original version. A copyright notice
2398 * for the original version is given at the beginning of this file. The
71525f77 2399 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
2400 */
2401
2402 const U8 * const s0 = s;
2403 const U8 * send = s0 + curlen;
a4609251
KW
2404 UV type;
2405 UV uv;
e6a4ffc3
KW
2406
2407 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2408
2409 /* This dfa is fast. If it accepts the input, it was for a well-formed,
2410 * non-problematic code point, which can be returned immediately.
2411 * Otherwise we call a helper function to figure out the more complicated
2412 * cases. */
2413
a4609251 2414 /* No calls from core pass in an empty string; non-core need a check */
d1e771d8
KW
2415#ifdef PERL_CORE
2416 assert(curlen > 0);
2417#else
2418 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2419 flags, errors, msgs);
2420#endif
e6a4ffc3 2421
a4609251 2422 type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3 2423
a4609251
KW
2424 /* The table is structured so that 'type' is 0 iff the input byte is
2425 * represented identically regardless of the UTF-8ness of the string */
2426 if (type == 0) { /* UTF-8 invariants are returned unchanged */
2427 uv = *s;
2428 }
2429 else {
2430 UV state = PL_strict_utf8_dfa_tab[256 + type];
2431 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
e6a4ffc3 2432
a4609251
KW
2433 while (++s < send) {
2434 type = PL_strict_utf8_dfa_tab[*s];
2435 state = PL_strict_utf8_dfa_tab[256 + state + type];
2436
2437 uv = UTF8_ACCUMULATE(uv, *s);
2438
2439 if (state == 0) {
b9f49957
KW
2440#ifdef EBCDIC
2441 uv = UNI_TO_NATIVE(uv);
2442#endif
a4609251
KW
2443 goto success;
2444 }
2445
2446 if (UNLIKELY(state == 1)) {
2447 break;
2448 }
e6a4ffc3
KW
2449 }
2450
a4609251
KW
2451 /* Here is potentially problematic. Use the full mechanism */
2452 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2453 errors, msgs);
2454 }
2455
2456 success:
2457 if (retlen) {
2458 *retlen = s - s0 + 1;
2459 }
2460 if (errors) {
2461 *errors = 0;
2462 }
2463 if (msgs) {
2464 *msgs = NULL;
e6a4ffc3
KW
2465 }
2466
b9f49957 2467 return uv;
e6a4ffc3
KW
2468}
2469
82651abe 2470PERL_STATIC_INLINE UV
9a9a6c98 2471Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 2472{
9a9a6c98 2473 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
2474
2475 assert(s < send);
2476
2477 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
2478
2479 /* EMPTY is not really allowed, and asserts on debugging builds. But
2480 * on non-debugging we have to deal with it, and this causes it to
2481 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 2482 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 2483 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
2484 }
2485 else {
2486 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
286a1bfd 2487 if (retlen && ret == 0 && (send <= s || *s != '\0')) {
82651abe
KW
2488 *retlen = (STRLEN) -1;
2489 }
2490
2491 return ret;
2492 }
2493}
2494
c8028aa6
TC
2495/* ------------------------------- perl.h ----------------------------- */
2496
2497/*
3f620621 2498=for apidoc_section $utility
dcccc8ff 2499
44170c9a 2500=for apidoc is_safe_syscall
c8028aa6 2501
1a0efc9a
KW
2502Test that the given C<pv> (with length C<len>) doesn't contain any internal
2503C<NUL> characters.
2504If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2505category, and return FALSE.
c8028aa6
TC
2506
2507Return TRUE if the name is safe.
2508
1a0efc9a
KW
2509C<what> and C<op_name> are used in any warning.
2510
796b6530 2511Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
2512
2513=cut
2514*/
2515
2516PERL_STATIC_INLINE bool
ffd62fc2
KW
2517Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2518{
c8028aa6
TC
2519 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2520 * perl itself uses xce*() functions which accept 8-bit strings.
2521 */
2522
2523 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2524
6c4650b3 2525 if (len > 1) {
c8028aa6 2526 char *null_at;
41188aa0 2527 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 2528 SETERRNO(ENOENT, LIB_INVARG);
1d505182 2529 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 2530 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 2531 what, op_name, pv, null_at+1);
c8028aa6
TC
2532 return FALSE;
2533 }
2534 }
2535
2536 return TRUE;
2537}
2538
2539/*
7cb3f959
TC
2540
2541Return true if the supplied filename has a newline character
fa6c7d00 2542immediately before the first (hopefully only) NUL.
7cb3f959
TC
2543
2544My original look at this incorrectly used the len from SvPV(), but
2545that's incorrect, since we allow for a NUL in pv[len-1].
2546
2547So instead, strlen() and work from there.
2548
2549This allow for the user reading a filename, forgetting to chomp it,
2550then calling:
2551
2552 open my $foo, "$file\0";
2553
2554*/
2555
2556#ifdef PERL_CORE
2557
2558PERL_STATIC_INLINE bool
ffd62fc2
KW
2559S_should_warn_nl(const char *pv)
2560{
7cb3f959
TC
2561 STRLEN len;
2562
2563 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2564
2565 len = strlen(pv);
2566
2567 return len > 0 && pv[len-1] == '\n';
2568}
2569
2570#endif
2571
3a019afd
KW
2572#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2573
2574PERL_STATIC_INLINE bool
2575S_lossless_NV_to_IV(const NV nv, IV *ivp)
2576{
2577 /* This function determines if the input NV 'nv' may be converted without
2578 * loss of data to an IV. If not, it returns FALSE taking no other action.
2579 * But if it is possible, it does the conversion, returning TRUE, and
2580 * storing the converted result in '*ivp' */
2581
2582 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2583
cd304e76
DM
2584# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2585 /* Normally any comparison with a NaN returns false; if we can't rely
2586 * on that behaviour, check explicitly */
3a019afd
KW
2587 if (UNLIKELY(Perl_isnan(nv))) {
2588 return FALSE;
2589 }
3a019afd
KW
2590# endif
2591
cd304e76
DM
2592 /* Written this way so that with an always-false NaN comparison we
2593 * return false */
ef0a8475 2594 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
3a019afd
KW
2595 return FALSE;
2596 }
2597
2598 if ((IV) nv != nv) {
2599 return FALSE;
2600 }
2601
2602 *ivp = (IV) nv;
2603 return TRUE;
2604}
2605
2606#endif
2607
81d52ecd
JH
2608/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2609
94b0cb42
KW
2610#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2611
81d52ecd
JH
2612#define MAX_CHARSET_NAME_LENGTH 2
2613
2614PERL_STATIC_INLINE const char *
94b0cb42 2615S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2616{
94b0cb42
KW
2617 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2618
81d52ecd
JH
2619 /* Returns a string that corresponds to the name of the regex character set
2620 * given by 'flags', and *lenp is set the length of that string, which
2621 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2622
2623 *lenp = 1;
2624 switch (get_regex_charset(flags)) {
2625 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2626 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2627 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
1604cfb0
MS
2628 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2629 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2630 *lenp = 2;
2631 return ASCII_MORE_RESTRICT_PAT_MODS;
81d52ecd
JH
2632 }
2633 /* The NOT_REACHED; hides an assert() which has a rather complex
2634 * definition in perl.h. */
2635 NOT_REACHED; /* NOTREACHED */
2636 return "?"; /* Unknown */
2637}
2638
94b0cb42
KW
2639#endif
2640
7cb3f959 2641/*
ed382232
TC
2642
2643Return false if any get magic is on the SV other than taint magic.
2644
2645*/
2646
2647PERL_STATIC_INLINE bool
ffd62fc2
KW
2648Perl_sv_only_taint_gmagic(SV *sv)
2649{
ed382232
TC
2650 MAGIC *mg = SvMAGIC(sv);
2651
2652 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2653
2654 while (mg) {
2655 if (mg->mg_type != PERL_MAGIC_taint
2656 && !(mg->mg_flags & MGf_GSKIP)
2657 && mg->mg_virtual->svt_get) {
2658 return FALSE;
2659 }
2660 mg = mg->mg_moremagic;
2661 }
2662
2663 return TRUE;
2664}
2665
ed8ff0f3
DM
2666/* ------------------ cop.h ------------------------------------------- */
2667
5b6f7443
DM
2668/* implement GIMME_V() macro */
2669
2670PERL_STATIC_INLINE U8
2671Perl_gimme_V(pTHX)
2672{
2673 I32 cxix;
2674 U8 gimme = (PL_op->op_flags & OPf_WANT);
2675
2676 if (gimme)
2677 return gimme;
2678 cxix = PL_curstackinfo->si_cxsubix;
2679 if (cxix < 0)
390fe0c0 2680 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
5b6f7443
DM
2681 assert(cxstack[cxix].blk_gimme & G_WANT);
2682 return (cxstack[cxix].blk_gimme & G_WANT);
2683}
2684
ed8ff0f3
DM
2685
2686/* Enter a block. Push a new base context and return its address. */
2687
2688PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2689Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2690{
2691 PERL_CONTEXT * cx;
2692
2693 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2694
2695 CXINC;
2696 cx = CX_CUR();
2697 cx->cx_type = type;
2698 cx->blk_gimme = gimme;
2699 cx->blk_oldsaveix = saveix;
4caf7d8c 2700 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2701 cx->blk_oldcop = PL_curcop;
4caf7d8c 2702 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2703 cx->blk_oldscopesp = PL_scopestack_ix;
2704 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2705 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2706
2707 PL_tmps_floor = PL_tmps_ix;
2708 CX_DEBUG(cx, "PUSH");
2709 return cx;
2710}
2711
2712
2713/* Exit a block (RETURN and LAST). */
2714
2715PERL_STATIC_INLINE void
c9182d9c 2716Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2717{
2718 PERL_ARGS_ASSERT_CX_POPBLOCK;
2719
2720 CX_DEBUG(cx, "POP");
2721 /* these 3 are common to cx_popblock and cx_topblock */
2722 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2723 PL_scopestack_ix = cx->blk_oldscopesp;
2724 PL_curpm = cx->blk_oldpm;
2725
2726 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2727 * and leaves a CX entry lying around for repeated use, so
2728 * skip for multicall */ \
2729 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2730 || PL_savestack_ix == cx->blk_oldsaveix);
2731 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2732 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2733}
2734
2735/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2736 * Whereas cx_popblock() restores the state to the point just before
2737 * cx_pushblock() was called, cx_topblock() restores it to the point just
2738 * *after* cx_pushblock() was called. */
2739
2740PERL_STATIC_INLINE void
c9182d9c 2741Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2742{
2743 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2744
2745 CX_DEBUG(cx, "TOP");
2746 /* these 3 are common to cx_popblock and cx_topblock */
2747 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2748 PL_scopestack_ix = cx->blk_oldscopesp;
2749 PL_curpm = cx->blk_oldpm;
2750
2751 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2752}
2753
2754
a73d8813 2755PERL_STATIC_INLINE void
c9182d9c 2756Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2757{
2758 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2759
2760 PERL_ARGS_ASSERT_CX_PUSHSUB;
2761
3f6bd23a 2762 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2763 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2764 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2765 cx->blk_sub.cv = cv;
2766 cx->blk_sub.olddepth = CvDEPTH(cv);
2767 cx->blk_sub.prevcomppad = PL_comppad;
2768 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2769 cx->blk_sub.retop = retop;
2770 SvREFCNT_inc_simple_void_NN(cv);
2771 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2772}
2773
2774
2775/* subsets of cx_popsub() */
2776
2777PERL_STATIC_INLINE void
c9182d9c 2778Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2779{
2780 CV *cv;
2781
2782 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2783 assert(CxTYPE(cx) == CXt_SUB);
2784
2785 PL_comppad = cx->blk_sub.prevcomppad;
2786 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2787 cv = cx->blk_sub.cv;
2788 CvDEPTH(cv) = cx->blk_sub.olddepth;
2789 cx->blk_sub.cv = NULL;
2790 SvREFCNT_dec(cv);
5b6f7443 2791 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2792}
2793
2794
2795/* handle the @_ part of leaving a sub */
2796
2797PERL_STATIC_INLINE void
c9182d9c 2798Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2799{
2800 AV *av;
2801
2802 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2803 assert(CxTYPE(cx) == CXt_SUB);
2804 assert(AvARRAY(MUTABLE_AV(
2805 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2806 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2807
2808 CX_POP_SAVEARRAY(cx);
2809 av = MUTABLE_AV(PAD_SVl(0));
2810 if (UNLIKELY(AvREAL(av)))
2811 /* abandon @_ if it got reified */
2812 clear_defarray(av, 0);
2813 else {
2814 CLEAR_ARGARRAY(av);
2815 }
2816}
2817
2818
2819PERL_STATIC_INLINE void
c9182d9c 2820Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2821{
2822 PERL_ARGS_ASSERT_CX_POPSUB;
2823 assert(CxTYPE(cx) == CXt_SUB);
2824
3f6bd23a 2825 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2826
2827 if (CxHASARGS(cx))
2828 cx_popsub_args(cx);
2829 cx_popsub_common(cx);
2830}
2831
2832
6a7d52cc 2833PERL_STATIC_INLINE void
c9182d9c 2834Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2835{
2836 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2837
5b6f7443
DM
2838 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2839 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2840 cx->blk_format.cv = cv;
2841 cx->blk_format.retop = retop;
2842 cx->blk_format.gv = gv;
2843 cx->blk_format.dfoutgv = PL_defoutgv;
2844 cx->blk_format.prevcomppad = PL_comppad;
2845 cx->blk_u16 = 0;
2846
2847 SvREFCNT_inc_simple_void_NN(cv);
2848 CvDEPTH(cv)++;
2849 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2850}
2851
2852
2853PERL_STATIC_INLINE void
c9182d9c 2854Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2855{
2856 CV *cv;
2857 GV *dfout;
2858
2859 PERL_ARGS_ASSERT_CX_POPFORMAT;
2860 assert(CxTYPE(cx) == CXt_FORMAT);
2861
2862 dfout = cx->blk_format.dfoutgv;
2863 setdefout(dfout);
2864 cx->blk_format.dfoutgv = NULL;
2865 SvREFCNT_dec_NN(dfout);
2866
2867 PL_comppad = cx->blk_format.prevcomppad;
2868 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2869 cv = cx->blk_format.cv;
2870 cx->blk_format.cv = NULL;
2871 --CvDEPTH(cv);
2872 SvREFCNT_dec_NN(cv);
5b6f7443 2873 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2874}
2875
2876
13febba5 2877PERL_STATIC_INLINE void
6b729d24 2878Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5 2879{
13febba5
DM
2880 cx->blk_eval.retop = retop;
2881 cx->blk_eval.old_namesv = namesv;
2882 cx->blk_eval.old_eval_root = PL_eval_root;
2883 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2884 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2885 cx->blk_eval.cur_top_env = PL_top_env;
2886
4c57ced5 2887 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2888 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2889 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2890}
2891
6b729d24
TC
2892PERL_STATIC_INLINE void
2893Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2894{
2895 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2896
2897 Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2898
2899 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2900 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2901}
2902
2903PERL_STATIC_INLINE void
2904Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2905{
2906 PERL_ARGS_ASSERT_CX_PUSHTRY;
2907
2908 Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2909
2910 /* Don't actually change it, just store the current value so it's restored
2911 * by the common popeval */
2912 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2913}
2914
13febba5
DM
2915
2916PERL_STATIC_INLINE void
c9182d9c 2917Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2918{
2919 SV *sv;
2920
2921 PERL_ARGS_ASSERT_CX_POPEVAL;
2922 assert(CxTYPE(cx) == CXt_EVAL);
2923
2924 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2925 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2926 PL_eval_root = cx->blk_eval.old_eval_root;
2927 sv = cx->blk_eval.cur_text;
4c57ced5 2928 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2929 cx->blk_eval.cur_text = NULL;
2930 SvREFCNT_dec_NN(sv);
2931 }
2932
2933 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2934 if (sv) {
2935 cx->blk_eval.old_namesv = NULL;
2936 SvREFCNT_dec_NN(sv);
2937 }
5b6f7443 2938 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2939}
6a7d52cc 2940
a73d8813 2941
d1b6bf72
DM
2942/* push a plain loop, i.e.
2943 * { block }
2944 * while (cond) { block }
2945 * for (init;cond;continue) { block }
2946 * This loop can be last/redo'ed etc.
2947 */
2948
2949PERL_STATIC_INLINE void
c9182d9c 2950Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2951{
2952 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2953 cx->blk_loop.my_op = cLOOP;
2954}
2955
2956
2957/* push a true for loop, i.e.
2958 * for var (list) { block }
2959 */
2960
2961PERL_STATIC_INLINE void
c9182d9c 2962Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2963{
2964 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2965
2966 /* this one line is common with cx_pushloop_plain */
2967 cx->blk_loop.my_op = cLOOP;
2968
2969 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2970 cx->blk_loop.itersave = itersave;
2971#ifdef USE_ITHREADS
2972 cx->blk_loop.oldcomppad = PL_comppad;
2973#endif
2974}
2975
2976
2977/* pop all loop types, including plain */
2978
2979PERL_STATIC_INLINE void
c9182d9c 2980Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2981{
2982 PERL_ARGS_ASSERT_CX_POPLOOP;
2983
2984 assert(CxTYPE_is_LOOP(cx));
2985 if ( CxTYPE(cx) == CXt_LOOP_ARY
2986 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2987 {
2988 /* Free ary or cur. This assumes that state_u.ary.ary
2989 * aligns with state_u.lazysv.cur. See cx_dup() */
2990 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2991 cx->blk_loop.state_u.lazysv.cur = NULL;
2992 SvREFCNT_dec_NN(sv);
2993 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2994 sv = cx->blk_loop.state_u.lazysv.end;
2995 cx->blk_loop.state_u.lazysv.end = NULL;
2996 SvREFCNT_dec_NN(sv);
2997 }
2998 }
2999 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3000 SV *cursv;
3001 SV **svp = (cx)->blk_loop.itervar_u.svp;
3002 if ((cx->cx_type & CXp_FOR_GV))
3003 svp = &GvSV((GV*)svp);
3004 cursv = *svp;
3005 *svp = cx->blk_loop.itersave;
3006 cx->blk_loop.itersave = NULL;
3007 SvREFCNT_dec(cursv);
3008 }
3009}
3010
2a7b7c61
DM
3011
3012PERL_STATIC_INLINE void
c9182d9c 3013Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 3014{
7896dde7 3015 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 3016
7896dde7 3017 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
3018}
3019
3020
3021PERL_STATIC_INLINE void
c9182d9c 3022Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 3023{
7896dde7
Z
3024 PERL_ARGS_ASSERT_CX_POPWHEN;
3025 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
3026
3027 PERL_UNUSED_ARG(cx);
59a14f30 3028 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
3029 /* currently NOOP */
3030}
3031
3032
7896dde7 3033PERL_STATIC_INLINE void
c9182d9c 3034Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
3035{
3036 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3037
3038 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3039 cx->blk_givwhen.defsv_save = orig_defsv;
3040}
3041
3042
3043PERL_STATIC_INLINE void
c9182d9c 3044Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
3045{
3046 SV *sv;
3047
3048 PERL_ARGS_ASSERT_CX_POPGIVEN;
3049 assert(CxTYPE(cx) == CXt_GIVEN);
3050
3051 sv = GvSV(PL_defgv);
3052 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3053 cx->blk_givwhen.defsv_save = NULL;
3054 SvREFCNT_dec(sv);
3055}
3056
ec2c235b
KW
3057/* ------------------ util.h ------------------------------------------- */
3058
3059/*
3f620621 3060=for apidoc_section $string
ec2c235b
KW
3061
3062=for apidoc foldEQ
3063
3064Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3065same
3066case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
3067match themselves and their opposite case counterparts. Non-cased and non-ASCII
3068range bytes match only themselves.
3069
3070=cut
3071*/
3072
3073PERL_STATIC_INLINE I32
c3c9077b 3074Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
ec2c235b
KW
3075{
3076 const U8 *a = (const U8 *)s1;
3077 const U8 *b = (const U8 *)s2;
3078
3079 PERL_ARGS_ASSERT_FOLDEQ;
3080
3081 assert(len >= 0);
3082
3083 while (len--) {
1604cfb0
MS
3084 if (*a != *b && *a != PL_fold[*b])
3085 return 0;
3086 a++,b++;
ec2c235b
KW
3087 }
3088 return 1;
3089}
3090
0f9cb40c 3091PERL_STATIC_INLINE I32
c3c9077b 3092Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
ec2c235b 3093{
79a1fabd
KW
3094 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
3095 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3096 * does not check for this. Nor does it check that the strings each have
3097 * at least 'len' characters. */
ec2c235b
KW
3098
3099 const U8 *a = (const U8 *)s1;
3100 const U8 *b = (const U8 *)s2;
3101
3102 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3103
3104 assert(len >= 0);
3105
3106 while (len--) {
1604cfb0
MS
3107 if (*a != *b && *a != PL_fold_latin1[*b]) {
3108 return 0;
3109 }
3110 a++, b++;
ec2c235b
KW
3111 }
3112 return 1;
3113}
3114
3115/*
3f620621 3116=for apidoc_section $locale
ec2c235b
KW
3117=for apidoc foldEQ_locale
3118
3119Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3120same case-insensitively in the current locale; false otherwise.
3121
3122=cut
3123*/
3124
0f9cb40c 3125PERL_STATIC_INLINE I32
c3c9077b 3126Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
ec2c235b 3127{
ec2c235b
KW
3128 const U8 *a = (const U8 *)s1;
3129 const U8 *b = (const U8 *)s2;
3130
3131 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3132
3133 assert(len >= 0);
3134
3135 while (len--) {
1604cfb0
MS
3136 if (*a != *b && *a != PL_fold_locale[*b])
3137 return 0;
3138 a++,b++;
ec2c235b
KW
3139 }
3140 return 1;
3141}
3142
1ab100a8 3143/*
3f620621 3144=for apidoc_section $string
1ab100a8
KW
3145=for apidoc my_strnlen
3146
3147The C library C<strnlen> if available, or a Perl implementation of it.
3148
3149C<my_strnlen()> computes the length of the string, up to C<maxlen>
a3815e44 3150characters. It will never attempt to address more than C<maxlen>
1ab100a8
KW
3151characters, making it suitable for use with strings that are not
3152guaranteed to be NUL-terminated.
3153
3154=cut
3155
3156Description stolen from http://man.openbsd.org/strnlen.3,
3157implementation stolen from PostgreSQL.
3158*/
3159#ifndef HAS_STRNLEN
3160
3161PERL_STATIC_INLINE Size_t
3162Perl_my_strnlen(const char *str, Size_t maxlen)
3163{
3164 const char *end = (char *) memchr(str, '\0', maxlen);
3165
3166 PERL_ARGS_ASSERT_MY_STRNLEN;
3167
3168 if (end == NULL) return maxlen;
3169 return end - str;
3170}
3171
3172#endif
3173
6dba01e2
KW
3174#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3175
3176PERL_STATIC_INLINE void *
3177S_my_memrchr(const char * s, const char c, const STRLEN len)
3178{
3179 /* memrchr(), since many platforms lack it */
3180
3181 const char * t = s + len - 1;
3182
3183 PERL_ARGS_ASSERT_MY_MEMRCHR;
3184
3185 while (t >= s) {
3186 if (*t == c) {
3187 return (void *) t;
3188 }
3189 t--;
3190 }
3191
3192 return NULL;
3193}
3194
3195#endif
3196
24f3e849
KW
3197PERL_STATIC_INLINE char *
3198Perl_mortal_getenv(const char * str)
3199{
3200 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3201 *
03694582
KW
3202 * It's (mostly) thread-safe because it uses a mutex to prevent other
3203 * threads (that look at this mutex) from destroying the result before this
3204 * routine has a chance to copy the result to a place that won't be
3205 * destroyed before the caller gets a chance to handle it. That place is a
3206 * mortal SV. khw chose this over SAVEFREEPV because he is under the
3207 * impression that the SV will hang around longer under more circumstances
24f3e849 3208 *
03694582
KW
3209 * The reason it isn't completely thread-safe is that other code could
3210 * simply not pay attention to the mutex. All of the Perl core uses the
3211 * mutex, but it is possible for code from, say XS, to not use this mutex,
3212 * defeating the safety.
24f3e849 3213 *
03694582
KW
3214 * getenv() returns, in some implementations, a pointer to a spot in the
3215 * **environ array, which could be invalidated at any time by this or
3216 * another thread changing the environment. Other implementations copy the
3217 * **environ value to a static buffer, returning a pointer to that. That
3218 * buffer might or might not be invalidated by a getenv() call in another
3219 * thread. If it does get zapped, we need an exclusive lock. Otherwise,
3220 * many getenv() calls can safely be running simultaneously, so a
3221 * many-reader (but no simultaneous writers) lock is ok. There is a
3222 * Configure probe to see if another thread destroys the buffer, and the
3223 * mutex is defined accordingly.
3224 *
3225 * But in all cases, using the mutex prevents these problems, as long as
57681073 3226 * all code uses the same mutex.
24f3e849
KW
3227 *
3228 * A complication is that this can be called during phases where the
3229 * mortalization process isn't available. These are in interpreter
3230 * destruction or early in construction. khw believes that at these times
3231 * there shouldn't be anything else going on, so plain getenv is safe AS
3232 * LONG AS the caller acts on the return before calling it again. */
3233
3234 char * ret;
3235 dTHX;
3236
3237 PERL_ARGS_ASSERT_MORTAL_GETENV;
3238
3239 /* Can't mortalize without stacks. khw believes that no other threads
3240 * should be running, so no need to lock things, and this may be during a
3241 * phase when locking isn't even available */
3242 if (UNLIKELY(PL_scopestack_ix == 0)) {
3243 return getenv(str);
3244 }
3245
03694582
KW
3246#ifdef PERL_MEM_LOG
3247
3248 /* A major complication arises under PERL_MEM_LOG. When that is active,
3249 * every memory allocation may result in logging, depending on the value of
3250 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
3251 * saving ENV{foo}'s value (but before saving it), the logging code will
3252 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
3253 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3254 * lock a boolean mutex recursively); 3) destroying the getenv() static
3255 * buffer; or 4) destroying the temporary created by this for the copy
3256 * causes a log entry to be made which could cause a new temporary to be
3257 * created, which will need to be destroyed at some point, leading to an
3258 * infinite loop.
3259 *
3260 * The solution adopted here (after some gnashing of teeth) is to detect
3261 * the recursive calls and calls from the logger, and treat them specially.
3262 * Let's say we want to do getenv("foo"). We first find
3263 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3264 * variable, so no temporary is required. Then we do getenv(foo}, and in
3265 * the process of creating a temporary to save it, this function will be
3266 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
3267 * we detect that it is such a call and return our saved value instead of
3268 * locking and doing a new getenv(). This solves all of problems 1), 2),
3269 * and 3). Because all the getenv()s are done while the mutex is locked,
3270 * the state cannot have changed. To solve 4), we don't create a temporary
3271 * when this is called from the logging code. That code disposes of the
3272 * return value while the mutex is still locked.
3273 *
3274 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3275 * digits and 3 particular letters are significant; the rest are ignored by
3276 * the memory logging code. Thus the per-interpreter variable only needs
3277 * to be large enough to save the significant information, the size of
3278 * which is known at compile time. The first byte is extra, reserved for
3279 * flags for our use. To protect against overflowing, only the reserved
3280 * byte, as many digits as don't overflow, and the three letters are
3281 * stored.
3282 *
3283 * The reserved byte has two bits:
3284 * 0x1 if set indicates that if we get here, it is a recursive call of
3285 * getenv()
3286 * 0x2 if set indicates that the call is from the logging code.
3287 *
3288 * If the flag indicates this is a recursive call, just return the stored
3289 * value of PL_mem_log; An empty value gets turned into NULL. */
3290 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3291 if (PL_mem_log[1] == '\0') {
3292 return NULL;
3293 } else {
3294 return PL_mem_log + 1;
3295 }
3296 }
3297
3298#endif
3299
35bcf7ff 3300 GETENV_LOCK;
24f3e849 3301
03694582
KW
3302#ifdef PERL_MEM_LOG
3303
3304 /* Here we are in a critical section. As explained above, we do our own
3305 * getenv(PERL_MEM_LOG), saving the result safely. */
3306 ret = getenv("PERL_MEM_LOG");
3307 if (ret == NULL) { /* No logging active */
3308
3309 /* Return that immediately if called from the logging code */
3310 if (PL_mem_log[0] & 0x2) {
3311 GETENV_UNLOCK;
3312 return NULL;
3313 }
3314
3315 PL_mem_log[1] = '\0';
3316 }
3317 else {
3318 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
3319
3320 /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3321 * extremely long string. But we want only a few characters from it.
3322 * PL_mem_log has been made large enough to hold just the ones we need.
3323 * First the file descriptor. */
3324 if (isDIGIT(*ret)) {
3325 const char * s = ret;
3326 if (UNLIKELY(*s == '0')) {
3327
3328 /* Reduce multiple leading zeros to a single one. This is to
3329 * allow the caller to change what to do with leading zeros. */
3330 *mem_log_meat++ = '0';
3331 s++;
3332 while (*s == '0') {
3333 s++;
3334 }
3335 }
3336
3337 /* If the input overflows, copy just enough for the result to also
3338 * overflow, plus 1 to make sure */
3339 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3340 *mem_log_meat++ = *s++;
3341 }
3342 }
3343
467fdaa2 3344 /* Then each of the four significant characters */
03694582
KW
3345 if (strchr(ret, 'm')) {
3346 *mem_log_meat++ = 'm';
3347 }
3348 if (strchr(ret, 's')) {
3349 *mem_log_meat++ = 's';
3350 }
3351 if (strchr(ret, 't')) {
3352 *mem_log_meat++ = 't';
3353 }
467fdaa2
PE
3354 if (strchr(ret, 'c')) {
3355 *mem_log_meat++ = 'c';
3356 }
03694582
KW
3357 *mem_log_meat = '\0';
3358
3359 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3360 }
3361
3362 /* If we are being called from the logger, it only needs the significant
3363 * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3364 if (PL_mem_log[0] & 0x2) {
3365 assert(strEQ(str, "PERL_MEM_LOG"));
3366 GETENV_UNLOCK;
3367 return PL_mem_log + 1;
3368 }
3369
3370 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
3371 * is coming from other than the logging code, so it should be treated the
3372 * same as any other getenv(), returning the full value, not just the
3373 * significant part, and having its value saved. Set the flag that
3374 * indicates any call to this routine will be a recursion from here */
3375 PL_mem_log[0] = 0x1;
3376
3377#endif
3378
3379 /* Now get the value of the real desired variable, and save a copy */
24f3e849
KW
3380 ret = getenv(str);
3381
3382 if (ret != NULL) {
c80a8618 3383 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
24f3e849
KW
3384 }
3385
35bcf7ff
KW
3386 GETENV_UNLOCK;
3387
03694582
KW
3388#ifdef PERL_MEM_LOG
3389
3390 /* Clear the buffer */
3391 Zero(PL_mem_log, sizeof(PL_mem_log), char);
3392
3393#endif
3394
24f3e849
KW
3395 return ret;
3396}
3397
1d0d673f
PE
3398PERL_STATIC_INLINE bool
3399Perl_sv_isbool(pTHX_ const SV *sv)
3400{
57e785fd
YO
3401 /* change to the following in 5.37, logically the same but
3402 * more efficient and more future proof */
3403#if 0
3404 return (SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv));
3405#else
3406 return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) &&
3407 (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No);
3408#endif
3409
1d0d673f
PE
3410}
3411
9c913148
TC
3412#ifdef USE_ITHREADS
3413
3414PERL_STATIC_INLINE AV *
3415Perl_cop_file_avn(pTHX_ const COP *cop) {
3416
3417 PERL_ARGS_ASSERT_COP_FILE_AVN;
3418
3419 const char *file = CopFILE(cop);
3420 if (file) {
3421 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3422 if (gv) {
3423 return GvAVn(gv);
3424 }
3425 else
3426 return NULL;
3427 }
3428 else
3429 return NULL;
3430}
3431
3432#endif
3433
79277e97
PE
3434PERL_STATIC_INLINE PADNAME *
3435Perl_padname_refcnt_inc(PADNAME *pn)
3436{
3437 PadnameREFCNT(pn)++;
3438 return pn;
3439}
3440
ed382232 3441/*
c8028aa6
TC
3442 * ex: set ts=8 sts=4 sw=4 et:
3443 */