This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
PathTools/Cwd.xs: define SYSNAME/SYSNAME_LEN for OS390 only
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
88dfbb19
KW
37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38 * whose details should be exposed to the compiler, for such things as tail
39 * call optimization.
40 *
25468daa
FC
41 * Each section names the header file that the functions "belong" to.
42 */
27669aa4 43
be3a7a5d
KW
44/* ------------------------------- av.h ------------------------------- */
45
87306e06 46/*
3f620621 47=for apidoc_section $AV
87306e06
KW
48=for apidoc av_count
49Returns the number of elements in the array C<av>. This is the true length of
50the array, including any undefined elements. It is always the same as
51S<C<av_top_index(av) + 1>>.
52
53=cut
54*/
55PERL_STATIC_INLINE Size_t
56Perl_av_count(pTHX_ AV *av)
be3a7a5d 57{
87306e06 58 PERL_ARGS_ASSERT_AV_COUNT;
be3a7a5d
KW
59 assert(SvTYPE(av) == SVt_PVAV);
60
87306e06 61 return AvFILL(av) + 1;
be3a7a5d
KW
62}
63
84c75204
RL
64/* ------------------------------- av.c ------------------------------- */
65
66/*
67=for apidoc av_store_simple
68
69This is a cut-down version of av_store that assumes that the array is
70very straightforward - no magic, not readonly, and AvREAL - and that
71C<key> is not negative. This function MUST NOT be used in situations
72where any of those assumptions may not hold.
73
74Stores an SV in an array. The array index is specified as C<key>. It
75can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
76
77Note that the caller is responsible for suitably incrementing the reference
78count of C<val> before the call.
79
80Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81
82=cut
83*/
84
85PERL_STATIC_INLINE SV**
86Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
87{
56077d95
LT
88 SV** ary;
89
84c75204
RL
90 PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91 assert(SvTYPE(av) == SVt_PVAV);
92 assert(!SvMAGICAL(av));
93 assert(!SvREADONLY(av));
94 assert(AvREAL(av));
95 assert(key > -1);
96
56077d95 97 ary = AvARRAY(av);
84c75204
RL
98
99 if (AvFILLp(av) < key) {
100 if (key > AvMAX(av)) {
101 av_extend(av,key);
102 ary = AvARRAY(av);
103 }
104 AvFILLp(av) = key;
105 } else
106 SvREFCNT_dec(ary[key]);
107
108 ary[key] = val;
109 return &ary[key];
110}
111
112/*
113=for apidoc av_fetch_simple
114
115This is a cut-down version of av_fetch that assumes that the array is
116very straightforward - no magic, not readonly, and AvREAL - and that
117C<key> is not negative. This function MUST NOT be used in situations
118where any of those assumptions may not hold.
119
120Returns the SV at the specified index in the array. The C<key> is the
121index. If lval is true, you are guaranteed to get a real SV back (in case
122it wasn't real before), which you can then modify. Check that the return
123value is non-null before dereferencing it to a C<SV*>.
124
125The rough perl equivalent is C<$myarray[$key]>.
126
127=cut
128*/
129
130PERL_STATIC_INLINE SV**
131Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
132{
133 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134 assert(SvTYPE(av) == SVt_PVAV);
135 assert(!SvMAGICAL(av));
136 assert(!SvREADONLY(av));
137 assert(AvREAL(av));
138 assert(key > -1);
139
140 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
8fcb2425 141 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
84c75204
RL
142 } else {
143 return &AvARRAY(av)[key];
144 }
145}
146
a9b64e60
RL
147/*
148=for apidoc av_push_simple
149
150This is a cut-down version of av_push that assumes that the array is very
151straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152not less than -1. This function MUST NOT be used in situations where any
153of those assumptions may not hold.
154
155Pushes an SV (transferring control of one reference count) onto the end of the
156array. The array will grow automatically to accommodate the addition.
157
158Perl equivalent: C<push @myarray, $val;>.
159
160=cut
161*/
162
163PERL_STATIC_INLINE void
164Perl_av_push_simple(pTHX_ AV *av, SV *val)
165{
166 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167 assert(SvTYPE(av) == SVt_PVAV);
168 assert(!SvMAGICAL(av));
169 assert(!SvREADONLY(av));
170 assert(AvREAL(av));
171 assert(AvFILLp(av) > -2);
172
173 (void)av_store_simple(av,AvFILLp(av)+1,val);
174}
175
eae3cc96
RL
176/*
177=for apidoc av_new_alloc
178
179This implements L<perlapi/C<newAV_alloc_x>>
180and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
181functionality.
182
183Creates a new AV and allocates its SV* array.
184
185This is similar to, but more efficient than doing:
186
187 AV *av = newAV();
188 av_extend(av, key);
189
190The size parameter is used to pre-allocate a SV* array large enough to
191hold at least elements C<0..(size-1)>. C<size> must be at least 1.
192
193The C<zeroflag> parameter controls whether or not the array is NULL
194initialized.
195
196=cut
197*/
198
199PERL_STATIC_INLINE AV *
200Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
201{
202 AV * const av = newAV();
203 SV** ary;
204 PERL_ARGS_ASSERT_AV_NEW_ALLOC;
205 assert(size > 0);
206
207 Newx(ary, size, SV*); /* Newx performs the memwrap check */
208 AvALLOC(av) = ary;
209 AvARRAY(av) = ary;
210 AvMAX(av) = size - 1;
211
212 if (zeroflag)
213 Zero(ary, size, SV*);
214
215 return av;
216}
217
218
1afe1db1
FC
219/* ------------------------------- cv.h ------------------------------- */
220
fa3e44c0 221/*
3f620621 222=for apidoc_section $CV
fa3e44c0
KW
223=for apidoc CvGV
224Returns the GV associated with the CV C<sv>, reifying it if necessary.
225
226=cut
227*/
ae77754a 228PERL_STATIC_INLINE GV *
c9182d9c 229Perl_CvGV(pTHX_ CV *sv)
ae77754a 230{
74804ad1
KW
231 PERL_ARGS_ASSERT_CVGV;
232
ae77754a 233 return CvNAMED(sv)
1604cfb0
MS
234 ? Perl_cvgv_from_hek(aTHX_ sv)
235 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
ae77754a
FC
236}
237
c8b3b0ee
KW
238/*
239=for apidoc CvDEPTH
240Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a
241recursive call.
242
243=cut
244*/
1afe1db1 245PERL_STATIC_INLINE I32 *
74804ad1 246Perl_CvDEPTH(const CV * const sv)
1afe1db1 247{
74804ad1 248 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 249 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 250
8de47657 251 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
252}
253
d16269d8
PM
254/*
255 CvPROTO returns the prototype as stored, which is not necessarily what
256 the interpreter should be using. Specifically, the interpreter assumes
257 that spaces have been stripped, which has been the case if the prototype
258 was added by toke.c, but is generally not the case if it was added elsewhere.
259 Since we can't enforce the spacelessness at assignment time, this routine
260 provides a temporary copy at parse time with spaces removed.
261 I<orig> is the start of the original buffer, I<len> is the length of the
262 prototype and will be updated when this returns.
263 */
264
5b67adb8 265#ifdef PERL_CORE
d16269d8
PM
266PERL_STATIC_INLINE char *
267S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
268{
269 SV * tmpsv;
270 char * tmps;
271 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
272 tmps = SvPVX(tmpsv);
273 while ((*len)--) {
1604cfb0
MS
274 if (!isSPACE(*orig))
275 *tmps++ = *orig;
276 orig++;
d16269d8
PM
277 }
278 *tmps = '\0';
279 *len = tmps - SvPVX(tmpsv);
1604cfb0 280 return SvPVX(tmpsv);
d16269d8 281}
5b67adb8 282#endif
d16269d8 283
25fdce4a
FC
284/* ------------------------------- mg.h ------------------------------- */
285
286#if defined(PERL_CORE) || defined(PERL_EXT)
287/* assumes get-magic and stringification have already occurred */
288PERL_STATIC_INLINE STRLEN
289S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
290{
291 assert(mg->mg_type == PERL_MAGIC_regex_global);
292 assert(mg->mg_len != -1);
293 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
1604cfb0 294 return (STRLEN)mg->mg_len;
25fdce4a 295 else {
1604cfb0
MS
296 const STRLEN pos = (STRLEN)mg->mg_len;
297 /* Without this check, we may read past the end of the buffer: */
298 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
299 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
25fdce4a
FC
300 }
301}
302#endif
303
03414f05
FC
304/* ------------------------------- pad.h ------------------------------ */
305
306#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
307PERL_STATIC_INLINE bool
b9d5702c 308S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 309{
b9d5702c
KW
310 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
311
03414f05
FC
312 /* is seq within the range _LOW to _HIGH ?
313 * This is complicated by the fact that PL_cop_seqmax
314 * may have wrapped around at some point */
315 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
1604cfb0 316 return FALSE; /* not yet introduced */
03414f05
FC
317
318 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
319 /* in compiling scope */
1604cfb0
MS
320 if (
321 (seq > COP_SEQ_RANGE_LOW(pn))
322 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
323 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
324 )
325 return TRUE;
03414f05
FC
326 }
327 else if (
1604cfb0
MS
328 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
329 ?
330 ( seq > COP_SEQ_RANGE_LOW(pn)
331 || seq <= COP_SEQ_RANGE_HIGH(pn))
03414f05 332
1604cfb0
MS
333 : ( seq > COP_SEQ_RANGE_LOW(pn)
334 && seq <= COP_SEQ_RANGE_HIGH(pn))
03414f05 335 )
1604cfb0 336 return TRUE;
03414f05
FC
337 return FALSE;
338}
339#endif
340
33a4312b
FC
341/* ------------------------------- pp.h ------------------------------- */
342
343PERL_STATIC_INLINE I32
c9182d9c 344Perl_TOPMARK(pTHX)
33a4312b
FC
345{
346 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
1604cfb0
MS
347 "MARK top %p %" IVdf "\n",
348 PL_markstack_ptr,
349 (IV)*PL_markstack_ptr)));
33a4312b
FC
350 return *PL_markstack_ptr;
351}
352
353PERL_STATIC_INLINE I32
c9182d9c 354Perl_POPMARK(pTHX)
33a4312b
FC
355{
356 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
1604cfb0
MS
357 "MARK pop %p %" IVdf "\n",
358 (PL_markstack_ptr-1),
359 (IV)*(PL_markstack_ptr-1))));
33a4312b
FC
360 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
361 return *PL_markstack_ptr--;
362}
363
8d919b0a
FC
364/* ----------------------------- regexp.h ----------------------------- */
365
9d0d3060
NC
366/* PVLVs need to act as a superset of all scalar types - they are basically
367 * PVMGs with a few extra fields.
368 * REGEXPs are first class scalars, but have many fields that can't be copied
369 * into a PVLV body.
370 *
371 * Hence we take a different approach - instead of a copy, PVLVs store a pointer
372 * back to the original body. To avoid increasing the size of PVLVs just for the
373 * rare case of REGEXP assignment, this pointer is stored in the memory usually
374 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
375 * read the pointer from the two possible locations. The macro SvLEN() wraps the
376 * access to the union's member xpvlenu_len, but there is no equivalent macro
377 * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
378 *
379 * See commit df6b4bd56551f2d3 for more details. */
380
8d919b0a 381PERL_STATIC_INLINE struct regexp *
c9182d9c 382Perl_ReANY(const REGEXP * const re)
8d919b0a 383{
df6b4bd5 384 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
385
386 PERL_ARGS_ASSERT_REANY;
8d919b0a 387 assert(isREGEXP(re));
bdef45de 388
df6b4bd5
DM
389 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
390 : (struct regexp *)p;
8d919b0a
FC
391}
392
a8a2ceaa
KW
393/* ------------------------------- utf8.h ------------------------------- */
394
2fe720e2 395/*
3f620621 396=for apidoc_section $unicode
2fe720e2
KW
397*/
398
55d09dc8 399PERL_STATIC_INLINE void
c9182d9c 400Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
401{
402 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
403 * encoded string at '*dest', updating '*dest' to include it */
404
55d09dc8
KW
405 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
406
6f2d5cbc 407 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 408 *((*dest)++) = byte;
55d09dc8 409 else {
a09ec51a
KW
410 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
411 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
412 }
413}
414
e123187a 415/*
2fe720e2 416=for apidoc valid_utf8_to_uvchr
09232555
KW
417Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
418known that the next character in the input UTF-8 string C<s> is well-formed
419(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
420points, and non-Unicode code points are allowed.
2fe720e2
KW
421
422=cut
423
424 */
425
426PERL_STATIC_INLINE UV
427Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
428{
c41b2540 429 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
430 const U8* send = s + expectlen;
431 UV uv = *s;
432
433 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
434
435 if (retlen) {
436 *retlen = expectlen;
437 }
438
439 /* An invariant is trivially returned */
440 if (expectlen == 1) {
1604cfb0 441 return uv;
2fe720e2
KW
442 }
443
444 /* Remove the leading bits that indicate the number of bytes, leaving just
445 * the bits that are part of the value */
446 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
447
448 /* Now, loop through the remaining bytes, accumulating each into the
449 * working total as we go. (I khw tried unrolling the loop for up to 4
450 * bytes, but there was no performance improvement) */
451 for (++s; s < send; s++) {
452 uv = UTF8_ACCUMULATE(uv, *s);
453 }
454
455 return UNI_TO_NATIVE(uv);
456
457}
458
1e599354
KW
459/*
460=for apidoc is_utf8_invariant_string
461
82c5d941 462Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 463regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
464EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
465are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
466the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
467characters are invariant, but so also are the C1 controls.
1e599354
KW
468
469If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
470use this option, that C<s> can't have embedded C<NUL> characters and has to
471have a terminating C<NUL> byte).
472
9f2abfde
KW
473See also
474C<L</is_utf8_string>>,
475C<L</is_utf8_string_flags>>,
476C<L</is_utf8_string_loc>>,
477C<L</is_utf8_string_loc_flags>>,
478C<L</is_utf8_string_loclen>>,
479C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
480C<L</is_utf8_fixed_width_buf_flags>>,
481C<L</is_utf8_fixed_width_buf_loc_flags>>,
482C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
483C<L</is_strict_utf8_string>>,
484C<L</is_strict_utf8_string_loc>>,
485C<L</is_strict_utf8_string_loclen>>,
486C<L</is_c9strict_utf8_string>>,
487C<L</is_c9strict_utf8_string_loc>>,
488and
489C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
490
491=cut
0cbf5865
KW
492
493*/
494
495#define is_utf8_invariant_string(s, len) \
496 is_utf8_invariant_string_loc(s, len, NULL)
497
498/*
499=for apidoc is_utf8_invariant_string_loc
500
501Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
502the first UTF-8 variant character in the C<ep> pointer; if all characters are
503UTF-8 invariant, this function does not change the contents of C<*ep>.
504
505=cut
506
1e599354
KW
507*/
508
509PERL_STATIC_INLINE bool
c9182d9c 510Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 511{
e17544a6 512 const U8* send;
1e599354
KW
513 const U8* x = s;
514
0cbf5865
KW
515 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
516
e17544a6
KW
517 if (len == 0) {
518 len = strlen((const char *)s);
519 }
520
521 send = s + len;
522
4ab2fd9b 523/* This looks like 0x010101... */
2c5c8af5 524# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
525
526/* This looks like 0x808080... */
2c5c8af5 527# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
9f4248c9 528# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 529# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 530
099e59a4
KW
531/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
532 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
533 * optimized out completely on a 32-bit system, and its mask gets optimized out
534 * on a 64-bit system */
2c5c8af5 535# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
536 | ( PTR2nat(x) >> 1) \
537 | ( ( (PTR2nat(x) \
538 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 539
3f515a2e
KW
540#ifndef EBCDIC
541
099e59a4
KW
542 /* Do the word-at-a-time iff there is at least one usable full word. That
543 * means that after advancing to a word boundary, there still is at least a
544 * full word left. The number of bytes needed to advance is 'wordsize -
545 * offset' unless offset is 0. */
546 if ((STRLEN) (send - x) >= PERL_WORDSIZE
547
548 /* This term is wordsize if subword; 0 if not */
549 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
550
551 /* 'offset' */
552 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
553 {
b40579ff 554
46bb68f6
KW
555 /* Process per-byte until reach word boundary. XXX This loop could be
556 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 557 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
558 if (! UTF8_IS_INVARIANT(*x)) {
559 if (ep) {
560 *ep = x;
561 }
e17544a6 562
46bb68f6
KW
563 return FALSE;
564 }
565 x++;
e17544a6 566 }
e17544a6 567
099e59a4
KW
568 /* Here, we know we have at least one full word to process. Process
569 * per-word as long as we have at least a full word left */
570 do {
4ab2fd9b 571 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 572
46bb68f6
KW
573 /* Found a variant. Just return if caller doesn't want its
574 * exact position */
575 if (! ep) {
576 return FALSE;
577 }
e17544a6 578
2c5c8af5
KW
579# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
580 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 581
73f0a2eb 582 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
583 assert(*ep >= s && *ep < send);
584
585 return FALSE;
586
2c5c8af5 587# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
588 checks. */
589
46bb68f6 590 break;
2c5c8af5 591# endif
46bb68f6 592 }
1d2af574 593
46bb68f6 594 x += PERL_WORDSIZE;
1d2af574 595
099e59a4 596 } while (x + PERL_WORDSIZE <= send);
b40579ff 597 }
e17544a6 598
0b08cab0 599#endif /* End of ! EBCDIC */
e17544a6
KW
600
601 /* Process per-byte */
602 while (x < send) {
1604cfb0 603 if (! UTF8_IS_INVARIANT(*x)) {
e17544a6
KW
604 if (ep) {
605 *ep = x;
606 }
0cbf5865 607
e17544a6 608 return FALSE;
0cbf5865 609 }
1e599354 610
e17544a6 611 x++;
1e599354
KW
612 }
613
614 return TRUE;
615}
bf874180 616
fc1bb663
KW
617/* See if the platform has builtins for finding the most/least significant bit,
618 * and which one is right for using on 32 and 64 bit operands */
619#if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
620# if U32SIZE == INTSIZE
621# define PERL_CLZ_32 __builtin_clz
622# endif
623# if defined(U64TYPE) && U64SIZE == INTSIZE
624# define PERL_CLZ_64 __builtin_clz
625# endif
626#endif
627#if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
628# if U32SIZE == INTSIZE
629# define PERL_CTZ_32 __builtin_ctz
630# endif
631# if defined(U64TYPE) && U64SIZE == INTSIZE
632# define PERL_CTZ_64 __builtin_ctz
633# endif
634#endif
635
636#if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
637# if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
638# define PERL_CLZ_32 __builtin_clzl
639# endif
640# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
641# define PERL_CLZ_64 __builtin_clzl
642# endif
643#endif
644#if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
645# if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
646# define PERL_CTZ_32 __builtin_ctzl
647# endif
648# if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
649# define PERL_CTZ_64 __builtin_ctzl
650# endif
651#endif
652
653#if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
654# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
655# define PERL_CLZ_32 __builtin_clzll
656# endif
657# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
658# define PERL_CLZ_64 __builtin_clzll
659# endif
660#endif
661#if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
662# if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
663# define PERL_CTZ_32 __builtin_ctzll
664# endif
665# if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
666# define PERL_CTZ_64 __builtin_ctzll
667# endif
668#endif
669
e1f6bdff 670#if defined(_MSC_VER)
e88dde50
KW
671# include <intrin.h>
672# pragma intrinsic(_BitScanForward)
673# pragma intrinsic(_BitScanReverse)
674# ifdef _WIN64
675# pragma intrinsic(_BitScanForward64)
676# pragma intrinsic(_BitScanReverse64)
677# endif
678#endif
679
250e5324
KW
680/* The reason there are not checks to see if ffs() and ffsl() are available for
681 * determining the lsb, is because these don't improve on the deBruijn method
682 * fallback, which is just a branchless integer multiply, array element
683 * retrieval, and shift. The others, even if the function call overhead is
684 * optimized out, have to cope with the possibility of the input being all
685 * zeroes, and almost certainly will have conditionals for this eventuality.
686 * khw, at the time of this commit, looked at the source for both gcc and clang
687 * to verify this. (gcc used a method inferior to deBruijn.) */
688
330cd0ce 689/* Below are functions to find the first, last, or only set bit in a word. On
19d2c525
KW
690 * platforms with 64-bit capability, there is a pair for each operation; the
691 * first taking a 64 bit operand, and the second a 32 bit one. The logic is
692 * the same in each pair, so the second is stripped of most comments. */
693
694#ifdef U64TYPE /* HAS_QUAD not usable outside the core */
695
696PERL_STATIC_INLINE unsigned
697Perl_lsbit_pos64(U64 word)
698{
699 /* Find the position (0..63) of the least significant set bit in the input
700 * word */
701
702 ASSUME(word != 0);
703
fc1bb663
KW
704 /* If we can determine that the platform has a usable fast method to get
705 * this info, use that */
706
707# if defined(PERL_CTZ_64)
2e0bc9ce 708# define PERL_HAS_FAST_GET_LSB_POS64
fc1bb663
KW
709
710 return (unsigned) PERL_CTZ_64(word);
711
a333292f 712# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
2e0bc9ce 713# define PERL_HAS_FAST_GET_LSB_POS64
e88dde50
KW
714
715 {
716 unsigned long index;
717 _BitScanForward64(&index, word);
718 return (unsigned)index;
719 }
720
fc1bb663
KW
721# else
722
723 /* Here, we didn't find a fast method for finding the lsb. Fall back to
724 * making the lsb the only set bit in the word, and use our function that
725 * works on words with a single bit set.
726 *
727 * Isolate the lsb;
19d2c525
KW
728 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
729 *
730 * The word will look like this, with a rightmost set bit in position 's':
731 * ('x's are don't cares, and 'y's are their complements)
732 * s
733 * x..x100..00
734 * y..y011..11 Complement
735 * y..y100..00 Add 1
736 * 0..0100..00 And with the original
737 *
738 * (Yes, complementing and adding 1 is just taking the negative on 2's
739 * complement machines, but not on 1's complement ones, and some compilers
740 * complain about negating an unsigned.)
741 */
742 return single_1bit_pos64(word & (~word + 1));
fc1bb663
KW
743
744# endif
745
19d2c525
KW
746}
747
748# define lsbit_pos_uintmax_(word) lsbit_pos64(word)
749#else /* ! QUAD */
750# define lsbit_pos_uintmax_(word) lsbit_pos32(word)
751#endif
752
753PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */
754Perl_lsbit_pos32(U32 word)
755{
756 /* Find the position (0..31) of the least significant set bit in the input
757 * word */
758
759 ASSUME(word != 0);
760
fc1bb663 761#if defined(PERL_CTZ_32)
2e0bc9ce 762# define PERL_HAS_FAST_GET_LSB_POS32
fc1bb663
KW
763
764 return (unsigned) PERL_CTZ_32(word);
765
e1f6bdff 766#elif U32SIZE == 4 && defined(_MSC_VER)
2e0bc9ce 767# define PERL_HAS_FAST_GET_LSB_POS32
e88dde50
KW
768
769 {
770 unsigned long index;
771 _BitScanForward(&index, word);
772 return (unsigned)index;
773 }
774
fc1bb663
KW
775#else
776
19d2c525 777 return single_1bit_pos32(word & (~word + 1));
fc1bb663
KW
778
779#endif
780
19d2c525
KW
781}
782
4a1b7bb2 783
fc1bb663 784/* Convert the leading zeros count to the bit position of the first set bit.
4a1b7bb2
KW
785 * This just subtracts from the highest position, 31 or 63. But some compilers
786 * don't optimize this optimally, and so a bit of bit twiddling encourages them
787 * to do the right thing. It turns out that subtracting a smaller non-negative
788 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
789 * the two numbers. To see why, first note that the sum of any number, x, and
790 * its complement, x', is all ones. So all ones minus x is x'. Then note that
791 * the xor of x and all ones is x'. */
792#define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc))
fc1bb663 793
995a4954
KW
794#ifdef U64TYPE /* HAS_QUAD not usable outside the core */
795
796PERL_STATIC_INLINE unsigned
330cd0ce
KW
797Perl_msbit_pos64(U64 word)
798{
799 /* Find the position (0..63) of the most significant set bit in the input
800 * word */
801
802 ASSUME(word != 0);
803
fc1bb663
KW
804 /* If we can determine that the platform has a usable fast method to get
805 * this, use that */
806
807# if defined(PERL_CLZ_64)
2e0bc9ce 808# define PERL_HAS_FAST_GET_MSB_POS64
fc1bb663
KW
809
810 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
811
e1f6bdff 812# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
2e0bc9ce 813# define PERL_HAS_FAST_GET_MSB_POS64
e88dde50
KW
814
815 {
816 unsigned long index;
817 _BitScanReverse64(&index, word);
818 return (unsigned)index;
819 }
820
fc1bb663
KW
821# else
822
823 /* Here, we didn't find a fast method for finding the msb. Fall back to
824 * making the msb the only set bit in the word, and use our function that
825 * works on words with a single bit set.
826 *
827 * Isolate the msb; http://codeforces.com/blog/entry/10330
330cd0ce
KW
828 *
829 * Only the most significant set bit matters. Or'ing word with its right
830 * shift of 1 makes that bit and the next one to its right both 1.
831 * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
832 * ... We end with the msb and all to the right being 1. */
833 word |= (word >> 1);
834 word |= (word >> 2);
835 word |= (word >> 4);
836 word |= (word >> 8);
837 word |= (word >> 16);
838 word |= (word >> 32);
839
840 /* Then subtracting the right shift by 1 clears all but the left-most of
841 * the 1 bits, which is our desired result */
842 word -= (word >> 1);
843
844 /* Now we have a single bit set */
845 return single_1bit_pos64(word);
fc1bb663
KW
846
847# endif
848
330cd0ce
KW
849}
850
851# define msbit_pos_uintmax_(word) msbit_pos64(word)
852#else /* ! QUAD */
853# define msbit_pos_uintmax_(word) msbit_pos32(word)
854#endif
855
856PERL_STATIC_INLINE unsigned
857Perl_msbit_pos32(U32 word)
858{
859 /* Find the position (0..31) of the most significant set bit in the input
860 * word */
861
862 ASSUME(word != 0);
863
fc1bb663 864#if defined(PERL_CLZ_32)
2e0bc9ce 865# define PERL_HAS_FAST_GET_MSB_POS32
fc1bb663
KW
866
867 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
868
e1f6bdff 869#elif U32SIZE == 4 && defined(_MSC_VER)
2e0bc9ce 870# define PERL_HAS_FAST_GET_MSB_POS32
e88dde50
KW
871
872 {
873 unsigned long index;
874 _BitScanReverse(&index, word);
875 return (unsigned)index;
876 }
877
fc1bb663
KW
878#else
879
330cd0ce
KW
880 word |= (word >> 1);
881 word |= (word >> 2);
882 word |= (word >> 4);
883 word |= (word >> 8);
884 word |= (word >> 16);
885 word -= (word >> 1);
886 return single_1bit_pos32(word);
fc1bb663
KW
887
888#endif
889
330cd0ce
KW
890}
891
787e8384
KW
892#if UVSIZE == U64SIZE
893# define msbit_pos(word) msbit_pos64(word)
894# define lsbit_pos(word) lsbit_pos64(word)
895#elif UVSIZE == U32SIZE
896# define msbit_pos(word) msbit_pos32(word)
897# define lsbit_pos(word) lsbit_pos32(word)
898#endif
899
330cd0ce
KW
900#ifdef U64TYPE /* HAS_QUAD not usable outside the core */
901
902PERL_STATIC_INLINE unsigned
995a4954
KW
903Perl_single_1bit_pos64(U64 word)
904{
905 /* Given a 64-bit word known to contain all zero bits except one 1 bit,
906 * find and return the 1's position: 0..63 */
907
908# ifdef PERL_CORE /* macro not exported */
909 ASSUME(isPOWER_OF_2(word));
910# else
911 ASSUME(word && (word & (word-1)) == 0);
912# endif
913
2e0bc9ce
KW
914 /* The only set bit is both the most and least significant bit. If we have
915 * a fast way of finding either one, use that.
916 *
917 * It may appear at first glance that those functions call this one, but
918 * they don't if the corresponding #define is set */
919
920# ifdef PERL_HAS_FAST_GET_MSB_POS64
921
922 return msbit_pos64(word);
923
924# elif defined(PERL_HAS_FAST_GET_LSB_POS64)
925
926 return lsbit_pos64(word);
927
928# else
929
995a4954
KW
930 /* The position of the only set bit in a word can be quickly calculated
931 * using deBruijn sequences. See for example
932 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
933 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
934 >> PERL_deBruijnShift64_];
2e0bc9ce
KW
935# endif
936
995a4954
KW
937}
938
939#endif
940
bf874180
KW
941PERL_STATIC_INLINE unsigned
942Perl_single_1bit_pos32(U32 word)
943{
944 /* Given a 32-bit word known to contain all zero bits except one 1 bit,
945 * find and return the 1's position: 0..31 */
946
947#ifdef PERL_CORE /* macro not exported */
948 ASSUME(isPOWER_OF_2(word));
949#else
950 ASSUME(word && (word & (word-1)) == 0);
951#endif
2e0bc9ce
KW
952#ifdef PERL_HAS_FAST_GET_MSB_POS32
953
954 return msbit_pos32(word);
955
956#elif defined(PERL_HAS_FAST_GET_LSB_POS32)
957
958 return lsbit_pos32(word);
959
960/* Unlikely, but possible for the platform to have a wider fast operation but
961 * not a narrower one. But easy enough to handle the case by widening the
962 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops
963 * would be slower than the deBruijn method.) */
964#elif defined(PERL_HAS_FAST_GET_MSB_POS64)
965
966 return msbit_pos64(word);
967
968#elif defined(PERL_HAS_FAST_GET_LSB_POS64)
969
970 return lsbit_pos64(word);
971
972#else
bf874180 973
bf874180
KW
974 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
975 >> PERL_deBruijnShift32_];
2e0bc9ce
KW
976#endif
977
bf874180 978}
1e599354 979
23a7ee81
KW
980#ifndef EBCDIC
981
1d2af574 982PERL_STATIC_INLINE unsigned int
73f0a2eb 983Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574 984{
1d2af574
KW
985 /* This returns the position in a word (0..7) of the first variant byte in
986 * it. This is a helper function. Note that there are no branches */
987
1d2af574
KW
988 /* Get just the msb bits of each byte */
989 word &= PERL_VARIANTS_WORD_MASK;
990
58ddb8c5
KW
991 /* This should only be called if we know there is a variant byte in the
992 * word */
993 assert(word);
994
7adf2470 995# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
996
997 /* Bytes are stored like
998 * Byte8 ... Byte2 Byte1
999 * 63..56...15...8 7...0
19d2c525
KW
1000 * so getting the lsb of the whole modified word is getting the msb of the
1001 * first byte that has its msb set */
1002 word = lsbit_pos_uintmax_(word);
1003
1004 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert
1005 * to 0..7 */
1006 return (unsigned int) ((word + 1) >> 3) - 1;
1d2af574
KW
1007
1008# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1009
1010 /* Bytes are stored like
1011 * Byte1 Byte2 ... Byte8
1012 * 63..56 55..47 ... 7...0
330cd0ce
KW
1013 * so getting the msb of the whole modified word is getting the msb of the
1014 * first byte that has its msb set */
1015 word = msbit_pos_uintmax_(word);
1d2af574 1016
330cd0ce
KW
1017 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert
1018 * to 0..7 */
1d2af574
KW
1019 word = ((word + 1) >> 3) - 1;
1020
330cd0ce
KW
1021 /* And invert the result because of the reversed byte order on this
1022 * platform */
1d2af574
KW
1023 word = CHARBITS - word - 1;
1024
330cd0ce
KW
1025 return (unsigned int) word;
1026
1027# else
1028# error Unexpected byte order
1d2af574
KW
1029# endif
1030
1d2af574
KW
1031}
1032
23a7ee81 1033#endif
03c1e4ab
KW
1034#if defined(PERL_CORE) || defined(PERL_EXT)
1035
1036/*
1037=for apidoc variant_under_utf8_count
1038
1039This function looks at the sequence of bytes between C<s> and C<e>, which are
1040assumed to be encoded in ASCII/Latin1, and returns how many of them would
1041change should the string be translated into UTF-8. Due to the nature of UTF-8,
1042each of these would occupy two bytes instead of the single one in the input
1043string. Thus, this function returns the precise number of bytes the string
1044would expand by when translated to UTF-8.
1045
1046Unlike most of the other functions that have C<utf8> in their name, the input
1047to this function is NOT a UTF-8-encoded string. The function name is slightly
1048I<odd> to emphasize this.
1049
1050This function is internal to Perl because khw thinks that any XS code that
1051would want this is probably operating too close to the internals. Presenting a
1052valid use case could change that.
1053
1054See also
1055C<L<perlapi/is_utf8_invariant_string>>
1056and
1057C<L<perlapi/is_utf8_invariant_string_loc>>,
1058
1059=cut
1060
1061*/
1062
1063PERL_STATIC_INLINE Size_t
1064S_variant_under_utf8_count(const U8* const s, const U8* const e)
1065{
1066 const U8* x = s;
1067 Size_t count = 0;
1068
1069 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1070
1071# ifndef EBCDIC
1072
5d0379de
KW
1073 /* Test if the string is long enough to use word-at-a-time. (Logic is the
1074 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
1075 if ((STRLEN) (e - x) >= PERL_WORDSIZE
1076 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1077 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1078 {
1079
1080 /* Process per-byte until reach word boundary. XXX This loop could be
1081 * eliminated if we knew that this platform had fast unaligned reads */
1082 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1083 count += ! UTF8_IS_INVARIANT(*x++);
1084 }
1085
1086 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
1087 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1088 explanation of how this works */
e5863284
KW
1089 PERL_UINTMAX_T increment
1090 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
1091 * PERL_COUNT_MULTIPLIER)
1092 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 1093 count += (Size_t) increment;
03c1e4ab
KW
1094 x += PERL_WORDSIZE;
1095 } while (x + PERL_WORDSIZE <= e);
1096 }
1097
1098# endif
1099
1100 /* Process per-byte */
1101 while (x < e) {
1604cfb0 1102 if (! UTF8_IS_INVARIANT(*x)) {
03c1e4ab
KW
1103 count++;
1104 }
1105
1106 x++;
1107 }
1108
1109 return count;
1110}
1111
1112#endif
1113
9f4248c9
KW
1114#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
1115# undef PERL_WORDSIZE
1116# undef PERL_COUNT_MULTIPLIER
1117# undef PERL_WORD_BOUNDARY_MASK
1118# undef PERL_VARIANTS_WORD_MASK
1119#endif
1120
7c93d8f0 1121/*
5ff889fb
KW
1122=for apidoc is_utf8_string
1123
82c5d941
KW
1124Returns TRUE if the first C<len> bytes of string C<s> form a valid
1125Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
1126be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1127can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1128byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1129
2717076a
KW
1130This function considers Perl's extended UTF-8 to be valid. That means that
1131code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
1132considered valid by this function. Use C<L</is_strict_utf8_string>>,
1133C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1134code points are considered valid.
5ff889fb 1135
9f2abfde
KW
1136See also
1137C<L</is_utf8_invariant_string>>,
0cbf5865 1138C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1139C<L</is_utf8_string_loc>>,
1140C<L</is_utf8_string_loclen>>,
8bc127bf
KW
1141C<L</is_utf8_fixed_width_buf_flags>>,
1142C<L</is_utf8_fixed_width_buf_loc_flags>>,
1143C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
1144
1145=cut
1146*/
1147
dd237e82 1148#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 1149
c9cd936b
KW
1150#if defined(PERL_CORE) || defined (PERL_EXT)
1151
1152/*
1153=for apidoc is_utf8_non_invariant_string
1154
1155Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1156C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1157UTF-8; otherwise returns FALSE.
1158
1159A TRUE return means that at least one code point represented by the sequence
1160either is a wide character not representable as a single byte, or the
1161representation differs depending on whether the sequence is encoded in UTF-8 or
1162not.
1163
1164See also
1165C<L<perlapi/is_utf8_invariant_string>>,
1166C<L<perlapi/is_utf8_string>>
1167
1168=cut
1169
1170This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
1171It generally needn't be if its string is entirely UTF-8 invariant, and it
1172shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
1173
1174It is an internal function because khw thinks that XS code shouldn't be working
1175at this low a level. A valid use case could change that.
1176
1177*/
1178
1179PERL_STATIC_INLINE bool
86a87e17 1180Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
1181{
1182 const U8 * first_variant;
1183
1184 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1185
1186 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1187 return FALSE;
1188 }
1189
1190 return is_utf8_string(first_variant, len - (first_variant - s));
1191}
1192
1193#endif
1194
5ff889fb 1195/*
9f2abfde
KW
1196=for apidoc is_strict_utf8_string
1197
1198Returns TRUE if the first C<len> bytes of string C<s> form a valid
1199UTF-8-encoded string that is fully interchangeable by any application using
1200Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
1201calculated using C<strlen(s)> (which means if you use this option, that C<s>
1202can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1203byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1204
1205This function returns FALSE for strings containing any
1206code points above the Unicode max of 0x10FFFF, surrogate code points, or
1207non-character code points.
1208
1209See also
1210C<L</is_utf8_invariant_string>>,
0cbf5865 1211C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1212C<L</is_utf8_string>>,
1213C<L</is_utf8_string_flags>>,
1214C<L</is_utf8_string_loc>>,
1215C<L</is_utf8_string_loc_flags>>,
1216C<L</is_utf8_string_loclen>>,
1217C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1218C<L</is_utf8_fixed_width_buf_flags>>,
1219C<L</is_utf8_fixed_width_buf_loc_flags>>,
1220C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1221C<L</is_strict_utf8_string_loc>>,
1222C<L</is_strict_utf8_string_loclen>>,
1223C<L</is_c9strict_utf8_string>>,
1224C<L</is_c9strict_utf8_string_loc>>,
1225and
1226C<L</is_c9strict_utf8_string_loclen>>.
1227
1228=cut
1229*/
1230
dd237e82 1231#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
1232
1233/*
1234=for apidoc is_c9strict_utf8_string
1235
1236Returns TRUE if the first C<len> bytes of string C<s> form a valid
1237UTF-8-encoded string that conforms to
1238L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1239otherwise it returns FALSE. If C<len> is 0, it will be calculated using
1240C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1241C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
1242characters being ASCII constitute 'a valid UTF-8 string'.
1243
1244This function returns FALSE for strings containing any code points above the
1245Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1246code points per
1247L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1248
1249See also
1250C<L</is_utf8_invariant_string>>,
0cbf5865 1251C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1252C<L</is_utf8_string>>,
1253C<L</is_utf8_string_flags>>,
1254C<L</is_utf8_string_loc>>,
1255C<L</is_utf8_string_loc_flags>>,
1256C<L</is_utf8_string_loclen>>,
1257C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1258C<L</is_utf8_fixed_width_buf_flags>>,
1259C<L</is_utf8_fixed_width_buf_loc_flags>>,
1260C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1261C<L</is_strict_utf8_string>>,
1262C<L</is_strict_utf8_string_loc>>,
1263C<L</is_strict_utf8_string_loclen>>,
1264C<L</is_c9strict_utf8_string_loc>>,
1265and
1266C<L</is_c9strict_utf8_string_loclen>>.
1267
1268=cut
1269*/
1270
dd237e82 1271#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
1272
1273/*
1274=for apidoc is_utf8_string_flags
1275
1276Returns TRUE if the first C<len> bytes of string C<s> form a valid
1277UTF-8 string, subject to the restrictions imposed by C<flags>;
1278returns FALSE otherwise. If C<len> is 0, it will be calculated
1279using C<strlen(s)> (which means if you use this option, that C<s> can't have
1280embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
1281that all characters being ASCII constitute 'a valid UTF-8 string'.
1282
1283If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1284C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1285as C<L</is_strict_utf8_string>>; and if C<flags> is
1286C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1287C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
1288combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1289C<L</utf8n_to_uvchr>>, with the same meanings.
1290
1291See also
1292C<L</is_utf8_invariant_string>>,
0cbf5865 1293C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
1294C<L</is_utf8_string>>,
1295C<L</is_utf8_string_loc>>,
1296C<L</is_utf8_string_loc_flags>>,
1297C<L</is_utf8_string_loclen>>,
1298C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
1299C<L</is_utf8_fixed_width_buf_flags>>,
1300C<L</is_utf8_fixed_width_buf_loc_flags>>,
1301C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
1302C<L</is_strict_utf8_string>>,
1303C<L</is_strict_utf8_string_loc>>,
1304C<L</is_strict_utf8_string_loclen>>,
1305C<L</is_c9strict_utf8_string>>,
1306C<L</is_c9strict_utf8_string_loc>>,
1307and
1308C<L</is_c9strict_utf8_string_loclen>>.
1309
1310=cut
1311*/
1312
1313PERL_STATIC_INLINE bool
c9182d9c 1314Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 1315{
33756530 1316 const U8 * first_variant;
9f2abfde
KW
1317
1318 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1319 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1320 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1321
f60f61fd
KW
1322 if (len == 0) {
1323 len = strlen((const char *)s);
1324 }
1325
9f2abfde
KW
1326 if (flags == 0) {
1327 return is_utf8_string(s, len);
1328 }
1329
d044b7a7 1330 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1331 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1332 {
1333 return is_strict_utf8_string(s, len);
1334 }
1335
d044b7a7 1336 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1337 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1338 {
1339 return is_c9strict_utf8_string(s, len);
1340 }
1341
33756530
KW
1342 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1343 const U8* const send = s + len;
1344 const U8* x = first_variant;
1345
a0d7f935
KW
1346 while (x < send) {
1347 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1348 if (UNLIKELY(! cur_len)) {
1349 return FALSE;
1350 }
1351 x += cur_len;
9f2abfde 1352 }
33756530 1353 }
9f2abfde
KW
1354
1355 return TRUE;
1356}
1357
1358/*
5ff889fb
KW
1359
1360=for apidoc is_utf8_string_loc
1361
2717076a 1362Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 1363case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 1364"utf8ness success") in the C<ep> pointer.
5ff889fb 1365
2717076a 1366See also C<L</is_utf8_string_loclen>>.
5ff889fb 1367
3964c812
KW
1368=cut
1369*/
1370
1371#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
1372
1373/*
1374
5ff889fb
KW
1375=for apidoc is_utf8_string_loclen
1376
2717076a 1377Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 1378case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 1379"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 1380encoded characters in the C<el> pointer.
5ff889fb 1381
2717076a 1382See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
1383
1384=cut
1385*/
1386
56e4cf64 1387PERL_STATIC_INLINE bool
33756530 1388Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 1389{
33756530 1390 const U8 * first_variant;
5ff889fb
KW
1391
1392 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1393
33756530
KW
1394 if (len == 0) {
1395 len = strlen((const char *) s);
1396 }
1397
1398 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1399 if (el)
1400 *el = len;
1401
1402 if (ep) {
1403 *ep = s + len;
1404 }
1405
1406 return TRUE;
1407 }
1408
1409 {
1410 const U8* const send = s + len;
1411 const U8* x = first_variant;
1412 STRLEN outlen = first_variant - s;
1413
a0d7f935
KW
1414 while (x < send) {
1415 const STRLEN cur_len = isUTF8_CHAR(x, send);
1416 if (UNLIKELY(! cur_len)) {
1417 break;
1418 }
1419 x += cur_len;
1420 outlen++;
5ff889fb 1421 }
5ff889fb 1422
a0d7f935
KW
1423 if (el)
1424 *el = outlen;
5ff889fb 1425
a0d7f935
KW
1426 if (ep) {
1427 *ep = x;
1428 }
5ff889fb 1429
a0d7f935 1430 return (x == send);
33756530 1431 }
5ff889fb
KW
1432}
1433
213dc9d1
KW
1434/* The perl core arranges to never call the DFA below without there being at
1435 * least one byte available to look at. This allows the DFA to use a do {}
1436 * while loop which means that calling it with a UTF-8 invariant has a single
1437 * conditional, same as the calling code checking for invariance ahead of time.
1438 * And having the calling code remove that conditional speeds up by that
1439 * conditional, the case where it wasn't invariant. So there's no reason to
1440 * check before caling this.
1441 *
1442 * But we don't know this for non-core calls, so have to retain the check for
1443 * them. */
1444#ifdef PERL_CORE
1445# define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s))
1446#else
1447# define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE
1448#endif
1449
5ff889fb 1450/*
50f7a4ce
KW
1451 * DFA for checking input is valid UTF-8 syntax.
1452 *
1453 * This uses adaptations of the table and algorithm given in
1454 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1455 * documentation of the original version. A copyright notice for the original
1456 * version is given at the beginning of this file. The Perl adapations are
1457 * documented at the definition of PL_extended_utf8_dfa_tab[].
1458 *
1459 * This dfa is fast. There are three exit conditions:
1460 * 1) a well-formed code point, acceptable to the table
1461 * 2) the beginning bytes of an incomplete character, whose completion might
1462 * or might not be acceptable
1463 * 3) unacceptable to the table. Some of the adaptations have certain,
1464 * hopefully less likely to occur, legal inputs be unacceptable to the
1465 * table, so these must be sorted out afterwards.
1466 *
1467 * This macro is a complete implementation of the code executing the DFA. It
1468 * is passed the input sequence bounds and the table to use, and what to do
1469 * for each of the exit conditions. There are three canned actions, likely to
1470 * be the ones you want:
1471 * DFA_RETURN_SUCCESS_
1472 * DFA_RETURN_FAILURE_
1473 * DFA_GOTO_TEASE_APART_FF_
1474 *
1475 * You pass a parameter giving the action to take for each of the three
1476 * possible exit conditions:
1477 *
1478 * 'accept_action' This is executed when the DFA accepts the input.
1479 * DFA_RETURN_SUCCESS_ is the most likely candidate.
1480 * 'reject_action' This is executed when the DFA rejects the input.
1481 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1482 * you have written code to distinguish the rejecting state
1483 * results. Because it happens in several places, and
1484 * involves #ifdefs, the special action
1485 * DFA_GOTO_TEASE_APART_FF_ is what you want with
1486 * PL_extended_utf8_dfa_tab. On platforms without
1487 * EXTRA_LONG_UTF8, there is no need to tease anything apart,
1488 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1489 * need to have a label 'tease_apart_FF' that it will transfer
1490 * to.
1491 * 'incomplete_char_action' This is executed when the DFA ran off the end
1492 * before accepting or rejecting the input.
1493 * DFA_RETURN_FAILURE_ is the likely action, but you could
1494 * have a 'goto', or NOOP. In the latter case the DFA drops
1495 * off the end, and you place your code to handle this case
1496 * immediately after it.
1497 */
1498
1499#define DFA_RETURN_SUCCESS_ return s - s0
1500#define DFA_RETURN_FAILURE_ return 0
1501#ifdef HAS_EXTRA_LONG_UTF8
1502# define DFA_TEASE_APART_FF_ goto tease_apart_FF
1503#else
1504# define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_
1505#endif
1506
1507#define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \
1508 accept_action, \
1509 reject_action, \
1510 incomplete_char_action) \
1511 STMT_START { \
1512 const U8 * s = s0; \
5da55c47 1513 const U8 * e_ = e; \
50f7a4ce
KW
1514 UV state = 0; \
1515 \
5da55c47 1516 PERL_NON_CORE_CHECK_EMPTY(s, e_); \
213dc9d1
KW
1517 \
1518 do { \
50f7a4ce
KW
1519 state = dfa_tab[256 + state + dfa_tab[*s]]; \
1520 s++; \
1521 \
1522 if (state == 0) { /* Accepting state */ \
1523 accept_action; \
1524 } \
1525 \
1526 if (UNLIKELY(state == 1)) { /* Rejecting state */ \
1527 reject_action; \
1528 } \
5da55c47 1529 } while (s < e_); \
50f7a4ce
KW
1530 \
1531 /* Here, dropped out of loop before end-of-char */ \
1532 incomplete_char_action; \
1533 } STMT_END
1534
1535
1536/*
9f2abfde 1537
44170c9a 1538=for apidoc isUTF8_CHAR
8ed185f9
KW
1539
1540Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1541looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1542that represents some code point; otherwise it evaluates to 0. If non-zero, the
1543value gives how many bytes starting at C<s> comprise the code point's
1544representation. Any bytes remaining before C<e>, but beyond the ones needed to
1545form the first code point in C<s>, are not examined.
1546
13aab5dd 1547The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1548extension to official UTF-8 to represent those higher than the Unicode maximum
1549of 0x10FFFF. That means that this macro is used to efficiently decide if the
1550next few bytes in C<s> is legal UTF-8 for a single character.
1551
1552Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1553defined by Unicode to be fully interchangeable across applications;
1554C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1555#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1556code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1557
1558Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1559C<L</is_utf8_string_loclen>> to check entire strings.
1560
13aab5dd
KW
1561Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1562machines) is a valid UTF-8 character.
8ed185f9
KW
1563
1564=cut
1565
1566This uses an adaptation of the table and algorithm given in
f6521f7c 1567https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1568documentation of the original version. A copyright notice for the original
1569version is given at the beginning of this file. The Perl adapation is
71525f77 1570documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1571*/
1572
1573PERL_STATIC_INLINE Size_t
c9182d9c 1574Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9 1575{
8ed185f9
KW
1576 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1577
50f7a4ce
KW
1578 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1579 DFA_RETURN_SUCCESS_,
1580 DFA_TEASE_APART_FF_,
1581 DFA_RETURN_FAILURE_);
8ed185f9 1582
50f7a4ce
KW
1583 /* Here, we didn't return success, but dropped out of the loop. In the
1584 * case of PL_extended_utf8_dfa_tab, this means the input is either
1585 * malformed, or the start byte was FF on a platform that the dfa doesn't
1586 * handle FF's. Call a helper function. */
ffea7477 1587
50f7a4ce 1588#ifdef HAS_EXTRA_LONG_UTF8
8ed185f9 1589
50f7a4ce 1590 tease_apart_FF:
8ed185f9 1591
50f7a4ce
KW
1592 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1593 * either malformed, or was for the largest possible start byte, which we
1594 * now check, not inline */
1595 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1596 return 0;
8ed185f9
KW
1597 }
1598
50f7a4ce
KW
1599 return is_utf8_FF_helper_(s0, e,
1600 FALSE /* require full, not partial char */
1601 );
8ed185f9
KW
1602#endif
1603
8ed185f9
KW
1604}
1605
1606/*
1607
67049a5f
KW
1608=for apidoc isSTRICT_UTF8_CHAR
1609
1610Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1611looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1612Unicode code point completely acceptable for open interchange between all
1613applications; otherwise it evaluates to 0. If non-zero, the value gives how
1614many bytes starting at C<s> comprise the code point's representation. Any
1615bytes remaining before C<e>, but beyond the ones needed to form the first code
1616point in C<s>, are not examined.
1617
1618The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1619be a surrogate nor a non-character code point. Thus this excludes any code
1620point from Perl's extended UTF-8.
1621
1622This is used to efficiently decide if the next few bytes in C<s> is
1623legal Unicode-acceptable UTF-8 for a single character.
1624
1625Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1626#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1627code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1628and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1629
1630Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1631C<L</is_strict_utf8_string_loclen>> to check entire strings.
1632
1633=cut
1634
1635This uses an adaptation of the tables and algorithm given in
f6521f7c 1636https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1637documentation of the original version. A copyright notice for the original
1638version is given at the beginning of this file. The Perl adapation is
1639documented at the definition of strict_extended_utf8_dfa_tab[].
1640
1641*/
1642
1643PERL_STATIC_INLINE Size_t
c9182d9c 1644Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f 1645{
67049a5f
KW
1646 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1647
50f7a4ce
KW
1648 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1649 DFA_RETURN_SUCCESS_,
1650 goto check_hanguls,
1651 DFA_RETURN_FAILURE_);
1652 check_hanguls:
67049a5f 1653
50f7a4ce
KW
1654 /* Here, we didn't return success, but dropped out of the loop. In the
1655 * case of PL_strict_utf8_dfa_tab, this means the input is either
1656 * malformed, or was for certain Hanguls; handle them specially */
67049a5f 1657
67260a96
KW
1658 /* The dfa above drops out for incomplete or illegal inputs, and certain
1659 * legal Hanguls; check and return accordingly */
1660 return is_HANGUL_ED_utf8_safe(s0, e);
67049a5f
KW
1661}
1662
1663/*
1664
44170c9a 1665=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1666
1667Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1668looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1669Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1670the value gives how many bytes starting at C<s> comprise the code point's
1671representation. Any bytes remaining before C<e>, but beyond the ones needed to
1672form the first code point in C<s>, are not examined.
1673
1674The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1675differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1676code points. This corresponds to
1677L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1678which said that non-character code points are merely discouraged rather than
1679completely forbidden in open interchange. See
1680L<perlunicode/Noncharacter code points>.
1681
1682Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1683C<L</isUTF8_CHAR_flags>> for a more customized definition.
1684
1685Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1686C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1687
1688=cut
1689
1690This uses an adaptation of the tables and algorithm given in
f6521f7c 1691https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1692documentation of the original version. A copyright notice for the original
1693version is given at the beginning of this file. The Perl adapation is
71525f77 1694documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1695
1696*/
1697
1698PERL_STATIC_INLINE Size_t
c9182d9c 1699Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64 1700{
c5bfbb64
KW
1701 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1702
50f7a4ce
KW
1703 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1704 DFA_RETURN_SUCCESS_,
1705 DFA_RETURN_FAILURE_,
1706 DFA_RETURN_FAILURE_);
c5bfbb64
KW
1707}
1708
1709/*
1710
9f2abfde
KW
1711=for apidoc is_strict_utf8_string_loc
1712
1713Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1714case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1715"utf8ness success") in the C<ep> pointer.
1716
1717See also C<L</is_strict_utf8_string_loclen>>.
1718
1719=cut
1720*/
1721
1722#define is_strict_utf8_string_loc(s, len, ep) \
1723 is_strict_utf8_string_loclen(s, len, ep, 0)
1724
1725/*
1726
1727=for apidoc is_strict_utf8_string_loclen
1728
1729Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1730case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1731"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1732encoded characters in the C<el> pointer.
1733
1734See also C<L</is_strict_utf8_string_loc>>.
1735
1736=cut
1737*/
1738
1739PERL_STATIC_INLINE bool
c9182d9c 1740Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1741{
33756530 1742 const U8 * first_variant;
9f2abfde
KW
1743
1744 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1745
33756530
KW
1746 if (len == 0) {
1747 len = strlen((const char *) s);
1748 }
1749
1750 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1751 if (el)
1752 *el = len;
1753
1754 if (ep) {
1755 *ep = s + len;
1756 }
1757
1758 return TRUE;
1759 }
1760
1761 {
1762 const U8* const send = s + len;
1763 const U8* x = first_variant;
1764 STRLEN outlen = first_variant - s;
1765
a0d7f935
KW
1766 while (x < send) {
1767 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1768 if (UNLIKELY(! cur_len)) {
1769 break;
1770 }
1771 x += cur_len;
1772 outlen++;
9f2abfde 1773 }
9f2abfde 1774
a0d7f935
KW
1775 if (el)
1776 *el = outlen;
9f2abfde 1777
a0d7f935
KW
1778 if (ep) {
1779 *ep = x;
1780 }
9f2abfde 1781
a0d7f935 1782 return (x == send);
33756530 1783 }
9f2abfde
KW
1784}
1785
1786/*
1787
1788=for apidoc is_c9strict_utf8_string_loc
1789
1790Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1791the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1792"utf8ness success") in the C<ep> pointer.
1793
1794See also C<L</is_c9strict_utf8_string_loclen>>.
1795
1796=cut
1797*/
1798
1799#define is_c9strict_utf8_string_loc(s, len, ep) \
1800 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1801
1802/*
1803
1804=for apidoc is_c9strict_utf8_string_loclen
1805
1806Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1807the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1808"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1809characters in the C<el> pointer.
1810
1811See also C<L</is_c9strict_utf8_string_loc>>.
1812
1813=cut
1814*/
1815
1816PERL_STATIC_INLINE bool
c9182d9c 1817Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1818{
33756530 1819 const U8 * first_variant;
9f2abfde
KW
1820
1821 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1822
33756530
KW
1823 if (len == 0) {
1824 len = strlen((const char *) s);
1825 }
1826
1827 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1828 if (el)
1829 *el = len;
1830
1831 if (ep) {
1832 *ep = s + len;
1833 }
1834
1835 return TRUE;
1836 }
1837
1838 {
1839 const U8* const send = s + len;
1840 const U8* x = first_variant;
1841 STRLEN outlen = first_variant - s;
1842
a0d7f935
KW
1843 while (x < send) {
1844 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1845 if (UNLIKELY(! cur_len)) {
1846 break;
1847 }
1848 x += cur_len;
1849 outlen++;
9f2abfde 1850 }
9f2abfde 1851
a0d7f935
KW
1852 if (el)
1853 *el = outlen;
9f2abfde 1854
a0d7f935
KW
1855 if (ep) {
1856 *ep = x;
1857 }
9f2abfde 1858
a0d7f935 1859 return (x == send);
33756530 1860 }
9f2abfde
KW
1861}
1862
1863/*
1864
1865=for apidoc is_utf8_string_loc_flags
1866
1867Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1868case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1869"utf8ness success") in the C<ep> pointer.
1870
1871See also C<L</is_utf8_string_loclen_flags>>.
1872
1873=cut
1874*/
1875
1876#define is_utf8_string_loc_flags(s, len, ep, flags) \
1877 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1878
1879
1880/* The above 3 actual functions could have been moved into the more general one
1881 * just below, and made #defines that call it with the right 'flags'. They are
1882 * currently kept separate to increase their chances of getting inlined */
1883
1884/*
1885
1886=for apidoc is_utf8_string_loclen_flags
1887
1888Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1889case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1890"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1891encoded characters in the C<el> pointer.
1892
1893See also C<L</is_utf8_string_loc_flags>>.
1894
1895=cut
1896*/
1897
1898PERL_STATIC_INLINE bool
c9182d9c 1899Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1900{
33756530 1901 const U8 * first_variant;
9f2abfde
KW
1902
1903 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1904 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1905 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1906
f60f61fd 1907 if (len == 0) {
a0d7f935 1908 len = strlen((const char *) s);
f60f61fd
KW
1909 }
1910
9f2abfde
KW
1911 if (flags == 0) {
1912 return is_utf8_string_loclen(s, len, ep, el);
1913 }
1914
d044b7a7 1915 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1916 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1917 {
1918 return is_strict_utf8_string_loclen(s, len, ep, el);
1919 }
1920
d044b7a7 1921 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1922 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1923 {
1924 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1925 }
1926
33756530
KW
1927 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1928 if (el)
1929 *el = len;
1930
1931 if (ep) {
1932 *ep = s + len;
1933 }
1934
1935 return TRUE;
1936 }
1937
1938 {
1939 const U8* send = s + len;
1940 const U8* x = first_variant;
1941 STRLEN outlen = first_variant - s;
1942
a0d7f935
KW
1943 while (x < send) {
1944 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1945 if (UNLIKELY(! cur_len)) {
1946 break;
1947 }
1948 x += cur_len;
1949 outlen++;
9f2abfde 1950 }
9f2abfde 1951
a0d7f935
KW
1952 if (el)
1953 *el = outlen;
9f2abfde 1954
a0d7f935
KW
1955 if (ep) {
1956 *ep = x;
1957 }
9f2abfde 1958
a0d7f935 1959 return (x == send);
33756530 1960 }
9f2abfde
KW
1961}
1962
1963/*
7c93d8f0
KW
1964=for apidoc utf8_distance
1965
1966Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1967and C<b>.
1968
1969WARNING: use only if you *know* that the pointers point inside the
1970same UTF-8 buffer.
1971
1972=cut
1973*/
1974
1975PERL_STATIC_INLINE IV
1976Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1977{
1978 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1979
1980 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1981}
1982
1983/*
1984=for apidoc utf8_hop
1985
1986Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1987forward or backward.
1988
1989WARNING: do not use the following unless you *know* C<off> is within
1990the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1991on the first byte of character or just after the last byte of a character.
1992
1993=cut
1994*/
1995
1996PERL_STATIC_INLINE U8 *
1997Perl_utf8_hop(const U8 *s, SSize_t off)
1998{
1999 PERL_ARGS_ASSERT_UTF8_HOP;
2000
2001 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2002 * the bitops (especially ~) can create illegal UTF-8.
2003 * In other words: in Perl UTF-8 is not just for Unicode. */
2004
2005 if (off >= 0) {
1604cfb0
MS
2006 while (off--)
2007 s += UTF8SKIP(s);
7c93d8f0
KW
2008 }
2009 else {
1604cfb0
MS
2010 while (off++) {
2011 s--;
2012 while (UTF8_IS_CONTINUATION(*s))
2013 s--;
2014 }
7c93d8f0 2015 }
e099ea69 2016 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 2017 return (U8 *)s;
e099ea69 2018 GCC_DIAG_RESTORE
7c93d8f0
KW
2019}
2020
4dab108f 2021/*
65df57a8
TC
2022=for apidoc utf8_hop_forward
2023
2024Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2025forward.
2026
2027C<off> must be non-negative.
2028
2029C<s> must be before or equal to C<end>.
2030
2031When moving forward it will not move beyond C<end>.
2032
2033Will not exceed this limit even if the string is not valid "UTF-8".
2034
2035=cut
2036*/
2037
2038PERL_STATIC_INLINE U8 *
2039Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2040{
2041 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2042
2043 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2044 * the bitops (especially ~) can create illegal UTF-8.
2045 * In other words: in Perl UTF-8 is not just for Unicode. */
2046
2047 assert(s <= end);
2048 assert(off >= 0);
2049
2050 while (off--) {
2051 STRLEN skip = UTF8SKIP(s);
de979548 2052 if ((STRLEN)(end - s) <= skip) {
e099ea69 2053 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2054 return (U8 *)end;
e099ea69 2055 GCC_DIAG_RESTORE
de979548 2056 }
65df57a8
TC
2057 s += skip;
2058 }
2059
e099ea69 2060 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2061 return (U8 *)s;
e099ea69 2062 GCC_DIAG_RESTORE
65df57a8
TC
2063}
2064
2065/*
2066=for apidoc utf8_hop_back
2067
2068Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2069backward.
2070
2071C<off> must be non-positive.
2072
2073C<s> must be after or equal to C<start>.
2074
2075When moving backward it will not move before C<start>.
2076
2077Will not exceed this limit even if the string is not valid "UTF-8".
2078
2079=cut
2080*/
2081
2082PERL_STATIC_INLINE U8 *
2083Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2084{
2085 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2086
9f4248c9 2087 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
65df57a8
TC
2088 * the bitops (especially ~) can create illegal UTF-8.
2089 * In other words: in Perl UTF-8 is not just for Unicode. */
2090
2091 assert(start <= s);
2092 assert(off <= 0);
2093
9f4248c9
KW
2094 /* Note: if we know that the input is well-formed, we can do per-word
2095 * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2096 * that. But it was reverted because doing per-word has some
2097 * start-up/tear-down overhead, so only makes sense if the distance to be
2098 * moved is large, and core perl doesn't currently move more than a few
2099 * characters at a time. You can reinstate it if it does become
2100 * advantageous. */
2101 while (off++ && s > start) {
e7185695 2102 do {
65df57a8 2103 s--;
e7185695 2104 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 2105 }
f6521f7c 2106
e099ea69 2107 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 2108 return (U8 *)s;
e099ea69 2109 GCC_DIAG_RESTORE
65df57a8
TC
2110}
2111
2112/*
2113=for apidoc utf8_hop_safe
2114
2115Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2116either forward or backward.
2117
2118When moving backward it will not move before C<start>.
2119
2120When moving forward it will not move beyond C<end>.
2121
2122Will not exceed those limits even if the string is not valid "UTF-8".
2123
2124=cut
2125*/
2126
2127PERL_STATIC_INLINE U8 *
2128Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2129{
2130 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2131
2132 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2133 * the bitops (especially ~) can create illegal UTF-8.
2134 * In other words: in Perl UTF-8 is not just for Unicode. */
2135
2136 assert(start <= s && s <= end);
2137
2138 if (off >= 0) {
2139 return utf8_hop_forward(s, off, end);
2140 }
2141 else {
2142 return utf8_hop_back(s, off, start);
2143 }
2144}
2145
2146/*
4dab108f 2147
247cc51e 2148=for apidoc isUTF8_CHAR_flags
22f363ff
KW
2149
2150Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2151looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2152that represents some code point, subject to the restrictions given by C<flags>;
2153otherwise it evaluates to 0. If non-zero, the value gives how many bytes
2154starting at C<s> comprise the code point's representation. Any bytes remaining
2155before C<e>, but beyond the ones needed to form the first code point in C<s>,
2156are not examined.
2157
2158If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2159if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2160as C<L</isSTRICT_UTF8_CHAR>>;
2161and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2162the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2163Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2164understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2165
2166The three alternative macros are for the most commonly needed validations; they
2167are likely to run somewhat faster than this more general one, as they can be
2168inlined into your code.
2169
2170Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2171L</is_utf8_string_loclen_flags> to check entire strings.
2172
2173=cut
2174*/
2175
2176PERL_STATIC_INLINE STRLEN
2177Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2178{
2179 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2180 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2181 |UTF8_DISALLOW_PERL_EXTENDED)));
2182
2183 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2184 goto check_success,
2185 DFA_TEASE_APART_FF_,
2186 DFA_RETURN_FAILURE_);
2187
2188 check_success:
2189
1aa501c2 2190 return is_utf8_char_helper_(s0, e, flags);
22f363ff
KW
2191
2192#ifdef HAS_EXTRA_LONG_UTF8
2193
2194 tease_apart_FF:
2195
2196 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2197 * either malformed, or was for the largest possible start byte, which
2198 * indicates perl extended UTF-8, well above the Unicode maximum */
2199 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2200 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2201 {
2202 return 0;
2203 }
2204
2205 /* Otherwise examine the sequence not inline */
2206 return is_utf8_FF_helper_(s0, e,
2207 FALSE /* require full, not partial char */
2208 );
2209#endif
2210
2211}
2212
2213/*
2214
4dab108f
KW
2215=for apidoc is_utf8_valid_partial_char
2216
6cbb9248
KW
2217Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2218S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2219points. Otherwise, it returns 1 if there exists at least one non-empty
2220sequence of bytes that when appended to sequence C<s>, starting at position
2221C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2222otherwise returns 0.
2223
2224In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2225point.
2226
2227This is useful when a fixed-length buffer is being tested for being well-formed
2228UTF-8, but the final few bytes in it don't comprise a full character; that is,
2229it is split somewhere in the middle of the final code point's UTF-8
2230representation. (Presumably when the buffer is refreshed with the next chunk
2231of data, the new first bytes will complete the partial code point.) This
2232function is used to verify that the final bytes in the current buffer are in
2233fact the legal beginning of some code point, so that if they aren't, the
2234failure can be signalled without having to wait for the next read.
4dab108f
KW
2235
2236=cut
2237*/
2717076a
KW
2238#define is_utf8_valid_partial_char(s, e) \
2239 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
2240
2241/*
2242
2243=for apidoc is_utf8_valid_partial_char_flags
2244
2245Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2246or not the input is a valid UTF-8 encoded partial character, but it takes an
2247extra parameter, C<flags>, which can further restrict which code points are
2248considered valid.
2249
2250If C<flags> is 0, this behaves identically to
2251C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
2252of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
2253there is any sequence of bytes that can complete the input partial character in
2254such a way that a non-prohibited character is formed, the function returns
2717076a
KW
2255TRUE; otherwise FALSE. Non character code points cannot be determined based on
2256partial character input. But many of the other possible excluded types can be
f1c999a7
KW
2257determined from just the first one or two bytes.
2258
2259=cut
2260 */
2261
56e4cf64 2262PERL_STATIC_INLINE bool
22afef87 2263Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
4dab108f 2264{
f1c999a7 2265 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
f1c999a7 2266 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 2267 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 2268
22afef87
KW
2269 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2270 DFA_RETURN_FAILURE_,
2271 DFA_TEASE_APART_FF_,
2272 NOOP);
2273
2274 /* The NOOP above causes the DFA to drop down here iff the input was a
2275 * partial character. flags=0 => can return TRUE immediately; otherwise we
2276 * need to check (not inline) if the partial character is the beginning of
2277 * a disallowed one */
2278 if (flags == 0) {
2279 return TRUE;
2280 }
2281
1aa501c2 2282 return cBOOL(is_utf8_char_helper_(s0, e, flags));
22afef87
KW
2283
2284#ifdef HAS_EXTRA_LONG_UTF8
2285
2286 tease_apart_FF:
2287
2288 /* Getting here means the input is either malformed, or, in the case of
2289 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
2290 * latter case has to be extended UTF-8, so can fail immediately if that is
2291 * forbidden */
2292
2293 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2294 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2295 {
2296 return 0;
4dab108f
KW
2297 }
2298
22afef87
KW
2299 return is_utf8_FF_helper_(s0, e,
2300 TRUE /* Require to be a partial character */
2301 );
2302#endif
2303
4dab108f
KW
2304}
2305
8bc127bf
KW
2306/*
2307
2308=for apidoc is_utf8_fixed_width_buf_flags
2309
2310Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2311is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2312otherwise it returns FALSE.
2313
2314If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2315without restriction. If the final few bytes of the buffer do not form a
2316complete code point, this will return TRUE anyway, provided that
2317C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2318
2319If C<flags> in non-zero, it can be any combination of the
2320C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2321same meanings.
2322
2323This function differs from C<L</is_utf8_string_flags>> only in that the latter
2324returns FALSE if the final few bytes of the string don't form a complete code
2325point.
2326
2327=cut
2328 */
2329#define is_utf8_fixed_width_buf_flags(s, len, flags) \
2330 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2331
2332/*
2333
2334=for apidoc is_utf8_fixed_width_buf_loc_flags
2335
2336Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2337failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
2338to the beginning of any partial character at the end of the buffer; if there is
2339no partial character C<*ep> will contain C<s>+C<len>.
2340
2341See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2342
2343=cut
2344*/
2345
2346#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
2347 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2348
2349/*
2350
2351=for apidoc is_utf8_fixed_width_buf_loclen_flags
2352
2353Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2354complete, valid characters found in the C<el> pointer.
2355
2356=cut
2357*/
2358
2359PERL_STATIC_INLINE bool
c9182d9c 2360Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 2361 STRLEN len,
8bc127bf
KW
2362 const U8 **ep,
2363 STRLEN *el,
2364 const U32 flags)
2365{
2366 const U8 * maybe_partial;
2367
2368 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2369
2370 if (! ep) {
2371 ep = &maybe_partial;
2372 }
2373
2374 /* If it's entirely valid, return that; otherwise see if the only error is
2375 * that the final few bytes are for a partial character */
2376 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
2377 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2378}
2379
e6a4ffc3 2380PERL_STATIC_INLINE UV
c9182d9c 2381Perl_utf8n_to_uvchr_msgs(const U8 *s,
59c18386
KW
2382 STRLEN curlen,
2383 STRLEN *retlen,
2384 const U32 flags,
2385 U32 * errors,
2386 AV ** msgs)
e6a4ffc3
KW
2387{
2388 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
2389 * simple cases, and, if necessary calls a helper function to deal with the
2390 * more complex ones. Almost all well-formed non-problematic code points
2391 * are considered simple, so that it's unlikely that the helper function
2392 * will need to be called.
2393 *
2394 * This is an adaptation of the tables and algorithm given in
f6521f7c 2395 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
2396 * comprehensive documentation of the original version. A copyright notice
2397 * for the original version is given at the beginning of this file. The
71525f77 2398 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
2399 */
2400
2401 const U8 * const s0 = s;
2402 const U8 * send = s0 + curlen;
a4609251
KW
2403 UV type;
2404 UV uv;
e6a4ffc3
KW
2405
2406 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2407
2408 /* This dfa is fast. If it accepts the input, it was for a well-formed,
2409 * non-problematic code point, which can be returned immediately.
2410 * Otherwise we call a helper function to figure out the more complicated
2411 * cases. */
2412
a4609251 2413 /* No calls from core pass in an empty string; non-core need a check */
d1e771d8
KW
2414#ifdef PERL_CORE
2415 assert(curlen > 0);
2416#else
2417 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2418 flags, errors, msgs);
2419#endif
e6a4ffc3 2420
a4609251 2421 type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3 2422
a4609251
KW
2423 /* The table is structured so that 'type' is 0 iff the input byte is
2424 * represented identically regardless of the UTF-8ness of the string */
2425 if (type == 0) { /* UTF-8 invariants are returned unchanged */
2426 uv = *s;
2427 }
2428 else {
2429 UV state = PL_strict_utf8_dfa_tab[256 + type];
2430 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
e6a4ffc3 2431
a4609251
KW
2432 while (++s < send) {
2433 type = PL_strict_utf8_dfa_tab[*s];
2434 state = PL_strict_utf8_dfa_tab[256 + state + type];
2435
2436 uv = UTF8_ACCUMULATE(uv, *s);
2437
2438 if (state == 0) {
b9f49957
KW
2439#ifdef EBCDIC
2440 uv = UNI_TO_NATIVE(uv);
2441#endif
a4609251
KW
2442 goto success;
2443 }
2444
2445 if (UNLIKELY(state == 1)) {
2446 break;
2447 }
e6a4ffc3
KW
2448 }
2449
a4609251
KW
2450 /* Here is potentially problematic. Use the full mechanism */
2451 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2452 errors, msgs);
2453 }
2454
2455 success:
2456 if (retlen) {
2457 *retlen = s - s0 + 1;
2458 }
2459 if (errors) {
2460 *errors = 0;
2461 }
2462 if (msgs) {
2463 *msgs = NULL;
e6a4ffc3
KW
2464 }
2465
b9f49957 2466 return uv;
e6a4ffc3
KW
2467}
2468
82651abe 2469PERL_STATIC_INLINE UV
9a9a6c98 2470Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 2471{
9a9a6c98 2472 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
2473
2474 assert(s < send);
2475
2476 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
2477
2478 /* EMPTY is not really allowed, and asserts on debugging builds. But
2479 * on non-debugging we have to deal with it, and this causes it to
2480 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 2481 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 2482 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
2483 }
2484 else {
2485 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
286a1bfd 2486 if (retlen && ret == 0 && (send <= s || *s != '\0')) {
82651abe
KW
2487 *retlen = (STRLEN) -1;
2488 }
2489
2490 return ret;
2491 }
2492}
2493
c8028aa6
TC
2494/* ------------------------------- perl.h ----------------------------- */
2495
2496/*
3f620621 2497=for apidoc_section $utility
dcccc8ff 2498
44170c9a 2499=for apidoc is_safe_syscall
c8028aa6 2500
1a0efc9a
KW
2501Test that the given C<pv> (with length C<len>) doesn't contain any internal
2502C<NUL> characters.
2503If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2504category, and return FALSE.
c8028aa6
TC
2505
2506Return TRUE if the name is safe.
2507
1a0efc9a
KW
2508C<what> and C<op_name> are used in any warning.
2509
796b6530 2510Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
2511
2512=cut
2513*/
2514
2515PERL_STATIC_INLINE bool
ffd62fc2
KW
2516Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2517{
c8028aa6
TC
2518 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2519 * perl itself uses xce*() functions which accept 8-bit strings.
2520 */
2521
2522 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2523
6c4650b3 2524 if (len > 1) {
c8028aa6 2525 char *null_at;
41188aa0 2526 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 2527 SETERRNO(ENOENT, LIB_INVARG);
1d505182 2528 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 2529 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 2530 what, op_name, pv, null_at+1);
c8028aa6
TC
2531 return FALSE;
2532 }
2533 }
2534
2535 return TRUE;
2536}
2537
2538/*
7cb3f959
TC
2539
2540Return true if the supplied filename has a newline character
fa6c7d00 2541immediately before the first (hopefully only) NUL.
7cb3f959
TC
2542
2543My original look at this incorrectly used the len from SvPV(), but
2544that's incorrect, since we allow for a NUL in pv[len-1].
2545
2546So instead, strlen() and work from there.
2547
2548This allow for the user reading a filename, forgetting to chomp it,
2549then calling:
2550
2551 open my $foo, "$file\0";
2552
2553*/
2554
2555#ifdef PERL_CORE
2556
2557PERL_STATIC_INLINE bool
ffd62fc2
KW
2558S_should_warn_nl(const char *pv)
2559{
7cb3f959
TC
2560 STRLEN len;
2561
2562 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2563
2564 len = strlen(pv);
2565
2566 return len > 0 && pv[len-1] == '\n';
2567}
2568
2569#endif
2570
3a019afd
KW
2571#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2572
2573PERL_STATIC_INLINE bool
2574S_lossless_NV_to_IV(const NV nv, IV *ivp)
2575{
2576 /* This function determines if the input NV 'nv' may be converted without
2577 * loss of data to an IV. If not, it returns FALSE taking no other action.
2578 * But if it is possible, it does the conversion, returning TRUE, and
2579 * storing the converted result in '*ivp' */
2580
2581 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2582
cd304e76
DM
2583# if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2584 /* Normally any comparison with a NaN returns false; if we can't rely
2585 * on that behaviour, check explicitly */
3a019afd
KW
2586 if (UNLIKELY(Perl_isnan(nv))) {
2587 return FALSE;
2588 }
3a019afd
KW
2589# endif
2590
cd304e76
DM
2591 /* Written this way so that with an always-false NaN comparison we
2592 * return false */
ef0a8475 2593 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
3a019afd
KW
2594 return FALSE;
2595 }
2596
2597 if ((IV) nv != nv) {
2598 return FALSE;
2599 }
2600
2601 *ivp = (IV) nv;
2602 return TRUE;
2603}
2604
2605#endif
2606
81d52ecd
JH
2607/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2608
94b0cb42
KW
2609#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2610
81d52ecd
JH
2611#define MAX_CHARSET_NAME_LENGTH 2
2612
2613PERL_STATIC_INLINE const char *
94b0cb42 2614S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2615{
94b0cb42
KW
2616 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2617
81d52ecd
JH
2618 /* Returns a string that corresponds to the name of the regex character set
2619 * given by 'flags', and *lenp is set the length of that string, which
2620 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2621
2622 *lenp = 1;
2623 switch (get_regex_charset(flags)) {
2624 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2625 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2626 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
1604cfb0
MS
2627 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2628 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2629 *lenp = 2;
2630 return ASCII_MORE_RESTRICT_PAT_MODS;
81d52ecd
JH
2631 }
2632 /* The NOT_REACHED; hides an assert() which has a rather complex
2633 * definition in perl.h. */
2634 NOT_REACHED; /* NOTREACHED */
2635 return "?"; /* Unknown */
2636}
2637
94b0cb42
KW
2638#endif
2639
7cb3f959 2640/*
ed382232
TC
2641
2642Return false if any get magic is on the SV other than taint magic.
2643
2644*/
2645
2646PERL_STATIC_INLINE bool
ffd62fc2
KW
2647Perl_sv_only_taint_gmagic(SV *sv)
2648{
ed382232
TC
2649 MAGIC *mg = SvMAGIC(sv);
2650
2651 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2652
2653 while (mg) {
2654 if (mg->mg_type != PERL_MAGIC_taint
2655 && !(mg->mg_flags & MGf_GSKIP)
2656 && mg->mg_virtual->svt_get) {
2657 return FALSE;
2658 }
2659 mg = mg->mg_moremagic;
2660 }
2661
2662 return TRUE;
2663}
2664
ed8ff0f3
DM
2665/* ------------------ cop.h ------------------------------------------- */
2666
5b6f7443
DM
2667/* implement GIMME_V() macro */
2668
2669PERL_STATIC_INLINE U8
2670Perl_gimme_V(pTHX)
2671{
2672 I32 cxix;
2673 U8 gimme = (PL_op->op_flags & OPf_WANT);
2674
2675 if (gimme)
2676 return gimme;
2677 cxix = PL_curstackinfo->si_cxsubix;
2678 if (cxix < 0)
390fe0c0 2679 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
5b6f7443
DM
2680 assert(cxstack[cxix].blk_gimme & G_WANT);
2681 return (cxstack[cxix].blk_gimme & G_WANT);
2682}
2683
ed8ff0f3
DM
2684
2685/* Enter a block. Push a new base context and return its address. */
2686
2687PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2688Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2689{
2690 PERL_CONTEXT * cx;
2691
2692 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2693
2694 CXINC;
2695 cx = CX_CUR();
2696 cx->cx_type = type;
2697 cx->blk_gimme = gimme;
2698 cx->blk_oldsaveix = saveix;
4caf7d8c 2699 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2700 cx->blk_oldcop = PL_curcop;
4caf7d8c 2701 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2702 cx->blk_oldscopesp = PL_scopestack_ix;
2703 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2704 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2705
2706 PL_tmps_floor = PL_tmps_ix;
2707 CX_DEBUG(cx, "PUSH");
2708 return cx;
2709}
2710
2711
2712/* Exit a block (RETURN and LAST). */
2713
2714PERL_STATIC_INLINE void
c9182d9c 2715Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2716{
2717 PERL_ARGS_ASSERT_CX_POPBLOCK;
2718
2719 CX_DEBUG(cx, "POP");
2720 /* these 3 are common to cx_popblock and cx_topblock */
2721 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2722 PL_scopestack_ix = cx->blk_oldscopesp;
2723 PL_curpm = cx->blk_oldpm;
2724
2725 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2726 * and leaves a CX entry lying around for repeated use, so
2727 * skip for multicall */ \
2728 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2729 || PL_savestack_ix == cx->blk_oldsaveix);
2730 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2731 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2732}
2733
2734/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2735 * Whereas cx_popblock() restores the state to the point just before
2736 * cx_pushblock() was called, cx_topblock() restores it to the point just
2737 * *after* cx_pushblock() was called. */
2738
2739PERL_STATIC_INLINE void
c9182d9c 2740Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2741{
2742 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2743
2744 CX_DEBUG(cx, "TOP");
2745 /* these 3 are common to cx_popblock and cx_topblock */
2746 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2747 PL_scopestack_ix = cx->blk_oldscopesp;
2748 PL_curpm = cx->blk_oldpm;
2749
2750 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2751}
2752
2753
a73d8813 2754PERL_STATIC_INLINE void
c9182d9c 2755Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2756{
2757 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2758
2759 PERL_ARGS_ASSERT_CX_PUSHSUB;
2760
3f6bd23a 2761 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2762 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2763 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2764 cx->blk_sub.cv = cv;
2765 cx->blk_sub.olddepth = CvDEPTH(cv);
2766 cx->blk_sub.prevcomppad = PL_comppad;
2767 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2768 cx->blk_sub.retop = retop;
2769 SvREFCNT_inc_simple_void_NN(cv);
2770 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2771}
2772
2773
2774/* subsets of cx_popsub() */
2775
2776PERL_STATIC_INLINE void
c9182d9c 2777Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2778{
2779 CV *cv;
2780
2781 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2782 assert(CxTYPE(cx) == CXt_SUB);
2783
2784 PL_comppad = cx->blk_sub.prevcomppad;
2785 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2786 cv = cx->blk_sub.cv;
2787 CvDEPTH(cv) = cx->blk_sub.olddepth;
2788 cx->blk_sub.cv = NULL;
2789 SvREFCNT_dec(cv);
5b6f7443 2790 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2791}
2792
2793
2794/* handle the @_ part of leaving a sub */
2795
2796PERL_STATIC_INLINE void
c9182d9c 2797Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2798{
2799 AV *av;
2800
2801 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2802 assert(CxTYPE(cx) == CXt_SUB);
2803 assert(AvARRAY(MUTABLE_AV(
2804 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2805 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2806
2807 CX_POP_SAVEARRAY(cx);
2808 av = MUTABLE_AV(PAD_SVl(0));
2809 if (UNLIKELY(AvREAL(av)))
2810 /* abandon @_ if it got reified */
2811 clear_defarray(av, 0);
2812 else {
2813 CLEAR_ARGARRAY(av);
2814 }
2815}
2816
2817
2818PERL_STATIC_INLINE void
c9182d9c 2819Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2820{
2821 PERL_ARGS_ASSERT_CX_POPSUB;
2822 assert(CxTYPE(cx) == CXt_SUB);
2823
3f6bd23a 2824 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2825
2826 if (CxHASARGS(cx))
2827 cx_popsub_args(cx);
2828 cx_popsub_common(cx);
2829}
2830
2831
6a7d52cc 2832PERL_STATIC_INLINE void
c9182d9c 2833Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2834{
2835 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2836
5b6f7443
DM
2837 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2838 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2839 cx->blk_format.cv = cv;
2840 cx->blk_format.retop = retop;
2841 cx->blk_format.gv = gv;
2842 cx->blk_format.dfoutgv = PL_defoutgv;
2843 cx->blk_format.prevcomppad = PL_comppad;
2844 cx->blk_u16 = 0;
2845
2846 SvREFCNT_inc_simple_void_NN(cv);
2847 CvDEPTH(cv)++;
2848 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2849}
2850
2851
2852PERL_STATIC_INLINE void
c9182d9c 2853Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2854{
2855 CV *cv;
2856 GV *dfout;
2857
2858 PERL_ARGS_ASSERT_CX_POPFORMAT;
2859 assert(CxTYPE(cx) == CXt_FORMAT);
2860
2861 dfout = cx->blk_format.dfoutgv;
2862 setdefout(dfout);
2863 cx->blk_format.dfoutgv = NULL;
2864 SvREFCNT_dec_NN(dfout);
2865
2866 PL_comppad = cx->blk_format.prevcomppad;
2867 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2868 cv = cx->blk_format.cv;
2869 cx->blk_format.cv = NULL;
2870 --CvDEPTH(cv);
2871 SvREFCNT_dec_NN(cv);
5b6f7443 2872 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2873}
2874
2875
13febba5 2876PERL_STATIC_INLINE void
6b729d24 2877Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5 2878{
13febba5
DM
2879 cx->blk_eval.retop = retop;
2880 cx->blk_eval.old_namesv = namesv;
2881 cx->blk_eval.old_eval_root = PL_eval_root;
2882 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2883 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2884 cx->blk_eval.cur_top_env = PL_top_env;
2885
4c57ced5 2886 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2887 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2888 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2889}
2890
6b729d24
TC
2891PERL_STATIC_INLINE void
2892Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2893{
2894 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2895
2896 Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2897
2898 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2899 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2900}
2901
2902PERL_STATIC_INLINE void
2903Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2904{
2905 PERL_ARGS_ASSERT_CX_PUSHTRY;
2906
2907 Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2908
2909 /* Don't actually change it, just store the current value so it's restored
2910 * by the common popeval */
2911 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2912}
2913
13febba5
DM
2914
2915PERL_STATIC_INLINE void
c9182d9c 2916Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2917{
2918 SV *sv;
2919
2920 PERL_ARGS_ASSERT_CX_POPEVAL;
2921 assert(CxTYPE(cx) == CXt_EVAL);
2922
2923 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2924 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2925 PL_eval_root = cx->blk_eval.old_eval_root;
2926 sv = cx->blk_eval.cur_text;
4c57ced5 2927 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2928 cx->blk_eval.cur_text = NULL;
2929 SvREFCNT_dec_NN(sv);
2930 }
2931
2932 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2933 if (sv) {
2934 cx->blk_eval.old_namesv = NULL;
2935 SvREFCNT_dec_NN(sv);
2936 }
5b6f7443 2937 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2938}
6a7d52cc 2939
a73d8813 2940
d1b6bf72
DM
2941/* push a plain loop, i.e.
2942 * { block }
2943 * while (cond) { block }
2944 * for (init;cond;continue) { block }
2945 * This loop can be last/redo'ed etc.
2946 */
2947
2948PERL_STATIC_INLINE void
c9182d9c 2949Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2950{
2951 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2952 cx->blk_loop.my_op = cLOOP;
2953}
2954
2955
2956/* push a true for loop, i.e.
2957 * for var (list) { block }
2958 */
2959
2960PERL_STATIC_INLINE void
c9182d9c 2961Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2962{
2963 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2964
2965 /* this one line is common with cx_pushloop_plain */
2966 cx->blk_loop.my_op = cLOOP;
2967
2968 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2969 cx->blk_loop.itersave = itersave;
2970#ifdef USE_ITHREADS
2971 cx->blk_loop.oldcomppad = PL_comppad;
2972#endif
2973}
2974
2975
2976/* pop all loop types, including plain */
2977
2978PERL_STATIC_INLINE void
c9182d9c 2979Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2980{
2981 PERL_ARGS_ASSERT_CX_POPLOOP;
2982
2983 assert(CxTYPE_is_LOOP(cx));
2984 if ( CxTYPE(cx) == CXt_LOOP_ARY
2985 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2986 {
2987 /* Free ary or cur. This assumes that state_u.ary.ary
2988 * aligns with state_u.lazysv.cur. See cx_dup() */
2989 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2990 cx->blk_loop.state_u.lazysv.cur = NULL;
2991 SvREFCNT_dec_NN(sv);
2992 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2993 sv = cx->blk_loop.state_u.lazysv.end;
2994 cx->blk_loop.state_u.lazysv.end = NULL;
2995 SvREFCNT_dec_NN(sv);
2996 }
2997 }
2998 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2999 SV *cursv;
3000 SV **svp = (cx)->blk_loop.itervar_u.svp;
3001 if ((cx->cx_type & CXp_FOR_GV))
3002 svp = &GvSV((GV*)svp);
3003 cursv = *svp;
3004 *svp = cx->blk_loop.itersave;
3005 cx->blk_loop.itersave = NULL;
3006 SvREFCNT_dec(cursv);
3007 }
3008}
3009
2a7b7c61
DM
3010
3011PERL_STATIC_INLINE void
c9182d9c 3012Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 3013{
7896dde7 3014 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 3015
7896dde7 3016 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
3017}
3018
3019
3020PERL_STATIC_INLINE void
c9182d9c 3021Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 3022{
7896dde7
Z
3023 PERL_ARGS_ASSERT_CX_POPWHEN;
3024 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
3025
3026 PERL_UNUSED_ARG(cx);
59a14f30 3027 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
3028 /* currently NOOP */
3029}
3030
3031
7896dde7 3032PERL_STATIC_INLINE void
c9182d9c 3033Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
3034{
3035 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3036
3037 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3038 cx->blk_givwhen.defsv_save = orig_defsv;
3039}
3040
3041
3042PERL_STATIC_INLINE void
c9182d9c 3043Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
3044{
3045 SV *sv;
3046
3047 PERL_ARGS_ASSERT_CX_POPGIVEN;
3048 assert(CxTYPE(cx) == CXt_GIVEN);
3049
3050 sv = GvSV(PL_defgv);
3051 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3052 cx->blk_givwhen.defsv_save = NULL;
3053 SvREFCNT_dec(sv);
3054}
3055
ec2c235b
KW
3056/* ------------------ util.h ------------------------------------------- */
3057
3058/*
3f620621 3059=for apidoc_section $string
ec2c235b
KW
3060
3061=for apidoc foldEQ
3062
3063Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3064same
3065case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
3066match themselves and their opposite case counterparts. Non-cased and non-ASCII
3067range bytes match only themselves.
3068
3069=cut
3070*/
3071
3072PERL_STATIC_INLINE I32
c3c9077b 3073Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
ec2c235b
KW
3074{
3075 const U8 *a = (const U8 *)s1;
3076 const U8 *b = (const U8 *)s2;
3077
3078 PERL_ARGS_ASSERT_FOLDEQ;
3079
3080 assert(len >= 0);
3081
3082 while (len--) {
1604cfb0
MS
3083 if (*a != *b && *a != PL_fold[*b])
3084 return 0;
3085 a++,b++;
ec2c235b
KW
3086 }
3087 return 1;
3088}
3089
0f9cb40c 3090PERL_STATIC_INLINE I32
c3c9077b 3091Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
ec2c235b 3092{
79a1fabd
KW
3093 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
3094 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3095 * does not check for this. Nor does it check that the strings each have
3096 * at least 'len' characters. */
ec2c235b
KW
3097
3098 const U8 *a = (const U8 *)s1;
3099 const U8 *b = (const U8 *)s2;
3100
3101 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3102
3103 assert(len >= 0);
3104
3105 while (len--) {
1604cfb0
MS
3106 if (*a != *b && *a != PL_fold_latin1[*b]) {
3107 return 0;
3108 }
3109 a++, b++;
ec2c235b
KW
3110 }
3111 return 1;
3112}
3113
3114/*
3f620621 3115=for apidoc_section $locale
ec2c235b
KW
3116=for apidoc foldEQ_locale
3117
3118Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3119same case-insensitively in the current locale; false otherwise.
3120
3121=cut
3122*/
3123
0f9cb40c 3124PERL_STATIC_INLINE I32
c3c9077b 3125Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
ec2c235b 3126{
ec2c235b
KW
3127 const U8 *a = (const U8 *)s1;
3128 const U8 *b = (const U8 *)s2;
3129
3130 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3131
3132 assert(len >= 0);
3133
3134 while (len--) {
1604cfb0
MS
3135 if (*a != *b && *a != PL_fold_locale[*b])
3136 return 0;
3137 a++,b++;
ec2c235b
KW
3138 }
3139 return 1;
3140}
3141
1ab100a8 3142/*
3f620621 3143=for apidoc_section $string
1ab100a8
KW
3144=for apidoc my_strnlen
3145
3146The C library C<strnlen> if available, or a Perl implementation of it.
3147
3148C<my_strnlen()> computes the length of the string, up to C<maxlen>
a3815e44 3149characters. It will never attempt to address more than C<maxlen>
1ab100a8
KW
3150characters, making it suitable for use with strings that are not
3151guaranteed to be NUL-terminated.
3152
3153=cut
3154
3155Description stolen from http://man.openbsd.org/strnlen.3,
3156implementation stolen from PostgreSQL.
3157*/
3158#ifndef HAS_STRNLEN
3159
3160PERL_STATIC_INLINE Size_t
3161Perl_my_strnlen(const char *str, Size_t maxlen)
3162{
3163 const char *end = (char *) memchr(str, '\0', maxlen);
3164
3165 PERL_ARGS_ASSERT_MY_STRNLEN;
3166
3167 if (end == NULL) return maxlen;
3168 return end - str;
3169}
3170
3171#endif
3172
6dba01e2
KW
3173#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3174
3175PERL_STATIC_INLINE void *
3176S_my_memrchr(const char * s, const char c, const STRLEN len)
3177{
3178 /* memrchr(), since many platforms lack it */
3179
3180 const char * t = s + len - 1;
3181
3182 PERL_ARGS_ASSERT_MY_MEMRCHR;
3183
3184 while (t >= s) {
3185 if (*t == c) {
3186 return (void *) t;
3187 }
3188 t--;
3189 }
3190
3191 return NULL;
3192}
3193
3194#endif
3195
24f3e849
KW
3196PERL_STATIC_INLINE char *
3197Perl_mortal_getenv(const char * str)
3198{
3199 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3200 *
03694582
KW
3201 * It's (mostly) thread-safe because it uses a mutex to prevent other
3202 * threads (that look at this mutex) from destroying the result before this
3203 * routine has a chance to copy the result to a place that won't be
3204 * destroyed before the caller gets a chance to handle it. That place is a
3205 * mortal SV. khw chose this over SAVEFREEPV because he is under the
3206 * impression that the SV will hang around longer under more circumstances
24f3e849 3207 *
03694582
KW
3208 * The reason it isn't completely thread-safe is that other code could
3209 * simply not pay attention to the mutex. All of the Perl core uses the
3210 * mutex, but it is possible for code from, say XS, to not use this mutex,
3211 * defeating the safety.
24f3e849 3212 *
03694582
KW
3213 * getenv() returns, in some implementations, a pointer to a spot in the
3214 * **environ array, which could be invalidated at any time by this or
3215 * another thread changing the environment. Other implementations copy the
3216 * **environ value to a static buffer, returning a pointer to that. That
3217 * buffer might or might not be invalidated by a getenv() call in another
3218 * thread. If it does get zapped, we need an exclusive lock. Otherwise,
3219 * many getenv() calls can safely be running simultaneously, so a
3220 * many-reader (but no simultaneous writers) lock is ok. There is a
3221 * Configure probe to see if another thread destroys the buffer, and the
3222 * mutex is defined accordingly.
3223 *
3224 * But in all cases, using the mutex prevents these problems, as long as
57681073 3225 * all code uses the same mutex.
24f3e849
KW
3226 *
3227 * A complication is that this can be called during phases where the
3228 * mortalization process isn't available. These are in interpreter
3229 * destruction or early in construction. khw believes that at these times
3230 * there shouldn't be anything else going on, so plain getenv is safe AS
3231 * LONG AS the caller acts on the return before calling it again. */
3232
3233 char * ret;
3234 dTHX;
3235
3236 PERL_ARGS_ASSERT_MORTAL_GETENV;
3237
3238 /* Can't mortalize without stacks. khw believes that no other threads
3239 * should be running, so no need to lock things, and this may be during a
3240 * phase when locking isn't even available */
3241 if (UNLIKELY(PL_scopestack_ix == 0)) {
3242 return getenv(str);
3243 }
3244
03694582
KW
3245#ifdef PERL_MEM_LOG
3246
3247 /* A major complication arises under PERL_MEM_LOG. When that is active,
3248 * every memory allocation may result in logging, depending on the value of
3249 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
3250 * saving ENV{foo}'s value (but before saving it), the logging code will
3251 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
3252 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3253 * lock a boolean mutex recursively); 3) destroying the getenv() static
3254 * buffer; or 4) destroying the temporary created by this for the copy
3255 * causes a log entry to be made which could cause a new temporary to be
3256 * created, which will need to be destroyed at some point, leading to an
3257 * infinite loop.
3258 *
3259 * The solution adopted here (after some gnashing of teeth) is to detect
3260 * the recursive calls and calls from the logger, and treat them specially.
3261 * Let's say we want to do getenv("foo"). We first find
3262 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3263 * variable, so no temporary is required. Then we do getenv(foo}, and in
3264 * the process of creating a temporary to save it, this function will be
3265 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
3266 * we detect that it is such a call and return our saved value instead of
3267 * locking and doing a new getenv(). This solves all of problems 1), 2),
3268 * and 3). Because all the getenv()s are done while the mutex is locked,
3269 * the state cannot have changed. To solve 4), we don't create a temporary
3270 * when this is called from the logging code. That code disposes of the
3271 * return value while the mutex is still locked.
3272 *
3273 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3274 * digits and 3 particular letters are significant; the rest are ignored by
3275 * the memory logging code. Thus the per-interpreter variable only needs
3276 * to be large enough to save the significant information, the size of
3277 * which is known at compile time. The first byte is extra, reserved for
3278 * flags for our use. To protect against overflowing, only the reserved
3279 * byte, as many digits as don't overflow, and the three letters are
3280 * stored.
3281 *
3282 * The reserved byte has two bits:
3283 * 0x1 if set indicates that if we get here, it is a recursive call of
3284 * getenv()
3285 * 0x2 if set indicates that the call is from the logging code.
3286 *
3287 * If the flag indicates this is a recursive call, just return the stored
3288 * value of PL_mem_log; An empty value gets turned into NULL. */
3289 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3290 if (PL_mem_log[1] == '\0') {
3291 return NULL;
3292 } else {
3293 return PL_mem_log + 1;
3294 }
3295 }
3296
3297#endif
3298
35bcf7ff 3299 GETENV_LOCK;
24f3e849 3300
03694582
KW
3301#ifdef PERL_MEM_LOG
3302
3303 /* Here we are in a critical section. As explained above, we do our own
3304 * getenv(PERL_MEM_LOG), saving the result safely. */
3305 ret = getenv("PERL_MEM_LOG");
3306 if (ret == NULL) { /* No logging active */
3307
3308 /* Return that immediately if called from the logging code */
3309 if (PL_mem_log[0] & 0x2) {
3310 GETENV_UNLOCK;
3311 return NULL;
3312 }
3313
3314 PL_mem_log[1] = '\0';
3315 }
3316 else {
3317 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
3318
3319 /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3320 * extremely long string. But we want only a few characters from it.
3321 * PL_mem_log has been made large enough to hold just the ones we need.
3322 * First the file descriptor. */
3323 if (isDIGIT(*ret)) {
3324 const char * s = ret;
3325 if (UNLIKELY(*s == '0')) {
3326
3327 /* Reduce multiple leading zeros to a single one. This is to
3328 * allow the caller to change what to do with leading zeros. */
3329 *mem_log_meat++ = '0';
3330 s++;
3331 while (*s == '0') {
3332 s++;
3333 }
3334 }
3335
3336 /* If the input overflows, copy just enough for the result to also
3337 * overflow, plus 1 to make sure */
3338 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3339 *mem_log_meat++ = *s++;
3340 }
3341 }
3342
467fdaa2 3343 /* Then each of the four significant characters */
03694582
KW
3344 if (strchr(ret, 'm')) {
3345 *mem_log_meat++ = 'm';
3346 }
3347 if (strchr(ret, 's')) {
3348 *mem_log_meat++ = 's';
3349 }
3350 if (strchr(ret, 't')) {
3351 *mem_log_meat++ = 't';
3352 }
467fdaa2
PE
3353 if (strchr(ret, 'c')) {
3354 *mem_log_meat++ = 'c';
3355 }
03694582
KW
3356 *mem_log_meat = '\0';
3357
3358 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3359 }
3360
3361 /* If we are being called from the logger, it only needs the significant
3362 * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3363 if (PL_mem_log[0] & 0x2) {
3364 assert(strEQ(str, "PERL_MEM_LOG"));
3365 GETENV_UNLOCK;
3366 return PL_mem_log + 1;
3367 }
3368
3369 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
3370 * is coming from other than the logging code, so it should be treated the
3371 * same as any other getenv(), returning the full value, not just the
3372 * significant part, and having its value saved. Set the flag that
3373 * indicates any call to this routine will be a recursion from here */
3374 PL_mem_log[0] = 0x1;
3375
3376#endif
3377
3378 /* Now get the value of the real desired variable, and save a copy */
24f3e849
KW
3379 ret = getenv(str);
3380
3381 if (ret != NULL) {
c80a8618 3382 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
24f3e849
KW
3383 }
3384
35bcf7ff
KW
3385 GETENV_UNLOCK;
3386
03694582
KW
3387#ifdef PERL_MEM_LOG
3388
3389 /* Clear the buffer */
3390 Zero(PL_mem_log, sizeof(PL_mem_log), char);
3391
3392#endif
3393
24f3e849
KW
3394 return ret;
3395}
3396
1d0d673f
PE
3397PERL_STATIC_INLINE bool
3398Perl_sv_isbool(pTHX_ const SV *sv)
3399{
57e785fd
YO
3400 /* change to the following in 5.37, logically the same but
3401 * more efficient and more future proof */
3402#if 0
3403 return (SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv));
3404#else
3405 return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) &&
3406 (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No);
3407#endif
3408
1d0d673f
PE
3409}
3410
9c913148
TC
3411#ifdef USE_ITHREADS
3412
3413PERL_STATIC_INLINE AV *
3414Perl_cop_file_avn(pTHX_ const COP *cop) {
3415
3416 PERL_ARGS_ASSERT_COP_FILE_AVN;
3417
3418 const char *file = CopFILE(cop);
3419 if (file) {
3420 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3421 if (gv) {
3422 return GvAVn(gv);
3423 }
3424 else
3425 return NULL;
3426 }
3427 else
3428 return NULL;
3429}
3430
3431#endif
3432
ed382232 3433/*
c8028aa6
TC
3434 * ex: set ts=8 sts=4 sw=4 et:
3435 */