This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta: Plan9 and 9front i386
[perl5.git] / inline.h
CommitLineData
25468daa
FC
1/* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8ed185f9 8 * This file contains tables and code adapted from
f6521f7c 9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
8ed185f9
KW
10 * copyright notice:
11
12Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14Permission is hereby granted, free of charge, to any person obtaining a copy of
15this software and associated documentation files (the "Software"), to deal in
16the Software without restriction, including without limitation the rights to
17use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18of the Software, and to permit persons to whom the Software is furnished to do
19so, subject to the following conditions:
20
21The above copyright notice and this permission notice shall be included in all
22copies or substantial portions of the Software.
23
24THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30SOFTWARE.
31
32 *
25468daa 33 * This file is a home for static inline functions that cannot go in other
e15e54ff 34 * header files, because they depend on proto.h (included after most other
25468daa
FC
35 * headers) or struct definitions.
36 *
37 * Each section names the header file that the functions "belong" to.
38 */
27669aa4 39
be3a7a5d
KW
40/* ------------------------------- av.h ------------------------------- */
41
c70927a6 42PERL_STATIC_INLINE SSize_t
c9182d9c 43Perl_av_top_index(pTHX_ AV *av)
be3a7a5d
KW
44{
45 PERL_ARGS_ASSERT_AV_TOP_INDEX;
46 assert(SvTYPE(av) == SVt_PVAV);
47
48 return AvFILL(av);
49}
50
1afe1db1
FC
51/* ------------------------------- cv.h ------------------------------- */
52
ae77754a 53PERL_STATIC_INLINE GV *
c9182d9c 54Perl_CvGV(pTHX_ CV *sv)
ae77754a 55{
74804ad1
KW
56 PERL_ARGS_ASSERT_CVGV;
57
ae77754a
FC
58 return CvNAMED(sv)
59 ? Perl_cvgv_from_hek(aTHX_ sv)
60 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
61}
62
1afe1db1 63PERL_STATIC_INLINE I32 *
74804ad1 64Perl_CvDEPTH(const CV * const sv)
1afe1db1 65{
74804ad1 66 PERL_ARGS_ASSERT_CVDEPTH;
1afe1db1 67 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
74804ad1 68
8de47657 69 return &((XPVCV*)SvANY(sv))->xcv_depth;
1afe1db1
FC
70}
71
d16269d8
PM
72/*
73 CvPROTO returns the prototype as stored, which is not necessarily what
74 the interpreter should be using. Specifically, the interpreter assumes
75 that spaces have been stripped, which has been the case if the prototype
76 was added by toke.c, but is generally not the case if it was added elsewhere.
77 Since we can't enforce the spacelessness at assignment time, this routine
78 provides a temporary copy at parse time with spaces removed.
79 I<orig> is the start of the original buffer, I<len> is the length of the
80 prototype and will be updated when this returns.
81 */
82
5b67adb8 83#ifdef PERL_CORE
d16269d8
PM
84PERL_STATIC_INLINE char *
85S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
86{
87 SV * tmpsv;
88 char * tmps;
89 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
90 tmps = SvPVX(tmpsv);
91 while ((*len)--) {
92 if (!isSPACE(*orig))
93 *tmps++ = *orig;
94 orig++;
95 }
96 *tmps = '\0';
97 *len = tmps - SvPVX(tmpsv);
98 return SvPVX(tmpsv);
99}
5b67adb8 100#endif
d16269d8 101
25fdce4a
FC
102/* ------------------------------- mg.h ------------------------------- */
103
104#if defined(PERL_CORE) || defined(PERL_EXT)
105/* assumes get-magic and stringification have already occurred */
106PERL_STATIC_INLINE STRLEN
107S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
108{
109 assert(mg->mg_type == PERL_MAGIC_regex_global);
110 assert(mg->mg_len != -1);
111 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 return (STRLEN)mg->mg_len;
113 else {
114 const STRLEN pos = (STRLEN)mg->mg_len;
115 /* Without this check, we may read past the end of the buffer: */
116 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
118 }
119}
120#endif
121
03414f05
FC
122/* ------------------------------- pad.h ------------------------------ */
123
124#if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125PERL_STATIC_INLINE bool
b9d5702c 126S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
03414f05 127{
b9d5702c
KW
128 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
129
03414f05
FC
130 /* is seq within the range _LOW to _HIGH ?
131 * This is complicated by the fact that PL_cop_seqmax
132 * may have wrapped around at some point */
133 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 return FALSE; /* not yet introduced */
135
136 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137 /* in compiling scope */
138 if (
139 (seq > COP_SEQ_RANGE_LOW(pn))
140 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
142 )
143 return TRUE;
144 }
145 else if (
146 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
147 ?
148 ( seq > COP_SEQ_RANGE_LOW(pn)
149 || seq <= COP_SEQ_RANGE_HIGH(pn))
150
151 : ( seq > COP_SEQ_RANGE_LOW(pn)
152 && seq <= COP_SEQ_RANGE_HIGH(pn))
153 )
154 return TRUE;
155 return FALSE;
156}
157#endif
158
33a4312b
FC
159/* ------------------------------- pp.h ------------------------------- */
160
161PERL_STATIC_INLINE I32
c9182d9c 162Perl_TOPMARK(pTHX)
33a4312b
FC
163{
164 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 165 "MARK top %p %" IVdf "\n",
33a4312b
FC
166 PL_markstack_ptr,
167 (IV)*PL_markstack_ptr)));
168 return *PL_markstack_ptr;
169}
170
171PERL_STATIC_INLINE I32
c9182d9c 172Perl_POPMARK(pTHX)
33a4312b
FC
173{
174 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
147e3846 175 "MARK pop %p %" IVdf "\n",
33a4312b
FC
176 (PL_markstack_ptr-1),
177 (IV)*(PL_markstack_ptr-1))));
178 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179 return *PL_markstack_ptr--;
180}
181
8d919b0a
FC
182/* ----------------------------- regexp.h ----------------------------- */
183
184PERL_STATIC_INLINE struct regexp *
c9182d9c 185Perl_ReANY(const REGEXP * const re)
8d919b0a 186{
df6b4bd5 187 XPV* const p = (XPV*)SvANY(re);
bdef45de
KW
188
189 PERL_ARGS_ASSERT_REANY;
8d919b0a 190 assert(isREGEXP(re));
bdef45de 191
df6b4bd5
DM
192 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
193 : (struct regexp *)p;
8d919b0a
FC
194}
195
27669aa4
FC
196/* ------------------------------- sv.h ------------------------------- */
197
a887b094 198PERL_STATIC_INLINE bool
9d0469db
PLE
199Perl_SvTRUE(pTHX_ SV *sv) {
200 if (!LIKELY(sv))
201 return FALSE;
7b9b9e32
PLE
202 SvGETMAGIC(sv);
203 return SvTRUE_nomg_NN(sv);
a887b094
PLE
204}
205
27669aa4 206PERL_STATIC_INLINE SV *
c9182d9c 207Perl_SvREFCNT_inc(SV *sv)
27669aa4 208{
2439e033 209 if (LIKELY(sv != NULL))
27669aa4
FC
210 SvREFCNT(sv)++;
211 return sv;
212}
213PERL_STATIC_INLINE SV *
c9182d9c 214Perl_SvREFCNT_inc_NN(SV *sv)
27669aa4 215{
3f2f854a
KW
216 PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
217
27669aa4
FC
218 SvREFCNT(sv)++;
219 return sv;
220}
221PERL_STATIC_INLINE void
c9182d9c 222Perl_SvREFCNT_inc_void(SV *sv)
27669aa4 223{
2439e033 224 if (LIKELY(sv != NULL))
27669aa4
FC
225 SvREFCNT(sv)++;
226}
75e16a44 227PERL_STATIC_INLINE void
c9182d9c 228Perl_SvREFCNT_dec(pTHX_ SV *sv)
75e16a44 229{
2439e033 230 if (LIKELY(sv != NULL)) {
75a9bf96 231 U32 rc = SvREFCNT(sv);
79e2a32a 232 if (LIKELY(rc > 1))
75a9bf96
DM
233 SvREFCNT(sv) = rc - 1;
234 else
235 Perl_sv_free2(aTHX_ sv, rc);
75e16a44
FC
236 }
237}
541377b1
FC
238
239PERL_STATIC_INLINE void
c9182d9c 240Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
4a9a56a7
DM
241{
242 U32 rc = SvREFCNT(sv);
3f2f854a
KW
243
244 PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
245
79e2a32a 246 if (LIKELY(rc > 1))
4a9a56a7
DM
247 SvREFCNT(sv) = rc - 1;
248 else
249 Perl_sv_free2(aTHX_ sv, rc);
250}
251
252PERL_STATIC_INLINE void
1bd041dc 253Perl_SvAMAGIC_on(SV *sv)
541377b1 254{
1bd041dc 255 PERL_ARGS_ASSERT_SVAMAGIC_ON;
541377b1 256 assert(SvROK(sv));
1bd041dc 257
541377b1
FC
258 if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
259}
260PERL_STATIC_INLINE void
1bd041dc 261Perl_SvAMAGIC_off(SV *sv)
541377b1 262{
1bd041dc
KW
263 PERL_ARGS_ASSERT_SVAMAGIC_OFF;
264
541377b1
FC
265 if (SvROK(sv) && SvOBJECT(SvRV(sv)))
266 HvAMAGIC_off(SvSTASH(SvRV(sv)));
267}
268
269PERL_STATIC_INLINE U32
c9182d9c 270Perl_SvPADSTALE_on(SV *sv)
541377b1 271{
c0683843 272 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
273 return SvFLAGS(sv) |= SVs_PADSTALE;
274}
275PERL_STATIC_INLINE U32
c9182d9c 276Perl_SvPADSTALE_off(SV *sv)
541377b1 277{
c0683843 278 assert(!(SvFLAGS(sv) & SVs_PADTMP));
541377b1
FC
279 return SvFLAGS(sv) &= ~SVs_PADSTALE;
280}
25fdce4a 281#if defined(PERL_CORE) || defined (PERL_EXT)
4ddea69a 282PERL_STATIC_INLINE STRLEN
6964422a 283S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
4ddea69a 284{
25fdce4a 285 PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
4ddea69a
FC
286 if (SvGAMAGIC(sv)) {
287 U8 *hopped = utf8_hop((U8 *)pv, pos);
288 if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
289 return (STRLEN)(hopped - (U8 *)pv);
290 }
291 return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
292}
293#endif
f019c49e 294
a8a2ceaa
KW
295/* ------------------------------- utf8.h ------------------------------- */
296
2fe720e2
KW
297/*
298=head1 Unicode Support
299*/
300
55d09dc8 301PERL_STATIC_INLINE void
c9182d9c 302Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
55d09dc8
KW
303{
304 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
305 * encoded string at '*dest', updating '*dest' to include it */
306
55d09dc8
KW
307 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
308
6f2d5cbc 309 if (NATIVE_BYTE_IS_INVARIANT(byte))
a09ec51a 310 *((*dest)++) = byte;
55d09dc8 311 else {
a09ec51a
KW
312 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
313 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
55d09dc8
KW
314 }
315}
316
e123187a 317/*
2fe720e2 318=for apidoc valid_utf8_to_uvchr
09232555
KW
319Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
320known that the next character in the input UTF-8 string C<s> is well-formed
321(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
322points, and non-Unicode code points are allowed.
2fe720e2
KW
323
324=cut
325
326 */
327
328PERL_STATIC_INLINE UV
329Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
330{
c41b2540 331 const UV expectlen = UTF8SKIP(s);
2fe720e2
KW
332 const U8* send = s + expectlen;
333 UV uv = *s;
334
335 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
336
337 if (retlen) {
338 *retlen = expectlen;
339 }
340
341 /* An invariant is trivially returned */
342 if (expectlen == 1) {
343 return uv;
344 }
345
346 /* Remove the leading bits that indicate the number of bytes, leaving just
347 * the bits that are part of the value */
348 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
349
350 /* Now, loop through the remaining bytes, accumulating each into the
351 * working total as we go. (I khw tried unrolling the loop for up to 4
352 * bytes, but there was no performance improvement) */
353 for (++s; s < send; s++) {
354 uv = UTF8_ACCUMULATE(uv, *s);
355 }
356
357 return UNI_TO_NATIVE(uv);
358
359}
360
1e599354
KW
361/*
362=for apidoc is_utf8_invariant_string
363
82c5d941 364Returns TRUE if the first C<len> bytes of the string C<s> are the same
1e599354 365regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
82c5d941
KW
366EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
367are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
368the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
369characters are invariant, but so also are the C1 controls.
1e599354
KW
370
371If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
372use this option, that C<s> can't have embedded C<NUL> characters and has to
373have a terminating C<NUL> byte).
374
9f2abfde
KW
375See also
376C<L</is_utf8_string>>,
377C<L</is_utf8_string_flags>>,
378C<L</is_utf8_string_loc>>,
379C<L</is_utf8_string_loc_flags>>,
380C<L</is_utf8_string_loclen>>,
381C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
382C<L</is_utf8_fixed_width_buf_flags>>,
383C<L</is_utf8_fixed_width_buf_loc_flags>>,
384C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
385C<L</is_strict_utf8_string>>,
386C<L</is_strict_utf8_string_loc>>,
387C<L</is_strict_utf8_string_loclen>>,
388C<L</is_c9strict_utf8_string>>,
389C<L</is_c9strict_utf8_string_loc>>,
390and
391C<L</is_c9strict_utf8_string_loclen>>.
1e599354
KW
392
393=cut
0cbf5865
KW
394
395*/
396
397#define is_utf8_invariant_string(s, len) \
398 is_utf8_invariant_string_loc(s, len, NULL)
399
400/*
401=for apidoc is_utf8_invariant_string_loc
402
403Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
404the first UTF-8 variant character in the C<ep> pointer; if all characters are
405UTF-8 invariant, this function does not change the contents of C<*ep>.
406
407=cut
408
1e599354
KW
409*/
410
411PERL_STATIC_INLINE bool
c9182d9c 412Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1e599354 413{
e17544a6 414 const U8* send;
1e599354
KW
415 const U8* x = s;
416
0cbf5865
KW
417 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
418
e17544a6
KW
419 if (len == 0) {
420 len = strlen((const char *)s);
421 }
422
423 send = s + len;
424
4ab2fd9b 425/* This looks like 0x010101... */
2c5c8af5 426# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
4ab2fd9b
KW
427
428/* This looks like 0x808080... */
2c5c8af5 429# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
e099ea69 430# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
2c5c8af5 431# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
e17544a6 432
099e59a4
KW
433/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
434 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
435 * optimized out completely on a 32-bit system, and its mask gets optimized out
436 * on a 64-bit system */
2c5c8af5 437# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
5eabe374
KW
438 | ( PTR2nat(x) >> 1) \
439 | ( ( (PTR2nat(x) \
440 & PERL_WORD_BOUNDARY_MASK) >> 2))))
099e59a4 441
3f515a2e
KW
442#ifndef EBCDIC
443
099e59a4
KW
444 /* Do the word-at-a-time iff there is at least one usable full word. That
445 * means that after advancing to a word boundary, there still is at least a
446 * full word left. The number of bytes needed to advance is 'wordsize -
447 * offset' unless offset is 0. */
448 if ((STRLEN) (send - x) >= PERL_WORDSIZE
449
450 /* This term is wordsize if subword; 0 if not */
451 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
452
453 /* 'offset' */
454 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
455 {
b40579ff 456
46bb68f6
KW
457 /* Process per-byte until reach word boundary. XXX This loop could be
458 * eliminated if we knew that this platform had fast unaligned reads */
b40579ff 459 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
46bb68f6
KW
460 if (! UTF8_IS_INVARIANT(*x)) {
461 if (ep) {
462 *ep = x;
463 }
e17544a6 464
46bb68f6
KW
465 return FALSE;
466 }
467 x++;
e17544a6 468 }
e17544a6 469
099e59a4
KW
470 /* Here, we know we have at least one full word to process. Process
471 * per-word as long as we have at least a full word left */
472 do {
4ab2fd9b 473 if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
e17544a6 474
46bb68f6
KW
475 /* Found a variant. Just return if caller doesn't want its
476 * exact position */
477 if (! ep) {
478 return FALSE;
479 }
e17544a6 480
2c5c8af5
KW
481# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
482 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1d2af574 483
73f0a2eb 484 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
1d2af574
KW
485 assert(*ep >= s && *ep < send);
486
487 return FALSE;
488
2c5c8af5 489# else /* If weird byte order, drop into next loop to do byte-at-a-time
1d2af574
KW
490 checks. */
491
46bb68f6 492 break;
2c5c8af5 493# endif
46bb68f6 494 }
1d2af574 495
46bb68f6 496 x += PERL_WORDSIZE;
1d2af574 497
099e59a4 498 } while (x + PERL_WORDSIZE <= send);
b40579ff 499 }
e17544a6 500
0b08cab0 501#endif /* End of ! EBCDIC */
e17544a6
KW
502
503 /* Process per-byte */
504 while (x < send) {
505 if (! UTF8_IS_INVARIANT(*x)) {
506 if (ep) {
507 *ep = x;
508 }
0cbf5865 509
e17544a6 510 return FALSE;
0cbf5865 511 }
1e599354 512
e17544a6 513 x++;
1e599354
KW
514 }
515
516 return TRUE;
517}
518
23a7ee81
KW
519#ifndef EBCDIC
520
1d2af574 521PERL_STATIC_INLINE unsigned int
73f0a2eb 522Perl_variant_byte_number(PERL_UINTMAX_T word)
1d2af574
KW
523{
524
525 /* This returns the position in a word (0..7) of the first variant byte in
526 * it. This is a helper function. Note that there are no branches */
527
528 assert(word);
529
530 /* Get just the msb bits of each byte */
531 word &= PERL_VARIANTS_WORD_MASK;
532
7adf2470 533# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1d2af574
KW
534
535 /* Bytes are stored like
536 * Byte8 ... Byte2 Byte1
537 * 63..56...15...8 7...0
538 *
539 * Isolate the lsb;
540 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
541 *
a3815e44 542 * The word will look like this, with a rightmost set bit in position 's':
1d2af574
KW
543 * ('x's are don't cares)
544 * s
545 * x..x100..0
546 * x..xx10..0 Right shift (rightmost 0 is shifted off)
547 * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
548 * the 1 just to their left into a 0; the remainder is
549 * untouched
4fa92663
KW
550 * 0..0011..1 The xor with the original, x..xx10..0, clears that
551 * remainder, sets the bottom to all 1
1d2af574
KW
552 * 0..0100..0 Add 1 to clear the word except for the bit in 's'
553 *
554 * Another method is to do 'word &= -word'; but it generates a compiler
555 * message on some platforms about taking the negative of an unsigned */
556
557 word >>= 1;
558 word = 1 + (word ^ (word - 1));
559
560# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
561
562 /* Bytes are stored like
563 * Byte1 Byte2 ... Byte8
564 * 63..56 55..47 ... 7...0
565 *
566 * Isolate the msb; http://codeforces.com/blog/entry/10330
567 *
568 * Only the most significant set bit matters. Or'ing word with its right
569 * shift of 1 makes that bit and the next one to its right both 1. Then
570 * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
571 * msb and all to the right being 1. */
572 word |= word >> 1;
573 word |= word >> 2;
574 word |= word >> 4;
575 word |= word >> 8;
576 word |= word >> 16;
577 word |= word >> 32; /* This should get optimized out on 32-bit systems. */
578
579 /* Then subtracting the right shift by 1 clears all but the left-most of
580 * the 1 bits, which is our desired result */
581 word -= (word >> 1);
582
583# else
584# error Unexpected byte order
585# endif
586
7cf2d6c7
KW
587 /* Here 'word' has a single bit set: the msb of the first byte in which it
588 * is set. Calculate that position in the word. We can use this
1d2af574 589 * specialized solution: https://stackoverflow.com/a/32339674/1626653,
67e12c5c
KW
590 * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
591 * just get shifted off at compile time) */
592 word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
593 | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
594 | (39 << 24) | (47 << 16)
595 | (55 << 8) | (63 << 0));
1d2af574
KW
596 word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
597
598 /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
599 word = ((word + 1) >> 3) - 1;
600
601# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
602
603 /* And invert the result */
604 word = CHARBITS - word - 1;
605
606# endif
607
608 return (unsigned int) word;
609}
610
23a7ee81 611#endif
03c1e4ab
KW
612#if defined(PERL_CORE) || defined(PERL_EXT)
613
614/*
615=for apidoc variant_under_utf8_count
616
617This function looks at the sequence of bytes between C<s> and C<e>, which are
618assumed to be encoded in ASCII/Latin1, and returns how many of them would
619change should the string be translated into UTF-8. Due to the nature of UTF-8,
620each of these would occupy two bytes instead of the single one in the input
621string. Thus, this function returns the precise number of bytes the string
622would expand by when translated to UTF-8.
623
624Unlike most of the other functions that have C<utf8> in their name, the input
625to this function is NOT a UTF-8-encoded string. The function name is slightly
626I<odd> to emphasize this.
627
628This function is internal to Perl because khw thinks that any XS code that
629would want this is probably operating too close to the internals. Presenting a
630valid use case could change that.
631
632See also
633C<L<perlapi/is_utf8_invariant_string>>
634and
635C<L<perlapi/is_utf8_invariant_string_loc>>,
636
637=cut
638
639*/
640
641PERL_STATIC_INLINE Size_t
642S_variant_under_utf8_count(const U8* const s, const U8* const e)
643{
644 const U8* x = s;
645 Size_t count = 0;
646
647 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
648
649# ifndef EBCDIC
650
5d0379de
KW
651 /* Test if the string is long enough to use word-at-a-time. (Logic is the
652 * same as for is_utf8_invariant_string()) */
03c1e4ab
KW
653 if ((STRLEN) (e - x) >= PERL_WORDSIZE
654 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
655 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
656 {
657
658 /* Process per-byte until reach word boundary. XXX This loop could be
659 * eliminated if we knew that this platform had fast unaligned reads */
660 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
661 count += ! UTF8_IS_INVARIANT(*x++);
662 }
663
664 /* Process per-word as long as we have at least a full word left */
74472cc2
KW
665 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
666 explanation of how this works */
e5863284
KW
667 PERL_UINTMAX_T increment
668 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
03c1e4ab
KW
669 * PERL_COUNT_MULTIPLIER)
670 >> ((PERL_WORDSIZE - 1) * CHARBITS);
e5863284 671 count += (Size_t) increment;
03c1e4ab
KW
672 x += PERL_WORDSIZE;
673 } while (x + PERL_WORDSIZE <= e);
674 }
675
676# endif
677
678 /* Process per-byte */
679 while (x < e) {
680 if (! UTF8_IS_INVARIANT(*x)) {
681 count++;
682 }
683
684 x++;
685 }
686
687 return count;
688}
689
690#endif
691
aff4cafe
KW
692#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
693# undef PERL_WORDSIZE
694# undef PERL_COUNT_MULTIPLIER
695# undef PERL_WORD_BOUNDARY_MASK
696# undef PERL_VARIANTS_WORD_MASK
697#endif
03c1e4ab 698
7c93d8f0 699/*
5ff889fb
KW
700=for apidoc is_utf8_string
701
82c5d941
KW
702Returns TRUE if the first C<len> bytes of string C<s> form a valid
703Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
704be calculated using C<strlen(s)> (which means if you use this option, that C<s>
705can't have embedded C<NUL> characters and has to have a terminating C<NUL>
706byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
707
2717076a
KW
708This function considers Perl's extended UTF-8 to be valid. That means that
709code points above Unicode, surrogates, and non-character code points are
9f2abfde
KW
710considered valid by this function. Use C<L</is_strict_utf8_string>>,
711C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
712code points are considered valid.
5ff889fb 713
9f2abfde
KW
714See also
715C<L</is_utf8_invariant_string>>,
0cbf5865 716C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
717C<L</is_utf8_string_loc>>,
718C<L</is_utf8_string_loclen>>,
8bc127bf
KW
719C<L</is_utf8_fixed_width_buf_flags>>,
720C<L</is_utf8_fixed_width_buf_loc_flags>>,
721C<L</is_utf8_fixed_width_buf_loclen_flags>>,
5ff889fb
KW
722
723=cut
724*/
725
dd237e82 726#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
5ff889fb 727
c9cd936b
KW
728#if defined(PERL_CORE) || defined (PERL_EXT)
729
730/*
731=for apidoc is_utf8_non_invariant_string
732
733Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
734C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
735UTF-8; otherwise returns FALSE.
736
737A TRUE return means that at least one code point represented by the sequence
738either is a wide character not representable as a single byte, or the
739representation differs depending on whether the sequence is encoded in UTF-8 or
740not.
741
742See also
743C<L<perlapi/is_utf8_invariant_string>>,
744C<L<perlapi/is_utf8_string>>
745
746=cut
747
748This is commonly used to determine if a SV's UTF-8 flag should be turned on.
b3b93dfe
KW
749It generally needn't be if its string is entirely UTF-8 invariant, and it
750shouldn't be if it otherwise contains invalid UTF-8.
c9cd936b
KW
751
752It is an internal function because khw thinks that XS code shouldn't be working
753at this low a level. A valid use case could change that.
754
755*/
756
757PERL_STATIC_INLINE bool
86a87e17 758Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
c9cd936b
KW
759{
760 const U8 * first_variant;
761
762 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
763
764 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
765 return FALSE;
766 }
767
768 return is_utf8_string(first_variant, len - (first_variant - s));
769}
770
771#endif
772
5ff889fb 773/*
9f2abfde
KW
774=for apidoc is_strict_utf8_string
775
776Returns TRUE if the first C<len> bytes of string C<s> form a valid
777UTF-8-encoded string that is fully interchangeable by any application using
778Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
779calculated using C<strlen(s)> (which means if you use this option, that C<s>
780can't have embedded C<NUL> characters and has to have a terminating C<NUL>
781byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
782
783This function returns FALSE for strings containing any
784code points above the Unicode max of 0x10FFFF, surrogate code points, or
785non-character code points.
786
787See also
788C<L</is_utf8_invariant_string>>,
0cbf5865 789C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
790C<L</is_utf8_string>>,
791C<L</is_utf8_string_flags>>,
792C<L</is_utf8_string_loc>>,
793C<L</is_utf8_string_loc_flags>>,
794C<L</is_utf8_string_loclen>>,
795C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
796C<L</is_utf8_fixed_width_buf_flags>>,
797C<L</is_utf8_fixed_width_buf_loc_flags>>,
798C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
799C<L</is_strict_utf8_string_loc>>,
800C<L</is_strict_utf8_string_loclen>>,
801C<L</is_c9strict_utf8_string>>,
802C<L</is_c9strict_utf8_string_loc>>,
803and
804C<L</is_c9strict_utf8_string_loclen>>.
805
806=cut
807*/
808
dd237e82 809#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
9f2abfde
KW
810
811/*
812=for apidoc is_c9strict_utf8_string
813
814Returns TRUE if the first C<len> bytes of string C<s> form a valid
815UTF-8-encoded string that conforms to
816L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
817otherwise it returns FALSE. If C<len> is 0, it will be calculated using
818C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
819C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
820characters being ASCII constitute 'a valid UTF-8 string'.
821
822This function returns FALSE for strings containing any code points above the
823Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
824code points per
825L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
826
827See also
828C<L</is_utf8_invariant_string>>,
0cbf5865 829C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
830C<L</is_utf8_string>>,
831C<L</is_utf8_string_flags>>,
832C<L</is_utf8_string_loc>>,
833C<L</is_utf8_string_loc_flags>>,
834C<L</is_utf8_string_loclen>>,
835C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
836C<L</is_utf8_fixed_width_buf_flags>>,
837C<L</is_utf8_fixed_width_buf_loc_flags>>,
838C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
839C<L</is_strict_utf8_string>>,
840C<L</is_strict_utf8_string_loc>>,
841C<L</is_strict_utf8_string_loclen>>,
842C<L</is_c9strict_utf8_string_loc>>,
843and
844C<L</is_c9strict_utf8_string_loclen>>.
845
846=cut
847*/
848
dd237e82 849#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
9f2abfde
KW
850
851/*
852=for apidoc is_utf8_string_flags
853
854Returns TRUE if the first C<len> bytes of string C<s> form a valid
855UTF-8 string, subject to the restrictions imposed by C<flags>;
856returns FALSE otherwise. If C<len> is 0, it will be calculated
857using C<strlen(s)> (which means if you use this option, that C<s> can't have
858embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
859that all characters being ASCII constitute 'a valid UTF-8 string'.
860
861If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
862C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
863as C<L</is_strict_utf8_string>>; and if C<flags> is
864C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
865C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
866combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
867C<L</utf8n_to_uvchr>>, with the same meanings.
868
869See also
870C<L</is_utf8_invariant_string>>,
0cbf5865 871C<L</is_utf8_invariant_string_loc>>,
9f2abfde
KW
872C<L</is_utf8_string>>,
873C<L</is_utf8_string_loc>>,
874C<L</is_utf8_string_loc_flags>>,
875C<L</is_utf8_string_loclen>>,
876C<L</is_utf8_string_loclen_flags>>,
8bc127bf
KW
877C<L</is_utf8_fixed_width_buf_flags>>,
878C<L</is_utf8_fixed_width_buf_loc_flags>>,
879C<L</is_utf8_fixed_width_buf_loclen_flags>>,
9f2abfde
KW
880C<L</is_strict_utf8_string>>,
881C<L</is_strict_utf8_string_loc>>,
882C<L</is_strict_utf8_string_loclen>>,
883C<L</is_c9strict_utf8_string>>,
884C<L</is_c9strict_utf8_string_loc>>,
885and
886C<L</is_c9strict_utf8_string_loclen>>.
887
888=cut
889*/
890
891PERL_STATIC_INLINE bool
c9182d9c 892Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
9f2abfde 893{
33756530 894 const U8 * first_variant;
9f2abfde
KW
895
896 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
897 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 898 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 899
f60f61fd
KW
900 if (len == 0) {
901 len = strlen((const char *)s);
902 }
903
9f2abfde
KW
904 if (flags == 0) {
905 return is_utf8_string(s, len);
906 }
907
d044b7a7 908 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
909 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
910 {
911 return is_strict_utf8_string(s, len);
912 }
913
d044b7a7 914 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
915 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
916 {
917 return is_c9strict_utf8_string(s, len);
918 }
919
33756530
KW
920 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
921 const U8* const send = s + len;
922 const U8* x = first_variant;
923
a0d7f935
KW
924 while (x < send) {
925 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
926 if (UNLIKELY(! cur_len)) {
927 return FALSE;
928 }
929 x += cur_len;
9f2abfde 930 }
33756530 931 }
9f2abfde
KW
932
933 return TRUE;
934}
935
936/*
5ff889fb
KW
937
938=for apidoc is_utf8_string_loc
939
2717076a 940Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 941case of "utf8ness failure") or the location C<s>+C<len> (in the case of
82c5d941 942"utf8ness success") in the C<ep> pointer.
5ff889fb 943
2717076a 944See also C<L</is_utf8_string_loclen>>.
5ff889fb 945
3964c812
KW
946=cut
947*/
948
949#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
950
951/*
952
5ff889fb
KW
953=for apidoc is_utf8_string_loclen
954
2717076a 955Like C<L</is_utf8_string>> but stores the location of the failure (in the
5ff889fb 956case of "utf8ness failure") or the location C<s>+C<len> (in the case of
9f2abfde 957"utf8ness success") in the C<ep> pointer, and the number of UTF-8
82c5d941 958encoded characters in the C<el> pointer.
5ff889fb 959
2717076a 960See also C<L</is_utf8_string_loc>>.
5ff889fb
KW
961
962=cut
963*/
964
56e4cf64 965PERL_STATIC_INLINE bool
33756530 966Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
5ff889fb 967{
33756530 968 const U8 * first_variant;
5ff889fb
KW
969
970 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
971
33756530
KW
972 if (len == 0) {
973 len = strlen((const char *) s);
974 }
975
976 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
977 if (el)
978 *el = len;
979
980 if (ep) {
981 *ep = s + len;
982 }
983
984 return TRUE;
985 }
986
987 {
988 const U8* const send = s + len;
989 const U8* x = first_variant;
990 STRLEN outlen = first_variant - s;
991
a0d7f935
KW
992 while (x < send) {
993 const STRLEN cur_len = isUTF8_CHAR(x, send);
994 if (UNLIKELY(! cur_len)) {
995 break;
996 }
997 x += cur_len;
998 outlen++;
5ff889fb 999 }
5ff889fb 1000
a0d7f935
KW
1001 if (el)
1002 *el = outlen;
5ff889fb 1003
a0d7f935
KW
1004 if (ep) {
1005 *ep = x;
1006 }
5ff889fb 1007
a0d7f935 1008 return (x == send);
33756530 1009 }
5ff889fb
KW
1010}
1011
1012/*
9f2abfde 1013
44170c9a 1014=for apidoc isUTF8_CHAR
8ed185f9
KW
1015
1016Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1017looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1018that represents some code point; otherwise it evaluates to 0. If non-zero, the
1019value gives how many bytes starting at C<s> comprise the code point's
1020representation. Any bytes remaining before C<e>, but beyond the ones needed to
1021form the first code point in C<s>, are not examined.
1022
13aab5dd 1023The code point can be any that will fit in an IV on this machine, using Perl's
8ed185f9
KW
1024extension to official UTF-8 to represent those higher than the Unicode maximum
1025of 0x10FFFF. That means that this macro is used to efficiently decide if the
1026next few bytes in C<s> is legal UTF-8 for a single character.
1027
1028Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1029defined by Unicode to be fully interchangeable across applications;
1030C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1031#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1032code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1033
1034Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1035C<L</is_utf8_string_loclen>> to check entire strings.
1036
13aab5dd
KW
1037Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1038machines) is a valid UTF-8 character.
8ed185f9
KW
1039
1040=cut
1041
1042This uses an adaptation of the table and algorithm given in
f6521f7c 1043https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
8ed185f9
KW
1044documentation of the original version. A copyright notice for the original
1045version is given at the beginning of this file. The Perl adapation is
71525f77 1046documented at the definition of PL_extended_utf8_dfa_tab[].
8ed185f9
KW
1047
1048*/
1049
1050PERL_STATIC_INLINE Size_t
c9182d9c 1051Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
8ed185f9
KW
1052{
1053 const U8 * s = s0;
1054 UV state = 0;
1055
1056 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1057
1058 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1059 * code point, which can be returned immediately. Otherwise, it is either
1060 * malformed, or for the start byte FF which the dfa doesn't handle (except
1061 * on 32-bit ASCII platforms where it trivially is an error). Call a
1062 * helper function for the other platforms. */
1063
1064 while (s < e && LIKELY(state != 1)) {
71525f77 1065 state = PL_extended_utf8_dfa_tab[256
8ed185f9 1066 + state
71525f77 1067 + PL_extended_utf8_dfa_tab[*s]];
8ed185f9
KW
1068 if (state != 0) {
1069 s++;
1070 continue;
1071 }
1072
1073 return s - s0 + 1;
1074 }
1075
1076#if defined(UV_IS_QUAD) || defined(EBCDIC)
1077
1078 if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1376b35c 1079 return is_utf8_char_helper(s0, e, 0);
8ed185f9
KW
1080 }
1081
1082#endif
1083
1084 return 0;
1085}
1086
1087/*
1088
67049a5f
KW
1089=for apidoc isSTRICT_UTF8_CHAR
1090
1091Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1092looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1093Unicode code point completely acceptable for open interchange between all
1094applications; otherwise it evaluates to 0. If non-zero, the value gives how
1095many bytes starting at C<s> comprise the code point's representation. Any
1096bytes remaining before C<e>, but beyond the ones needed to form the first code
1097point in C<s>, are not examined.
1098
1099The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1100be a surrogate nor a non-character code point. Thus this excludes any code
1101point from Perl's extended UTF-8.
1102
1103This is used to efficiently decide if the next few bytes in C<s> is
1104legal Unicode-acceptable UTF-8 for a single character.
1105
1106Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1107#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1108code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1109and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1110
1111Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1112C<L</is_strict_utf8_string_loclen>> to check entire strings.
1113
1114=cut
1115
1116This uses an adaptation of the tables and algorithm given in
f6521f7c 1117https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
67049a5f
KW
1118documentation of the original version. A copyright notice for the original
1119version is given at the beginning of this file. The Perl adapation is
1120documented at the definition of strict_extended_utf8_dfa_tab[].
1121
1122*/
1123
1124PERL_STATIC_INLINE Size_t
c9182d9c 1125Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
67049a5f
KW
1126{
1127 const U8 * s = s0;
1128 UV state = 0;
1129
1130 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1131
1132 while (s < e && LIKELY(state != 1)) {
71525f77 1133 state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
67049a5f
KW
1134
1135 if (state != 0) {
1136 s++;
1137 continue;
1138 }
1139
1140 return s - s0 + 1;
1141 }
1142
1143#ifndef EBCDIC
1144
1145 /* The dfa above drops out for certain Hanguls; handle them specially */
1146 if (is_HANGUL_ED_utf8_safe(s0, e)) {
1147 return 3;
1148 }
1149
1150#endif
1151
1152 return 0;
1153}
1154
1155/*
1156
44170c9a 1157=for apidoc isC9_STRICT_UTF8_CHAR
c5bfbb64
KW
1158
1159Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1160looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1161Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1162the value gives how many bytes starting at C<s> comprise the code point's
1163representation. Any bytes remaining before C<e>, but beyond the ones needed to
1164form the first code point in C<s>, are not examined.
1165
1166The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1167differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1168code points. This corresponds to
1169L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1170which said that non-character code points are merely discouraged rather than
1171completely forbidden in open interchange. See
1172L<perlunicode/Noncharacter code points>.
1173
1174Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1175C<L</isUTF8_CHAR_flags>> for a more customized definition.
1176
1177Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1178C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1179
1180=cut
1181
1182This uses an adaptation of the tables and algorithm given in
f6521f7c 1183https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
c5bfbb64
KW
1184documentation of the original version. A copyright notice for the original
1185version is given at the beginning of this file. The Perl adapation is
71525f77 1186documented at the definition of PL_c9_utf8_dfa_tab[].
c5bfbb64
KW
1187
1188*/
1189
1190PERL_STATIC_INLINE Size_t
c9182d9c 1191Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
c5bfbb64
KW
1192{
1193 const U8 * s = s0;
1194 UV state = 0;
1195
1196 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1197
1198 while (s < e && LIKELY(state != 1)) {
71525f77 1199 state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
c5bfbb64
KW
1200
1201 if (state != 0) {
1202 s++;
1203 continue;
1204 }
1205
1206 return s - s0 + 1;
1207 }
1208
1209 return 0;
1210}
1211
1212/*
1213
9f2abfde
KW
1214=for apidoc is_strict_utf8_string_loc
1215
1216Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1217case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1218"utf8ness success") in the C<ep> pointer.
1219
1220See also C<L</is_strict_utf8_string_loclen>>.
1221
1222=cut
1223*/
1224
1225#define is_strict_utf8_string_loc(s, len, ep) \
1226 is_strict_utf8_string_loclen(s, len, ep, 0)
1227
1228/*
1229
1230=for apidoc is_strict_utf8_string_loclen
1231
1232Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1233case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1234"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1235encoded characters in the C<el> pointer.
1236
1237See also C<L</is_strict_utf8_string_loc>>.
1238
1239=cut
1240*/
1241
1242PERL_STATIC_INLINE bool
c9182d9c 1243Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1244{
33756530 1245 const U8 * first_variant;
9f2abfde
KW
1246
1247 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1248
33756530
KW
1249 if (len == 0) {
1250 len = strlen((const char *) s);
1251 }
1252
1253 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1254 if (el)
1255 *el = len;
1256
1257 if (ep) {
1258 *ep = s + len;
1259 }
1260
1261 return TRUE;
1262 }
1263
1264 {
1265 const U8* const send = s + len;
1266 const U8* x = first_variant;
1267 STRLEN outlen = first_variant - s;
1268
a0d7f935
KW
1269 while (x < send) {
1270 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1271 if (UNLIKELY(! cur_len)) {
1272 break;
1273 }
1274 x += cur_len;
1275 outlen++;
9f2abfde 1276 }
9f2abfde 1277
a0d7f935
KW
1278 if (el)
1279 *el = outlen;
9f2abfde 1280
a0d7f935
KW
1281 if (ep) {
1282 *ep = x;
1283 }
9f2abfde 1284
a0d7f935 1285 return (x == send);
33756530 1286 }
9f2abfde
KW
1287}
1288
1289/*
1290
1291=for apidoc is_c9strict_utf8_string_loc
1292
1293Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1294the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1295"utf8ness success") in the C<ep> pointer.
1296
1297See also C<L</is_c9strict_utf8_string_loclen>>.
1298
1299=cut
1300*/
1301
1302#define is_c9strict_utf8_string_loc(s, len, ep) \
1303 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1304
1305/*
1306
1307=for apidoc is_c9strict_utf8_string_loclen
1308
1309Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1310the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1311"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1312characters in the C<el> pointer.
1313
1314See also C<L</is_c9strict_utf8_string_loc>>.
1315
1316=cut
1317*/
1318
1319PERL_STATIC_INLINE bool
c9182d9c 1320Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9f2abfde 1321{
33756530 1322 const U8 * first_variant;
9f2abfde
KW
1323
1324 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1325
33756530
KW
1326 if (len == 0) {
1327 len = strlen((const char *) s);
1328 }
1329
1330 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1331 if (el)
1332 *el = len;
1333
1334 if (ep) {
1335 *ep = s + len;
1336 }
1337
1338 return TRUE;
1339 }
1340
1341 {
1342 const U8* const send = s + len;
1343 const U8* x = first_variant;
1344 STRLEN outlen = first_variant - s;
1345
a0d7f935
KW
1346 while (x < send) {
1347 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1348 if (UNLIKELY(! cur_len)) {
1349 break;
1350 }
1351 x += cur_len;
1352 outlen++;
9f2abfde 1353 }
9f2abfde 1354
a0d7f935
KW
1355 if (el)
1356 *el = outlen;
9f2abfde 1357
a0d7f935
KW
1358 if (ep) {
1359 *ep = x;
1360 }
9f2abfde 1361
a0d7f935 1362 return (x == send);
33756530 1363 }
9f2abfde
KW
1364}
1365
1366/*
1367
1368=for apidoc is_utf8_string_loc_flags
1369
1370Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1371case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1372"utf8ness success") in the C<ep> pointer.
1373
1374See also C<L</is_utf8_string_loclen_flags>>.
1375
1376=cut
1377*/
1378
1379#define is_utf8_string_loc_flags(s, len, ep, flags) \
1380 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1381
1382
1383/* The above 3 actual functions could have been moved into the more general one
1384 * just below, and made #defines that call it with the right 'flags'. They are
1385 * currently kept separate to increase their chances of getting inlined */
1386
1387/*
1388
1389=for apidoc is_utf8_string_loclen_flags
1390
1391Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1392case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1393"utf8ness success") in the C<ep> pointer, and the number of UTF-8
1394encoded characters in the C<el> pointer.
1395
1396See also C<L</is_utf8_string_loc_flags>>.
1397
1398=cut
1399*/
1400
1401PERL_STATIC_INLINE bool
c9182d9c 1402Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
9f2abfde 1403{
33756530 1404 const U8 * first_variant;
9f2abfde
KW
1405
1406 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1407 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1408 |UTF8_DISALLOW_PERL_EXTENDED)));
9f2abfde 1409
f60f61fd 1410 if (len == 0) {
a0d7f935 1411 len = strlen((const char *) s);
f60f61fd
KW
1412 }
1413
9f2abfde
KW
1414 if (flags == 0) {
1415 return is_utf8_string_loclen(s, len, ep, el);
1416 }
1417
d044b7a7 1418 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1419 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1420 {
1421 return is_strict_utf8_string_loclen(s, len, ep, el);
1422 }
1423
d044b7a7 1424 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
9f2abfde
KW
1425 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1426 {
1427 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1428 }
1429
33756530
KW
1430 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1431 if (el)
1432 *el = len;
1433
1434 if (ep) {
1435 *ep = s + len;
1436 }
1437
1438 return TRUE;
1439 }
1440
1441 {
1442 const U8* send = s + len;
1443 const U8* x = first_variant;
1444 STRLEN outlen = first_variant - s;
1445
a0d7f935
KW
1446 while (x < send) {
1447 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1448 if (UNLIKELY(! cur_len)) {
1449 break;
1450 }
1451 x += cur_len;
1452 outlen++;
9f2abfde 1453 }
9f2abfde 1454
a0d7f935
KW
1455 if (el)
1456 *el = outlen;
9f2abfde 1457
a0d7f935
KW
1458 if (ep) {
1459 *ep = x;
1460 }
9f2abfde 1461
a0d7f935 1462 return (x == send);
33756530 1463 }
9f2abfde
KW
1464}
1465
1466/*
7c93d8f0
KW
1467=for apidoc utf8_distance
1468
1469Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1470and C<b>.
1471
1472WARNING: use only if you *know* that the pointers point inside the
1473same UTF-8 buffer.
1474
1475=cut
1476*/
1477
1478PERL_STATIC_INLINE IV
1479Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1480{
1481 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1482
1483 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1484}
1485
1486/*
1487=for apidoc utf8_hop
1488
1489Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1490forward or backward.
1491
1492WARNING: do not use the following unless you *know* C<off> is within
1493the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1494on the first byte of character or just after the last byte of a character.
1495
1496=cut
1497*/
1498
1499PERL_STATIC_INLINE U8 *
1500Perl_utf8_hop(const U8 *s, SSize_t off)
1501{
1502 PERL_ARGS_ASSERT_UTF8_HOP;
1503
1504 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1505 * the bitops (especially ~) can create illegal UTF-8.
1506 * In other words: in Perl UTF-8 is not just for Unicode. */
1507
1508 if (off >= 0) {
1509 while (off--)
1510 s += UTF8SKIP(s);
1511 }
1512 else {
1513 while (off++) {
1514 s--;
1515 while (UTF8_IS_CONTINUATION(*s))
1516 s--;
1517 }
1518 }
e099ea69 1519 GCC_DIAG_IGNORE(-Wcast-qual)
7c93d8f0 1520 return (U8 *)s;
e099ea69 1521 GCC_DIAG_RESTORE
7c93d8f0
KW
1522}
1523
4dab108f 1524/*
65df57a8
TC
1525=for apidoc utf8_hop_forward
1526
1527Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1528forward.
1529
1530C<off> must be non-negative.
1531
1532C<s> must be before or equal to C<end>.
1533
1534When moving forward it will not move beyond C<end>.
1535
1536Will not exceed this limit even if the string is not valid "UTF-8".
1537
1538=cut
1539*/
1540
1541PERL_STATIC_INLINE U8 *
1542Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1543{
1544 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1545
1546 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1547 * the bitops (especially ~) can create illegal UTF-8.
1548 * In other words: in Perl UTF-8 is not just for Unicode. */
1549
1550 assert(s <= end);
1551 assert(off >= 0);
1552
1553 while (off--) {
1554 STRLEN skip = UTF8SKIP(s);
de979548 1555 if ((STRLEN)(end - s) <= skip) {
e099ea69 1556 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1557 return (U8 *)end;
e099ea69 1558 GCC_DIAG_RESTORE
de979548 1559 }
65df57a8
TC
1560 s += skip;
1561 }
1562
e099ea69 1563 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1564 return (U8 *)s;
e099ea69 1565 GCC_DIAG_RESTORE
65df57a8
TC
1566}
1567
1568/*
1569=for apidoc utf8_hop_back
1570
1571Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1572backward.
1573
1574C<off> must be non-positive.
1575
1576C<s> must be after or equal to C<start>.
1577
1578When moving backward it will not move before C<start>.
1579
1580Will not exceed this limit even if the string is not valid "UTF-8".
1581
1582=cut
1583*/
1584
1585PERL_STATIC_INLINE U8 *
1586Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1587{
1588 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1589
1590 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1591 * the bitops (especially ~) can create illegal UTF-8.
1592 * In other words: in Perl UTF-8 is not just for Unicode. */
1593
1594 assert(start <= s);
1595 assert(off <= 0);
1596
1597 while (off++ && s > start) {
e7185695 1598 do {
65df57a8 1599 s--;
e7185695 1600 } while (UTF8_IS_CONTINUATION(*s) && s > start);
65df57a8 1601 }
f6521f7c 1602
e099ea69 1603 GCC_DIAG_IGNORE(-Wcast-qual)
65df57a8 1604 return (U8 *)s;
e099ea69 1605 GCC_DIAG_RESTORE
65df57a8
TC
1606}
1607
1608/*
1609=for apidoc utf8_hop_safe
1610
1611Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1612either forward or backward.
1613
1614When moving backward it will not move before C<start>.
1615
1616When moving forward it will not move beyond C<end>.
1617
1618Will not exceed those limits even if the string is not valid "UTF-8".
1619
1620=cut
1621*/
1622
1623PERL_STATIC_INLINE U8 *
1624Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1625{
1626 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1627
1628 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1629 * the bitops (especially ~) can create illegal UTF-8.
1630 * In other words: in Perl UTF-8 is not just for Unicode. */
1631
1632 assert(start <= s && s <= end);
1633
1634 if (off >= 0) {
1635 return utf8_hop_forward(s, off, end);
1636 }
1637 else {
1638 return utf8_hop_back(s, off, start);
1639 }
1640}
1641
1642/*
4dab108f
KW
1643
1644=for apidoc is_utf8_valid_partial_char
1645
6cbb9248
KW
1646Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1647S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1648points. Otherwise, it returns 1 if there exists at least one non-empty
1649sequence of bytes that when appended to sequence C<s>, starting at position
1650C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1651otherwise returns 0.
1652
1653In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1654point.
1655
1656This is useful when a fixed-length buffer is being tested for being well-formed
1657UTF-8, but the final few bytes in it don't comprise a full character; that is,
1658it is split somewhere in the middle of the final code point's UTF-8
1659representation. (Presumably when the buffer is refreshed with the next chunk
1660of data, the new first bytes will complete the partial code point.) This
1661function is used to verify that the final bytes in the current buffer are in
1662fact the legal beginning of some code point, so that if they aren't, the
1663failure can be signalled without having to wait for the next read.
4dab108f
KW
1664
1665=cut
1666*/
2717076a
KW
1667#define is_utf8_valid_partial_char(s, e) \
1668 is_utf8_valid_partial_char_flags(s, e, 0)
f1c999a7
KW
1669
1670/*
1671
1672=for apidoc is_utf8_valid_partial_char_flags
1673
1674Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1675or not the input is a valid UTF-8 encoded partial character, but it takes an
1676extra parameter, C<flags>, which can further restrict which code points are
1677considered valid.
1678
1679If C<flags> is 0, this behaves identically to
1680C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
1681of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
1682there is any sequence of bytes that can complete the input partial character in
1683such a way that a non-prohibited character is formed, the function returns
2717076a
KW
1684TRUE; otherwise FALSE. Non character code points cannot be determined based on
1685partial character input. But many of the other possible excluded types can be
f1c999a7
KW
1686determined from just the first one or two bytes.
1687
1688=cut
1689 */
1690
56e4cf64 1691PERL_STATIC_INLINE bool
c9182d9c 1692Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
4dab108f 1693{
f1c999a7 1694 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
4dab108f 1695
f1c999a7 1696 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
d044b7a7 1697 |UTF8_DISALLOW_PERL_EXTENDED)));
4dab108f 1698
8875bd48 1699 if (s >= e || s + UTF8SKIP(s) <= e) {
4dab108f
KW
1700 return FALSE;
1701 }
1702
1376b35c 1703 return cBOOL(is_utf8_char_helper(s, e, flags));
4dab108f
KW
1704}
1705
8bc127bf
KW
1706/*
1707
1708=for apidoc is_utf8_fixed_width_buf_flags
1709
1710Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1711is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1712otherwise it returns FALSE.
1713
1714If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1715without restriction. If the final few bytes of the buffer do not form a
1716complete code point, this will return TRUE anyway, provided that
1717C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1718
1719If C<flags> in non-zero, it can be any combination of the
1720C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1721same meanings.
1722
1723This function differs from C<L</is_utf8_string_flags>> only in that the latter
1724returns FALSE if the final few bytes of the string don't form a complete code
1725point.
1726
1727=cut
1728 */
1729#define is_utf8_fixed_width_buf_flags(s, len, flags) \
1730 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1731
1732/*
1733
1734=for apidoc is_utf8_fixed_width_buf_loc_flags
1735
1736Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1737failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
1738to the beginning of any partial character at the end of the buffer; if there is
1739no partial character C<*ep> will contain C<s>+C<len>.
1740
1741See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1742
1743=cut
1744*/
1745
1746#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
1747 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1748
1749/*
1750
1751=for apidoc is_utf8_fixed_width_buf_loclen_flags
1752
1753Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1754complete, valid characters found in the C<el> pointer.
1755
1756=cut
1757*/
1758
1759PERL_STATIC_INLINE bool
c9182d9c 1760Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
33756530 1761 STRLEN len,
8bc127bf
KW
1762 const U8 **ep,
1763 STRLEN *el,
1764 const U32 flags)
1765{
1766 const U8 * maybe_partial;
1767
1768 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1769
1770 if (! ep) {
1771 ep = &maybe_partial;
1772 }
1773
1774 /* If it's entirely valid, return that; otherwise see if the only error is
1775 * that the final few bytes are for a partial character */
1776 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
1777 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1778}
1779
e6a4ffc3 1780PERL_STATIC_INLINE UV
c9182d9c 1781Perl_utf8n_to_uvchr_msgs(const U8 *s,
e6a4ffc3
KW
1782 STRLEN curlen,
1783 STRLEN *retlen,
1784 const U32 flags,
1785 U32 * errors,
1786 AV ** msgs)
1787{
1788 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
1789 * simple cases, and, if necessary calls a helper function to deal with the
1790 * more complex ones. Almost all well-formed non-problematic code points
1791 * are considered simple, so that it's unlikely that the helper function
1792 * will need to be called.
1793 *
1794 * This is an adaptation of the tables and algorithm given in
f6521f7c 1795 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
e6a4ffc3
KW
1796 * comprehensive documentation of the original version. A copyright notice
1797 * for the original version is given at the beginning of this file. The
71525f77 1798 * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
e6a4ffc3
KW
1799 */
1800
1801 const U8 * const s0 = s;
1802 const U8 * send = s0 + curlen;
f73cfe7d 1803 UV uv = 0; /* The 0 silences some stupid compilers */
e6a4ffc3
KW
1804 UV state = 0;
1805
1806 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1807
1808 /* This dfa is fast. If it accepts the input, it was for a well-formed,
1809 * non-problematic code point, which can be returned immediately.
1810 * Otherwise we call a helper function to figure out the more complicated
1811 * cases. */
1812
1813 while (s < send && LIKELY(state != 1)) {
71525f77 1814 UV type = PL_strict_utf8_dfa_tab[*s];
e6a4ffc3
KW
1815
1816 uv = (state == 0)
1817 ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1818 : UTF8_ACCUMULATE(uv, *s);
71525f77 1819 state = PL_strict_utf8_dfa_tab[256 + state + type];
e6a4ffc3
KW
1820
1821 if (state != 0) {
1822 s++;
1823 continue;
1824 }
1825
1826 if (retlen) {
1827 *retlen = s - s0 + 1;
1828 }
1829 if (errors) {
1830 *errors = 0;
1831 }
1832 if (msgs) {
1833 *msgs = NULL;
1834 }
1835
cfebc7aa 1836 return UNI_TO_NATIVE(uv);
e6a4ffc3
KW
1837 }
1838
1839 /* Here is potentially problematic. Use the full mechanism */
1840 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1841}
1842
82651abe 1843PERL_STATIC_INLINE UV
9a9a6c98 1844Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
82651abe 1845{
9a9a6c98 1846 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
82651abe
KW
1847
1848 assert(s < send);
1849
1850 if (! ckWARN_d(WARN_UTF8)) {
3eaa7592
KW
1851
1852 /* EMPTY is not really allowed, and asserts on debugging builds. But
1853 * on non-debugging we have to deal with it, and this causes it to
1854 * return the REPLACEMENT CHARACTER, as the documentation indicates */
82651abe 1855 return utf8n_to_uvchr(s, send - s, retlen,
3eaa7592 1856 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
82651abe
KW
1857 }
1858 else {
1859 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1860 if (retlen && ret == 0 && *s != '\0') {
1861 *retlen = (STRLEN) -1;
1862 }
1863
1864 return ret;
1865 }
1866}
1867
c8028aa6
TC
1868/* ------------------------------- perl.h ----------------------------- */
1869
1870/*
dcccc8ff
KW
1871=head1 Miscellaneous Functions
1872
44170c9a 1873=for apidoc is_safe_syscall
c8028aa6 1874
1a0efc9a
KW
1875Test that the given C<pv> (with length C<len>) doesn't contain any internal
1876C<NUL> characters.
1877If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1878category, and return FALSE.
c8028aa6
TC
1879
1880Return TRUE if the name is safe.
1881
1a0efc9a
KW
1882C<what> and C<op_name> are used in any warning.
1883
796b6530 1884Used by the C<IS_SAFE_SYSCALL()> macro.
c8028aa6
TC
1885
1886=cut
1887*/
1888
1889PERL_STATIC_INLINE bool
ffd62fc2
KW
1890Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1891{
c8028aa6
TC
1892 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1893 * perl itself uses xce*() functions which accept 8-bit strings.
1894 */
1895
1896 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1897
6c4650b3 1898 if (len > 1) {
c8028aa6 1899 char *null_at;
41188aa0 1900 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
c8028aa6 1901 SETERRNO(ENOENT, LIB_INVARG);
1d505182 1902 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
c8028aa6 1903 "Invalid \\0 character in %s for %s: %s\\0%s",
41188aa0 1904 what, op_name, pv, null_at+1);
c8028aa6
TC
1905 return FALSE;
1906 }
1907 }
1908
1909 return TRUE;
1910}
1911
1912/*
7cb3f959
TC
1913
1914Return true if the supplied filename has a newline character
fa6c7d00 1915immediately before the first (hopefully only) NUL.
7cb3f959
TC
1916
1917My original look at this incorrectly used the len from SvPV(), but
1918that's incorrect, since we allow for a NUL in pv[len-1].
1919
1920So instead, strlen() and work from there.
1921
1922This allow for the user reading a filename, forgetting to chomp it,
1923then calling:
1924
1925 open my $foo, "$file\0";
1926
1927*/
1928
1929#ifdef PERL_CORE
1930
1931PERL_STATIC_INLINE bool
ffd62fc2
KW
1932S_should_warn_nl(const char *pv)
1933{
7cb3f959
TC
1934 STRLEN len;
1935
1936 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1937
1938 len = strlen(pv);
1939
1940 return len > 0 && pv[len-1] == '\n';
1941}
1942
1943#endif
1944
3a019afd
KW
1945#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1946
1947PERL_STATIC_INLINE bool
1948S_lossless_NV_to_IV(const NV nv, IV *ivp)
1949{
1950 /* This function determines if the input NV 'nv' may be converted without
1951 * loss of data to an IV. If not, it returns FALSE taking no other action.
1952 * But if it is possible, it does the conversion, returning TRUE, and
1953 * storing the converted result in '*ivp' */
1954
1955 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1956
1957# if defined(Perl_isnan)
1958
1959 if (UNLIKELY(Perl_isnan(nv))) {
1960 return FALSE;
1961 }
1962
1963# endif
1964
1965 if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1966 return FALSE;
1967 }
1968
1969 if ((IV) nv != nv) {
1970 return FALSE;
1971 }
1972
1973 *ivp = (IV) nv;
1974 return TRUE;
1975}
1976
1977#endif
1978
14fe5f8a
KW
1979/* ------------------ regcomp.c, toke.c ------------ */
1980
1981#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
1982
1983/*
1984 - regcurly - a little FSA that accepts {\d+,?\d*}
1985 Pulled from reg.c.
1986 */
6bbb95d4 1987PERL_STATIC_INLINE bool
14fe5f8a
KW
1988S_regcurly(const char *s)
1989{
1990 PERL_ARGS_ASSERT_REGCURLY;
1991
1992 if (*s++ != '{')
1993 return FALSE;
1994 if (!isDIGIT(*s))
1995 return FALSE;
1996 while (isDIGIT(*s))
1997 s++;
1998 if (*s == ',') {
1999 s++;
2000 while (isDIGIT(*s))
2001 s++;
2002 }
2003
2004 return *s == '}';
2005}
2006
2007#endif
2008
81d52ecd
JH
2009/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2010
94b0cb42
KW
2011#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2012
81d52ecd
JH
2013#define MAX_CHARSET_NAME_LENGTH 2
2014
2015PERL_STATIC_INLINE const char *
94b0cb42 2016S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
81d52ecd 2017{
94b0cb42
KW
2018 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2019
81d52ecd
JH
2020 /* Returns a string that corresponds to the name of the regex character set
2021 * given by 'flags', and *lenp is set the length of that string, which
2022 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2023
2024 *lenp = 1;
2025 switch (get_regex_charset(flags)) {
2026 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2027 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2028 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2029 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2030 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2031 *lenp = 2;
2032 return ASCII_MORE_RESTRICT_PAT_MODS;
2033 }
2034 /* The NOT_REACHED; hides an assert() which has a rather complex
2035 * definition in perl.h. */
2036 NOT_REACHED; /* NOTREACHED */
2037 return "?"; /* Unknown */
2038}
2039
94b0cb42
KW
2040#endif
2041
7cb3f959 2042/*
ed382232
TC
2043
2044Return false if any get magic is on the SV other than taint magic.
2045
2046*/
2047
2048PERL_STATIC_INLINE bool
ffd62fc2
KW
2049Perl_sv_only_taint_gmagic(SV *sv)
2050{
ed382232
TC
2051 MAGIC *mg = SvMAGIC(sv);
2052
2053 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2054
2055 while (mg) {
2056 if (mg->mg_type != PERL_MAGIC_taint
2057 && !(mg->mg_flags & MGf_GSKIP)
2058 && mg->mg_virtual->svt_get) {
2059 return FALSE;
2060 }
2061 mg = mg->mg_moremagic;
2062 }
2063
2064 return TRUE;
2065}
2066
ed8ff0f3
DM
2067/* ------------------ cop.h ------------------------------------------- */
2068
5b6f7443
DM
2069/* implement GIMME_V() macro */
2070
2071PERL_STATIC_INLINE U8
2072Perl_gimme_V(pTHX)
2073{
2074 I32 cxix;
2075 U8 gimme = (PL_op->op_flags & OPf_WANT);
2076
2077 if (gimme)
2078 return gimme;
2079 cxix = PL_curstackinfo->si_cxsubix;
2080 if (cxix < 0)
2081 return G_VOID;
2082 assert(cxstack[cxix].blk_gimme & G_WANT);
2083 return (cxstack[cxix].blk_gimme & G_WANT);
2084}
2085
ed8ff0f3
DM
2086
2087/* Enter a block. Push a new base context and return its address. */
2088
2089PERL_STATIC_INLINE PERL_CONTEXT *
c9182d9c 2090Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
ed8ff0f3
DM
2091{
2092 PERL_CONTEXT * cx;
2093
2094 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2095
2096 CXINC;
2097 cx = CX_CUR();
2098 cx->cx_type = type;
2099 cx->blk_gimme = gimme;
2100 cx->blk_oldsaveix = saveix;
4caf7d8c 2101 cx->blk_oldsp = (I32)(sp - PL_stack_base);
ed8ff0f3 2102 cx->blk_oldcop = PL_curcop;
4caf7d8c 2103 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
ed8ff0f3
DM
2104 cx->blk_oldscopesp = PL_scopestack_ix;
2105 cx->blk_oldpm = PL_curpm;
ce8bb8d8 2106 cx->blk_old_tmpsfloor = PL_tmps_floor;
ed8ff0f3
DM
2107
2108 PL_tmps_floor = PL_tmps_ix;
2109 CX_DEBUG(cx, "PUSH");
2110 return cx;
2111}
2112
2113
2114/* Exit a block (RETURN and LAST). */
2115
2116PERL_STATIC_INLINE void
c9182d9c 2117Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2118{
2119 PERL_ARGS_ASSERT_CX_POPBLOCK;
2120
2121 CX_DEBUG(cx, "POP");
2122 /* these 3 are common to cx_popblock and cx_topblock */
2123 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2124 PL_scopestack_ix = cx->blk_oldscopesp;
2125 PL_curpm = cx->blk_oldpm;
2126
2127 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2128 * and leaves a CX entry lying around for repeated use, so
2129 * skip for multicall */ \
2130 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2131 || PL_savestack_ix == cx->blk_oldsaveix);
2132 PL_curcop = cx->blk_oldcop;
ce8bb8d8 2133 PL_tmps_floor = cx->blk_old_tmpsfloor;
ed8ff0f3
DM
2134}
2135
2136/* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2137 * Whereas cx_popblock() restores the state to the point just before
2138 * cx_pushblock() was called, cx_topblock() restores it to the point just
2139 * *after* cx_pushblock() was called. */
2140
2141PERL_STATIC_INLINE void
c9182d9c 2142Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
ed8ff0f3
DM
2143{
2144 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2145
2146 CX_DEBUG(cx, "TOP");
2147 /* these 3 are common to cx_popblock and cx_topblock */
2148 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2149 PL_scopestack_ix = cx->blk_oldscopesp;
2150 PL_curpm = cx->blk_oldpm;
2151
2152 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2153}
2154
2155
a73d8813 2156PERL_STATIC_INLINE void
c9182d9c 2157Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
a73d8813
DM
2158{
2159 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2160
2161 PERL_ARGS_ASSERT_CX_PUSHSUB;
2162
3f6bd23a 2163 PERL_DTRACE_PROBE_ENTRY(cv);
5b6f7443
DM
2164 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2165 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
a73d8813
DM
2166 cx->blk_sub.cv = cv;
2167 cx->blk_sub.olddepth = CvDEPTH(cv);
2168 cx->blk_sub.prevcomppad = PL_comppad;
2169 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2170 cx->blk_sub.retop = retop;
2171 SvREFCNT_inc_simple_void_NN(cv);
2172 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2173}
2174
2175
2176/* subsets of cx_popsub() */
2177
2178PERL_STATIC_INLINE void
c9182d9c 2179Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2180{
2181 CV *cv;
2182
2183 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2184 assert(CxTYPE(cx) == CXt_SUB);
2185
2186 PL_comppad = cx->blk_sub.prevcomppad;
2187 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2188 cv = cx->blk_sub.cv;
2189 CvDEPTH(cv) = cx->blk_sub.olddepth;
2190 cx->blk_sub.cv = NULL;
2191 SvREFCNT_dec(cv);
5b6f7443 2192 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
a73d8813
DM
2193}
2194
2195
2196/* handle the @_ part of leaving a sub */
2197
2198PERL_STATIC_INLINE void
c9182d9c 2199Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2200{
2201 AV *av;
2202
2203 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2204 assert(CxTYPE(cx) == CXt_SUB);
2205 assert(AvARRAY(MUTABLE_AV(
2206 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2207 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2208
2209 CX_POP_SAVEARRAY(cx);
2210 av = MUTABLE_AV(PAD_SVl(0));
2211 if (UNLIKELY(AvREAL(av)))
2212 /* abandon @_ if it got reified */
2213 clear_defarray(av, 0);
2214 else {
2215 CLEAR_ARGARRAY(av);
2216 }
2217}
2218
2219
2220PERL_STATIC_INLINE void
c9182d9c 2221Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
a73d8813
DM
2222{
2223 PERL_ARGS_ASSERT_CX_POPSUB;
2224 assert(CxTYPE(cx) == CXt_SUB);
2225
3f6bd23a 2226 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
a73d8813
DM
2227
2228 if (CxHASARGS(cx))
2229 cx_popsub_args(cx);
2230 cx_popsub_common(cx);
2231}
2232
2233
6a7d52cc 2234PERL_STATIC_INLINE void
c9182d9c 2235Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
6a7d52cc
DM
2236{
2237 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2238
5b6f7443
DM
2239 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2240 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
6a7d52cc
DM
2241 cx->blk_format.cv = cv;
2242 cx->blk_format.retop = retop;
2243 cx->blk_format.gv = gv;
2244 cx->blk_format.dfoutgv = PL_defoutgv;
2245 cx->blk_format.prevcomppad = PL_comppad;
2246 cx->blk_u16 = 0;
2247
2248 SvREFCNT_inc_simple_void_NN(cv);
2249 CvDEPTH(cv)++;
2250 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2251}
2252
2253
2254PERL_STATIC_INLINE void
c9182d9c 2255Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
6a7d52cc
DM
2256{
2257 CV *cv;
2258 GV *dfout;
2259
2260 PERL_ARGS_ASSERT_CX_POPFORMAT;
2261 assert(CxTYPE(cx) == CXt_FORMAT);
2262
2263 dfout = cx->blk_format.dfoutgv;
2264 setdefout(dfout);
2265 cx->blk_format.dfoutgv = NULL;
2266 SvREFCNT_dec_NN(dfout);
2267
2268 PL_comppad = cx->blk_format.prevcomppad;
2269 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2270 cv = cx->blk_format.cv;
2271 cx->blk_format.cv = NULL;
2272 --CvDEPTH(cv);
2273 SvREFCNT_dec_NN(cv);
5b6f7443 2274 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
6a7d52cc
DM
2275}
2276
2277
13febba5 2278PERL_STATIC_INLINE void
c9182d9c 2279Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
13febba5
DM
2280{
2281 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2282
5b6f7443
DM
2283 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2284 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
13febba5
DM
2285 cx->blk_eval.retop = retop;
2286 cx->blk_eval.old_namesv = namesv;
2287 cx->blk_eval.old_eval_root = PL_eval_root;
2288 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2289 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2290 cx->blk_eval.cur_top_env = PL_top_env;
2291
4c57ced5 2292 assert(!(PL_in_eval & ~ 0x3F));
13febba5 2293 assert(!(PL_op->op_type & ~0x1FF));
4c57ced5 2294 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
13febba5
DM
2295}
2296
2297
2298PERL_STATIC_INLINE void
c9182d9c 2299Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
13febba5
DM
2300{
2301 SV *sv;
2302
2303 PERL_ARGS_ASSERT_CX_POPEVAL;
2304 assert(CxTYPE(cx) == CXt_EVAL);
2305
2306 PL_in_eval = CxOLD_IN_EVAL(cx);
4c57ced5 2307 assert(!(PL_in_eval & 0xc0));
13febba5
DM
2308 PL_eval_root = cx->blk_eval.old_eval_root;
2309 sv = cx->blk_eval.cur_text;
4c57ced5 2310 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
13febba5
DM
2311 cx->blk_eval.cur_text = NULL;
2312 SvREFCNT_dec_NN(sv);
2313 }
2314
2315 sv = cx->blk_eval.old_namesv;
2a1e0dfe
DM
2316 if (sv) {
2317 cx->blk_eval.old_namesv = NULL;
2318 SvREFCNT_dec_NN(sv);
2319 }
5b6f7443 2320 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
13febba5 2321}
6a7d52cc 2322
a73d8813 2323
d1b6bf72
DM
2324/* push a plain loop, i.e.
2325 * { block }
2326 * while (cond) { block }
2327 * for (init;cond;continue) { block }
2328 * This loop can be last/redo'ed etc.
2329 */
2330
2331PERL_STATIC_INLINE void
c9182d9c 2332Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2333{
2334 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2335 cx->blk_loop.my_op = cLOOP;
2336}
2337
2338
2339/* push a true for loop, i.e.
2340 * for var (list) { block }
2341 */
2342
2343PERL_STATIC_INLINE void
c9182d9c 2344Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
d1b6bf72
DM
2345{
2346 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2347
2348 /* this one line is common with cx_pushloop_plain */
2349 cx->blk_loop.my_op = cLOOP;
2350
2351 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2352 cx->blk_loop.itersave = itersave;
2353#ifdef USE_ITHREADS
2354 cx->blk_loop.oldcomppad = PL_comppad;
2355#endif
2356}
2357
2358
2359/* pop all loop types, including plain */
2360
2361PERL_STATIC_INLINE void
c9182d9c 2362Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
d1b6bf72
DM
2363{
2364 PERL_ARGS_ASSERT_CX_POPLOOP;
2365
2366 assert(CxTYPE_is_LOOP(cx));
2367 if ( CxTYPE(cx) == CXt_LOOP_ARY
2368 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2369 {
2370 /* Free ary or cur. This assumes that state_u.ary.ary
2371 * aligns with state_u.lazysv.cur. See cx_dup() */
2372 SV *sv = cx->blk_loop.state_u.lazysv.cur;
2373 cx->blk_loop.state_u.lazysv.cur = NULL;
2374 SvREFCNT_dec_NN(sv);
2375 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2376 sv = cx->blk_loop.state_u.lazysv.end;
2377 cx->blk_loop.state_u.lazysv.end = NULL;
2378 SvREFCNT_dec_NN(sv);
2379 }
2380 }
2381 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2382 SV *cursv;
2383 SV **svp = (cx)->blk_loop.itervar_u.svp;
2384 if ((cx->cx_type & CXp_FOR_GV))
2385 svp = &GvSV((GV*)svp);
2386 cursv = *svp;
2387 *svp = cx->blk_loop.itersave;
2388 cx->blk_loop.itersave = NULL;
2389 SvREFCNT_dec(cursv);
2390 }
2391}
2392
2a7b7c61
DM
2393
2394PERL_STATIC_INLINE void
c9182d9c 2395Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2396{
7896dde7 2397 PERL_ARGS_ASSERT_CX_PUSHWHEN;
2a7b7c61 2398
7896dde7 2399 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2a7b7c61
DM
2400}
2401
2402
2403PERL_STATIC_INLINE void
c9182d9c 2404Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2a7b7c61 2405{
7896dde7
Z
2406 PERL_ARGS_ASSERT_CX_POPWHEN;
2407 assert(CxTYPE(cx) == CXt_WHEN);
2a7b7c61
DM
2408
2409 PERL_UNUSED_ARG(cx);
59a14f30 2410 PERL_UNUSED_CONTEXT;
2a7b7c61
DM
2411 /* currently NOOP */
2412}
2413
2414
7896dde7 2415PERL_STATIC_INLINE void
c9182d9c 2416Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
7896dde7
Z
2417{
2418 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2419
2420 cx->blk_givwhen.leave_op = cLOGOP->op_other;
2421 cx->blk_givwhen.defsv_save = orig_defsv;
2422}
2423
2424
2425PERL_STATIC_INLINE void
c9182d9c 2426Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
7896dde7
Z
2427{
2428 SV *sv;
2429
2430 PERL_ARGS_ASSERT_CX_POPGIVEN;
2431 assert(CxTYPE(cx) == CXt_GIVEN);
2432
2433 sv = GvSV(PL_defgv);
2434 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2435 cx->blk_givwhen.defsv_save = NULL;
2436 SvREFCNT_dec(sv);
2437}
2438
ec2c235b
KW
2439/* ------------------ util.h ------------------------------------------- */
2440
2441/*
2442=head1 Miscellaneous Functions
2443
2444=for apidoc foldEQ
2445
2446Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2447same
2448case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
2449match themselves and their opposite case counterparts. Non-cased and non-ASCII
2450range bytes match only themselves.
2451
2452=cut
2453*/
2454
2455PERL_STATIC_INLINE I32
2456Perl_foldEQ(const char *s1, const char *s2, I32 len)
2457{
2458 const U8 *a = (const U8 *)s1;
2459 const U8 *b = (const U8 *)s2;
2460
2461 PERL_ARGS_ASSERT_FOLDEQ;
2462
2463 assert(len >= 0);
2464
2465 while (len--) {
2466 if (*a != *b && *a != PL_fold[*b])
2467 return 0;
2468 a++,b++;
2469 }
2470 return 1;
2471}
2472
0f9cb40c 2473PERL_STATIC_INLINE I32
ec2c235b
KW
2474Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2475{
79a1fabd
KW
2476 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
2477 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2478 * does not check for this. Nor does it check that the strings each have
2479 * at least 'len' characters. */
ec2c235b
KW
2480
2481 const U8 *a = (const U8 *)s1;
2482 const U8 *b = (const U8 *)s2;
2483
2484 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2485
2486 assert(len >= 0);
2487
2488 while (len--) {
2489 if (*a != *b && *a != PL_fold_latin1[*b]) {
2490 return 0;
2491 }
2492 a++, b++;
2493 }
2494 return 1;
2495}
2496
2497/*
2498=for apidoc foldEQ_locale
2499
2500Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2501same case-insensitively in the current locale; false otherwise.
2502
2503=cut
2504*/
2505
0f9cb40c 2506PERL_STATIC_INLINE I32
ec2c235b
KW
2507Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2508{
ec2c235b
KW
2509 const U8 *a = (const U8 *)s1;
2510 const U8 *b = (const U8 *)s2;
2511
2512 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2513
2514 assert(len >= 0);
2515
2516 while (len--) {
2517 if (*a != *b && *a != PL_fold_locale[*b])
2518 return 0;
2519 a++,b++;
2520 }
2521 return 1;
2522}
2523
1ab100a8
KW
2524/*
2525=for apidoc my_strnlen
2526
2527The C library C<strnlen> if available, or a Perl implementation of it.
2528
2529C<my_strnlen()> computes the length of the string, up to C<maxlen>
a3815e44 2530characters. It will never attempt to address more than C<maxlen>
1ab100a8
KW
2531characters, making it suitable for use with strings that are not
2532guaranteed to be NUL-terminated.
2533
2534=cut
2535
2536Description stolen from http://man.openbsd.org/strnlen.3,
2537implementation stolen from PostgreSQL.
2538*/
2539#ifndef HAS_STRNLEN
2540
2541PERL_STATIC_INLINE Size_t
2542Perl_my_strnlen(const char *str, Size_t maxlen)
2543{
2544 const char *end = (char *) memchr(str, '\0', maxlen);
2545
2546 PERL_ARGS_ASSERT_MY_STRNLEN;
2547
2548 if (end == NULL) return maxlen;
2549 return end - str;
2550}
2551
2552#endif
2553
6dba01e2
KW
2554#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2555
2556PERL_STATIC_INLINE void *
2557S_my_memrchr(const char * s, const char c, const STRLEN len)
2558{
2559 /* memrchr(), since many platforms lack it */
2560
2561 const char * t = s + len - 1;
2562
2563 PERL_ARGS_ASSERT_MY_MEMRCHR;
2564
2565 while (t >= s) {
2566 if (*t == c) {
2567 return (void *) t;
2568 }
2569 t--;
2570 }
2571
2572 return NULL;
2573}
2574
2575#endif
2576
24f3e849
KW
2577PERL_STATIC_INLINE char *
2578Perl_mortal_getenv(const char * str)
2579{
2580 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
2581 *
2582 * It's (mostly) thread-safe because it uses a mutex to prevent
2583 * simultaneous access from other threads that use the same mutex, and
2584 * makes a copy of the result before releasing that mutex. All of the Perl
2585 * core uses that mutex, but, like all mutexes, everything has to cooperate
2586 * for it to completely work. It is possible for code from, say XS, to not
2587 * use this mutex, defeating the safety.
2588 *
2589 * On some platforms, getenv() is not sequential-call-safe, because
2590 * subsequent calls destroy the static storage inside the C library
2591 * returned by an earlier call. The result must be copied or completely
2592 * acted upon before a subsequent getenv call. Those calls could come from
2593 * another thread. Again, making a copy while controlling the mutex
2594 * prevents these problems..
2595 *
2596 * To prevent leaks, the copy is made by creating a new SV containing it,
2597 * mortalizing the SV, and returning the SV's string (the copy). Thus this
2598 * is a drop-in replacement for getenv().
2599 *
2600 * A complication is that this can be called during phases where the
2601 * mortalization process isn't available. These are in interpreter
2602 * destruction or early in construction. khw believes that at these times
2603 * there shouldn't be anything else going on, so plain getenv is safe AS
2604 * LONG AS the caller acts on the return before calling it again. */
2605
2606 char * ret;
2607 dTHX;
2608
2609 PERL_ARGS_ASSERT_MORTAL_GETENV;
2610
2611 /* Can't mortalize without stacks. khw believes that no other threads
2612 * should be running, so no need to lock things, and this may be during a
2613 * phase when locking isn't even available */
2614 if (UNLIKELY(PL_scopestack_ix == 0)) {
2615 return getenv(str);
2616 }
2617
2618 ENV_LOCK;
2619
2620 ret = getenv(str);
2621
2622 if (ret != NULL) {
2623 ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
2624 }
2625
2626 ENV_UNLOCK;
2627 return ret;
2628}
2629
ed382232 2630/*
c8028aa6
TC
2631 * ex: set ts=8 sts=4 sw=4 et:
2632 */